author     Dimitry Andric <dim@FreeBSD.org>  2015-12-30 11:46:15 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2015-12-30 11:46:15 +0000
commit     dd58ef019b700900793a1eb48b52123db01b654e (patch)
tree       fcfbb4df56a744f4ddc6122c50521dd3f1c5e196 /lib/Target
parent     2fe5752e3a7c345cdb59e869278d36af33c13fa4 (diff)
download   src-dd58ef019b700900793a1eb48b52123db01b654e.tar.gz
           src-dd58ef019b700900793a1eb48b52123db01b654e.zip
Vendor import of llvm trunk r256633:
Notes: svn path=/vendor/llvm/dist/; revision=292915
Diffstat (limited to 'lib/Target')
-rw-r--r--lib/Target/AArch64/AArch64.td43
-rw-r--r--lib/Target/AArch64/AArch64A53Fix835769.cpp15
-rw-r--r--lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp1
-rw-r--r--lib/Target/AArch64/AArch64AddressTypePromotion.cpp8
-rw-r--r--lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp17
-rw-r--r--lib/Target/AArch64/AArch64BranchRelaxation.cpp27
-rw-r--r--lib/Target/AArch64/AArch64CallingConvention.h38
-rw-r--r--lib/Target/AArch64/AArch64CallingConvention.td19
-rw-r--r--lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp8
-rw-r--r--lib/Target/AArch64/AArch64CollectLOH.cpp32
-rw-r--r--lib/Target/AArch64/AArch64ConditionOptimizer.cpp22
-rw-r--r--lib/Target/AArch64/AArch64ConditionalCompares.cpp20
-rw-r--r--lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp16
-rw-r--r--lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp15
-rw-r--r--lib/Target/AArch64/AArch64FastISel.cpp221
-rw-r--r--lib/Target/AArch64/AArch64FrameLowering.cpp93
-rw-r--r--lib/Target/AArch64/AArch64FrameLowering.h6
-rw-r--r--lib/Target/AArch64/AArch64ISelDAGToDAG.cpp246
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.cpp1446
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.h110
-rw-r--r--lib/Target/AArch64/AArch64InstrFormats.td1076
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.cpp211
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.h12
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.td483
-rw-r--r--lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp1304
-rw-r--r--lib/Target/AArch64/AArch64MCInstLower.cpp4
-rw-r--r--lib/Target/AArch64/AArch64MachineCombinerPattern.h42
-rw-r--r--lib/Target/AArch64/AArch64MachineFunctionInfo.h17
-rw-r--r--lib/Target/AArch64/AArch64PromoteConstant.cpp4
-rw-r--r--lib/Target/AArch64/AArch64RegisterInfo.cpp56
-rw-r--r--lib/Target/AArch64/AArch64RegisterInfo.h5
-rw-r--r--lib/Target/AArch64/AArch64RegisterInfo.td2
-rw-r--r--lib/Target/AArch64/AArch64Subtarget.cpp31
-rw-r--r--lib/Target/AArch64/AArch64Subtarget.h27
-rw-r--r--lib/Target/AArch64/AArch64TargetMachine.cpp2
-rw-r--r--lib/Target/AArch64/AArch64TargetTransformInfo.cpp150
-rw-r--r--lib/Target/AArch64/AArch64TargetTransformInfo.h39
-rw-r--r--lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp201
-rw-r--r--lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp4
-rw-r--r--lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp46
-rw-r--r--lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h10
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h26
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp5
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp4
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp6
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h8
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp88
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp5
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h2
-rw-r--r--lib/Target/AArch64/Utils/AArch64BaseInfo.cpp33
-rw-r--r--lib/Target/AArch64/Utils/AArch64BaseInfo.h41
-rw-r--r--lib/Target/AMDGPU/AMDGPU.h16
-rw-r--r--lib/Target/AMDGPU/AMDGPU.td10
-rw-r--r--lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp126
-rw-r--r--lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp84
-rw-r--r--lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp200
-rw-r--r--lib/Target/AMDGPU/AMDGPUAsmPrinter.h4
-rw-r--r--lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp26
-rw-r--r--lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h48
-rw-r--r--lib/Target/AMDGPU/AMDGPUFrameLowering.cpp10
-rw-r--r--lib/Target/AMDGPU/AMDGPUFrameLowering.h11
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp479
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.cpp195
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.h15
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstrInfo.cpp20
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstrInfo.h6
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstrInfo.td2
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstructions.td4
-rw-r--r--lib/Target/AMDGPU/AMDGPUIntrinsics.td4
-rw-r--r--lib/Target/AMDGPU/AMDGPUMCInstLower.cpp18
-rw-r--r--lib/Target/AMDGPU/AMDGPUMachineFunction.cpp11
-rw-r--r--lib/Target/AMDGPU/AMDGPUMachineFunction.h5
-rw-r--r--lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp373
-rw-r--r--lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp11
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterInfo.h4
-rw-r--r--lib/Target/AMDGPU/AMDGPUSubtarget.cpp25
-rw-r--r--lib/Target/AMDGPU/AMDGPUSubtarget.h26
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetMachine.cpp77
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetMachine.h4
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp87
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetObjectFile.h51
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp102
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h5
-rw-r--r--lib/Target/AMDGPU/AMDILCFGStructurizer.cpp43
-rw-r--r--lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp266
-rw-r--r--lib/Target/AMDGPU/CIInstructions.td340
-rw-r--r--lib/Target/AMDGPU/CMakeLists.txt7
-rw-r--r--lib/Target/AMDGPU/CaymanInstructions.td4
-rw-r--r--lib/Target/AMDGPU/EvergreenInstructions.td11
-rw-r--r--lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp16
-rw-r--r--lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h2
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp27
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp26
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h40
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h3
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp14
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h3
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp11
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp51
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h21
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt1
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt2
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp6
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp15
-rw-r--r--lib/Target/AMDGPU/Processors.td4
-rw-r--r--lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp4
-rw-r--r--lib/Target/AMDGPU/R600ISelLowering.cpp21
-rw-r--r--lib/Target/AMDGPU/R600InstrInfo.cpp6
-rw-r--r--lib/Target/AMDGPU/R600InstrInfo.h6
-rw-r--r--lib/Target/AMDGPU/R600Instructions.td2
-rw-r--r--lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp2
-rw-r--r--lib/Target/AMDGPU/R600Packetizer.cpp18
-rw-r--r--lib/Target/AMDGPU/R600RegisterInfo.h2
-rw-r--r--lib/Target/AMDGPU/SIAnnotateControlFlow.cpp5
-rw-r--r--lib/Target/AMDGPU/SIDefines.h3
-rw-r--r--lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp6
-rw-r--r--lib/Target/AMDGPU/SIFixSGPRCopies.cpp229
-rw-r--r--lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp133
-rw-r--r--lib/Target/AMDGPU/SIFoldOperands.cpp204
-rw-r--r--lib/Target/AMDGPU/SIFrameLowering.cpp243
-rw-r--r--lib/Target/AMDGPU/SIFrameLowering.h34
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.cpp660
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.h12
-rw-r--r--lib/Target/AMDGPU/SIInsertWaits.cpp88
-rw-r--r--lib/Target/AMDGPU/SIInstrFormats.td44
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.cpp1332
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.h154
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.td837
-rw-r--r--lib/Target/AMDGPU/SIInstructions.td485
-rw-r--r--lib/Target/AMDGPU/SILowerControlFlow.cpp36
-rw-r--r--lib/Target/AMDGPU/SILowerI1Copies.cpp1
-rw-r--r--lib/Target/AMDGPU/SIMachineFunctionInfo.cpp109
-rw-r--r--lib/Target/AMDGPU/SIMachineFunctionInfo.h232
-rw-r--r--lib/Target/AMDGPU/SIPrepareScratchRegs.cpp193
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.cpp271
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.h64
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.td117
-rw-r--r--lib/Target/AMDGPU/SISchedule.td18
-rw-r--r--lib/Target/AMDGPU/SIShrinkInstructions.cpp61
-rw-r--r--lib/Target/AMDGPU/SITypeRewriter.cpp10
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp97
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h26
-rw-r--r--lib/Target/AMDGPU/Utils/LLVMBuild.txt2
-rw-r--r--lib/Target/AMDGPU/VIInstructions.td61
-rw-r--r--lib/Target/ARM/ARM.h1
-rw-r--r--lib/Target/ARM/ARM.td708
-rw-r--r--lib/Target/ARM/ARMAsmPrinter.cpp119
-rw-r--r--lib/Target/ARM/ARMAsmPrinter.h9
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.cpp174
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.h23
-rw-r--r--lib/Target/ARM/ARMBaseRegisterInfo.cpp17
-rw-r--r--lib/Target/ARM/ARMBaseRegisterInfo.h8
-rw-r--r--lib/Target/ARM/ARMCallingConv.h22
-rw-r--r--lib/Target/ARM/ARMCallingConv.td2
-rw-r--r--lib/Target/ARM/ARMConstantIslandPass.cpp44
-rw-r--r--lib/Target/ARM/ARMConstantPoolValue.cpp3
-rw-r--r--lib/Target/ARM/ARMConstantPoolValue.h5
-rw-r--r--lib/Target/ARM/ARMExpandPseudoInsts.cpp64
-rw-r--r--lib/Target/ARM/ARMFastISel.cpp112
-rw-r--r--lib/Target/ARM/ARMFrameLowering.cpp202
-rw-r--r--lib/Target/ARM/ARMFrameLowering.h8
-rw-r--r--lib/Target/ARM/ARMISelDAGToDAG.cpp275
-rw-r--r--lib/Target/ARM/ARMISelLowering.cpp2101
-rw-r--r--lib/Target/ARM/ARMISelLowering.h49
-rw-r--r--lib/Target/ARM/ARMInstrInfo.cpp77
-rw-r--r--lib/Target/ARM/ARMInstrInfo.td73
-rw-r--r--lib/Target/ARM/ARMInstrNEON.td440
-rw-r--r--lib/Target/ARM/ARMInstrThumb.td163
-rw-r--r--lib/Target/ARM/ARMInstrThumb2.td95
-rw-r--r--lib/Target/ARM/ARMInstrVFP.td55
-rw-r--r--lib/Target/ARM/ARMLoadStoreOptimizer.cpp504
-rw-r--r--lib/Target/ARM/ARMMachineFunctionInfo.cpp5
-rw-r--r--lib/Target/ARM/ARMMachineFunctionInfo.h14
-rw-r--r--lib/Target/ARM/ARMRegisterInfo.td37
-rw-r--r--lib/Target/ARM/ARMScheduleSwift.td1046
-rw-r--r--lib/Target/ARM/ARMSelectionDAGInfo.cpp56
-rw-r--r--lib/Target/ARM/ARMSubtarget.cpp175
-rw-r--r--lib/Target/ARM/ARMSubtarget.h72
-rw-r--r--lib/Target/ARM/ARMTargetMachine.cpp50
-rw-r--r--lib/Target/ARM/ARMTargetMachine.h3
-rw-r--r--lib/Target/ARM/ARMTargetTransformInfo.cpp170
-rw-r--r--lib/Target/ARM/ARMTargetTransformInfo.h33
-rw-r--r--lib/Target/ARM/AsmParser/ARMAsmParser.cpp204
-rw-r--r--lib/Target/ARM/Disassembler/ARMDisassembler.cpp113
-rw-r--r--lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp3
-rw-r--r--lib/Target/ARM/InstPrinter/ARMInstPrinter.h3
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp389
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h7
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h8
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp4
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp54
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp4
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h59
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCExpr.h4
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp99
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h3
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp32
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp4
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp12
-rw-r--r--lib/Target/ARM/README.txt1
-rw-r--r--lib/Target/ARM/Thumb1FrameLowering.cpp291
-rw-r--r--lib/Target/ARM/Thumb1FrameLowering.h36
-rw-r--r--lib/Target/ARM/Thumb1InstrInfo.cpp16
-rw-r--r--lib/Target/ARM/Thumb2ITBlockPass.cpp4
-rw-r--r--lib/Target/ARM/Thumb2InstrInfo.cpp16
-rw-r--r--lib/Target/ARM/Thumb2SizeReduction.cpp47
-rw-r--r--lib/Target/AVR/AVR.td563
-rw-r--r--lib/Target/AVR/AVRCallingConv.td65
-rw-r--r--lib/Target/AVR/AVRConfig.h15
-rw-r--r--lib/Target/AVR/AVRMachineFunctionInfo.h73
-rw-r--r--lib/Target/AVR/AVRRegisterInfo.td216
-rw-r--r--lib/Target/AVR/AVRTargetMachine.cpp4
-rw-r--r--lib/Target/AVR/CMakeLists.txt14
-rw-r--r--lib/Target/AVR/LLVMBuild.txt33
-rw-r--r--lib/Target/AVR/Makefile19
-rw-r--r--lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp25
-rw-r--r--lib/Target/AVR/TargetInfo/CMakeLists.txt7
-rw-r--r--lib/Target/AVR/TargetInfo/LLVMBuild.txt23
-rw-r--r--lib/Target/AVR/TargetInfo/Makefile16
-rw-r--r--lib/Target/BPF/BPF.td7
-rw-r--r--lib/Target/BPF/BPFISelLowering.cpp3
-rw-r--r--lib/Target/BPF/InstPrinter/BPFInstPrinter.h2
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp25
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp4
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h2
-rw-r--r--lib/Target/CppBackend/CPPBackend.cpp91
-rw-r--r--lib/Target/Hexagon/AsmParser/CMakeLists.txt7
-rw-r--r--lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp2152
-rw-r--r--lib/Target/Hexagon/AsmParser/LLVMBuild.txt23
-rw-r--r--lib/Target/Hexagon/AsmParser/Makefile15
-rw-r--r--lib/Target/Hexagon/BitTracker.cpp12
-rw-r--r--lib/Target/Hexagon/BitTracker.h16
-rw-r--r--lib/Target/Hexagon/CMakeLists.txt10
-rw-r--r--lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp1010
-rw-r--r--lib/Target/Hexagon/Disassembler/LLVMBuild.txt2
-rw-r--r--lib/Target/Hexagon/Hexagon.h7
-rw-r--r--lib/Target/Hexagon/Hexagon.td82
-rw-r--r--lib/Target/Hexagon/HexagonAsmPrinter.cpp435
-rwxr-xr-xlib/Target/Hexagon/HexagonAsmPrinter.h4
-rw-r--r--lib/Target/Hexagon/HexagonBitSimplify.cpp2778
-rw-r--r--lib/Target/Hexagon/HexagonBitTracker.cpp33
-rw-r--r--lib/Target/Hexagon/HexagonCFGOptimizer.cpp27
-rw-r--r--lib/Target/Hexagon/HexagonCommonGEP.cpp43
-rw-r--r--lib/Target/Hexagon/HexagonEarlyIfConv.cpp1063
-rw-r--r--lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonFrameLowering.cpp435
-rw-r--r--lib/Target/Hexagon/HexagonFrameLowering.h11
-rw-r--r--lib/Target/Hexagon/HexagonGenExtract.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonGenInsert.cpp15
-rw-r--r--lib/Target/Hexagon/HexagonGenMux.cpp319
-rw-r--r--lib/Target/Hexagon/HexagonGenPredicate.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonHardwareLoops.cpp16
-rw-r--r--lib/Target/Hexagon/HexagonISelDAGToDAG.cpp160
-rw-r--r--lib/Target/Hexagon/HexagonISelLowering.cpp776
-rw-r--r--lib/Target/Hexagon/HexagonISelLowering.h48
-rw-r--r--lib/Target/Hexagon/HexagonInstrAlias.td462
-rw-r--r--lib/Target/Hexagon/HexagonInstrEnc.td1019
-rw-r--r--lib/Target/Hexagon/HexagonInstrFormats.td46
-rw-r--r--lib/Target/Hexagon/HexagonInstrFormatsV4.td2
-rw-r--r--lib/Target/Hexagon/HexagonInstrFormatsV60.td238
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.cpp3980
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.h390
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.td65
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfoV4.td130
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfoV5.td10
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfoV60.td2241
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfoVector.td43
-rw-r--r--lib/Target/Hexagon/HexagonIntrinsics.td24
-rw-r--r--lib/Target/Hexagon/HexagonIntrinsicsV60.td836
-rw-r--r--lib/Target/Hexagon/HexagonMCInstLower.cpp60
-rw-r--r--lib/Target/Hexagon/HexagonMachineScheduler.cpp6
-rw-r--r--lib/Target/Hexagon/HexagonNewValueJump.cpp53
-rw-r--r--lib/Target/Hexagon/HexagonOperands.td368
-rw-r--r--lib/Target/Hexagon/HexagonOptimizeSZextends.cpp150
-rw-r--r--lib/Target/Hexagon/HexagonPeephole.cpp4
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.cpp94
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.h2
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.td81
-rw-r--r--lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp91
-rw-r--r--lib/Target/Hexagon/HexagonSchedule.td8
-rw-r--r--lib/Target/Hexagon/HexagonScheduleV4.td5
-rw-r--r--lib/Target/Hexagon/HexagonScheduleV55.td170
-rw-r--r--lib/Target/Hexagon/HexagonScheduleV60.td310
-rw-r--r--lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp50
-rw-r--r--lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonSplitDouble.cpp1209
-rw-r--r--lib/Target/Hexagon/HexagonStoreWidening.cpp616
-rw-r--r--lib/Target/Hexagon/HexagonSubtarget.cpp92
-rw-r--r--lib/Target/Hexagon/HexagonSubtarget.h20
-rw-r--r--lib/Target/Hexagon/HexagonTargetMachine.cpp97
-rw-r--r--lib/Target/Hexagon/HexagonTargetMachine.h15
-rw-r--r--lib/Target/Hexagon/HexagonTargetObjectFile.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonTargetTransformInfo.cpp38
-rw-r--r--lib/Target/Hexagon/HexagonTargetTransformInfo.h70
-rw-r--r--lib/Target/Hexagon/HexagonVLIWPacketizer.cpp1975
-rw-r--r--lib/Target/Hexagon/HexagonVLIWPacketizer.h114
-rw-r--r--lib/Target/Hexagon/LLVMBuild.txt3
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt2
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp52
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h65
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp268
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h119
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h13
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp581
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h218
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp24
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp29
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp126
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp12
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp49
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h35
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp228
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h58
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp52
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h17
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp90
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h52
-rw-r--r--lib/Target/Hexagon/Makefile3
-rw-r--r--lib/Target/LLVMBuild.txt1
-rw-r--r--lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h2
-rw-r--r--lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h13
-rw-r--r--lib/Target/MSP430/MSP430BranchSelector.cpp2
-rw-r--r--lib/Target/MSP430/MSP430ISelLowering.cpp17
-rw-r--r--lib/Target/MSP430/MSP430InstrInfo.cpp18
-rw-r--r--lib/Target/MSP430/MSP430MCInstLower.cpp8
-rw-r--r--lib/Target/MSP430/MSP430MachineFunctionInfo.cpp2
-rw-r--r--lib/Target/MSP430/MSP430MachineFunctionInfo.h2
-rw-r--r--lib/Target/MSP430/README.txt2
-rw-r--r--lib/Target/Mips/AsmParser/MipsAsmParser.cpp2226
-rw-r--r--lib/Target/Mips/Disassembler/MipsDisassembler.cpp318
-rw-r--r--lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp2
-rw-r--r--lib/Target/Mips/InstPrinter/MipsInstPrinter.h2
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp13
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsABIInfo.h7
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp62
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h1
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp17
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp2
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h3
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h5
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h13
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp73
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h25
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCExpr.h4
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp82
-rw-r--r--lib/Target/Mips/MicroMips32r6InstrFormats.td579
-rw-r--r--lib/Target/Mips/MicroMips32r6InstrInfo.td887
-rw-r--r--lib/Target/Mips/MicroMips64r6InstrFormats.td86
-rw-r--r--lib/Target/Mips/MicroMips64r6InstrInfo.td119
-rw-r--r--lib/Target/Mips/MicroMipsDSPInstrFormats.td244
-rw-r--r--lib/Target/Mips/MicroMipsDSPInstrInfo.td528
-rw-r--r--lib/Target/Mips/MicroMipsInstrFPU.td28
-rw-r--r--lib/Target/Mips/MicroMipsInstrFormats.td81
-rw-r--r--lib/Target/Mips/MicroMipsInstrInfo.td174
-rw-r--r--lib/Target/Mips/Mips.td17
-rw-r--r--lib/Target/Mips/Mips16FrameLowering.cpp8
-rw-r--r--lib/Target/Mips/Mips16HardFloat.cpp202
-rw-r--r--lib/Target/Mips/Mips16ISelDAGToDAG.cpp2
-rw-r--r--lib/Target/Mips/Mips16ISelLowering.cpp9
-rw-r--r--lib/Target/Mips/Mips16InstrInfo.cpp6
-rw-r--r--lib/Target/Mips/Mips16InstrInfo.td120
-rw-r--r--lib/Target/Mips/Mips32r6InstrInfo.td298
-rw-r--r--lib/Target/Mips/Mips64InstrInfo.td91
-rw-r--r--lib/Target/Mips/Mips64r6InstrInfo.td12
-rw-r--r--lib/Target/Mips/MipsAsmPrinter.cpp13
-rw-r--r--lib/Target/Mips/MipsCCState.cpp16
-rw-r--r--lib/Target/Mips/MipsCallingConv.td25
-rw-r--r--lib/Target/Mips/MipsConstantIslandPass.cpp31
-rw-r--r--lib/Target/Mips/MipsDSPInstrFormats.td38
-rw-r--r--lib/Target/Mips/MipsDSPInstrInfo.td378
-rw-r--r--lib/Target/Mips/MipsDelaySlotFiller.cpp34
-rw-r--r--lib/Target/Mips/MipsEVAInstrFormats.td84
-rw-r--r--lib/Target/Mips/MipsEVAInstrInfo.td192
-rw-r--r--lib/Target/Mips/MipsFastISel.cpp59
-rw-r--r--lib/Target/Mips/MipsISelLowering.cpp125
-rw-r--r--lib/Target/Mips/MipsISelLowering.h37
-rw-r--r--lib/Target/Mips/MipsInstrFPU.td42
-rw-r--r--lib/Target/Mips/MipsInstrFormats.td8
-rw-r--r--lib/Target/Mips/MipsInstrInfo.cpp4
-rw-r--r--lib/Target/Mips/MipsInstrInfo.td510
-rw-r--r--lib/Target/Mips/MipsLongBranch.cpp9
-rw-r--r--lib/Target/Mips/MipsMSAInstrFormats.td24
-rw-r--r--lib/Target/Mips/MipsMSAInstrInfo.td210
-rw-r--r--lib/Target/Mips/MipsMachineFunction.cpp69
-rw-r--r--lib/Target/Mips/MipsMachineFunction.h51
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.cpp59
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.h4
-rw-r--r--lib/Target/Mips/MipsSEFrameLowering.cpp241
-rw-r--r--lib/Target/Mips/MipsSEFrameLowering.h10
-rw-r--r--lib/Target/Mips/MipsSEISelDAGToDAG.cpp2
-rw-r--r--lib/Target/Mips/MipsSEISelLowering.cpp22
-rw-r--r--lib/Target/Mips/MipsSEInstrInfo.cpp85
-rw-r--r--lib/Target/Mips/MipsSEInstrInfo.h2
-rw-r--r--lib/Target/Mips/MipsSERegisterInfo.cpp6
-rw-r--r--lib/Target/Mips/MipsSchedule.td68
-rw-r--r--lib/Target/Mips/MipsScheduleP5600.td392
-rw-r--r--lib/Target/Mips/MipsSubtarget.cpp5
-rw-r--r--lib/Target/Mips/MipsSubtarget.h21
-rw-r--r--lib/Target/Mips/MipsTargetMachine.cpp2
-rw-r--r--lib/Target/Mips/MipsTargetObjectFile.cpp24
-rw-r--r--lib/Target/Mips/MipsTargetObjectFile.h4
-rw-r--r--lib/Target/Mips/MipsTargetStreamer.h19
-rw-r--r--lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h2
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h1
-rw-r--r--lib/Target/NVPTX/NVPTX.h18
-rw-r--r--lib/Target/NVPTX/NVPTXAsmPrinter.cpp132
-rw-r--r--lib/Target/NVPTX/NVPTXAsmPrinter.h15
-rw-r--r--lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp10
-rw-r--r--lib/Target/NVPTX/NVPTXGenericToNVVM.cpp18
-rw-r--r--lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp59
-rw-r--r--lib/Target/NVPTX/NVPTXISelLowering.cpp183
-rw-r--r--lib/Target/NVPTX/NVPTXISelLowering.h17
-rw-r--r--lib/Target/NVPTX/NVPTXInstrInfo.cpp66
-rw-r--r--lib/Target/NVPTX/NVPTXInstrInfo.h1
-rw-r--r--lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp353
-rw-r--r--lib/Target/NVPTX/NVPTXLowerAlloca.cpp2
-rw-r--r--lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp102
-rw-r--r--lib/Target/NVPTX/NVPTXMCExpr.h4
-rw-r--r--lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp2
-rw-r--r--lib/Target/NVPTX/NVPTXSection.h9
-rw-r--r--lib/Target/NVPTX/NVPTXTargetMachine.cpp60
-rw-r--r--lib/Target/NVPTX/NVPTXTargetObjectFile.h5
-rw-r--r--lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp4
-rw-r--r--lib/Target/NVPTX/NVPTXTargetTransformInfo.h2
-rw-r--r--lib/Target/NVPTX/NVPTXUtilities.cpp105
-rw-r--r--lib/Target/NVPTX/NVPTXUtilities.h21
-rw-r--r--lib/Target/NVPTX/NVPTXVector.td58
-rw-r--r--lib/Target/NVPTX/NVVMReflect.cpp4
-rw-r--r--lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp24
-rw-r--r--lib/Target/PowerPC/CMakeLists.txt2
-rw-r--r--lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp2
-rw-r--r--lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h2
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp16
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h25
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h4
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp4
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h8
-rw-r--r--lib/Target/PowerPC/PPC.h3
-rw-r--r--lib/Target/PowerPC/PPC.td14
-rw-r--r--lib/Target/PowerPC/PPCAsmPrinter.cpp267
-rw-r--r--lib/Target/PowerPC/PPCBoolRetToInt.cpp253
-rw-r--r--lib/Target/PowerPC/PPCBranchSelector.cpp2
-rw-r--r--lib/Target/PowerPC/PPCCTRLoops.cpp26
-rw-r--r--lib/Target/PowerPC/PPCEarlyReturn.cpp35
-rw-r--r--lib/Target/PowerPC/PPCFastISel.cpp87
-rw-r--r--lib/Target/PowerPC/PPCFrameLowering.cpp242
-rw-r--r--lib/Target/PowerPC/PPCFrameLowering.h31
-rw-r--r--lib/Target/PowerPC/PPCISelDAGToDAG.cpp140
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.cpp502
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.h22
-rw-r--r--lib/Target/PowerPC/PPCInstr64Bit.td2
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.cpp131
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.h53
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.td11
-rw-r--r--lib/Target/PowerPC/PPCInstrQPX.td20
-rw-r--r--lib/Target/PowerPC/PPCInstrVSX.td612
-rw-r--r--lib/Target/PowerPC/PPCLoopDataPrefetch.cpp9
-rw-r--r--lib/Target/PowerPC/PPCLoopPreIncPrep.cpp131
-rw-r--r--lib/Target/PowerPC/PPCMCInstLower.cpp6
-rw-r--r--lib/Target/PowerPC/PPCMIPeephole.cpp230
-rw-r--r--lib/Target/PowerPC/PPCMachineFunctionInfo.cpp4
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.cpp114
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.h15
-rw-r--r--lib/Target/PowerPC/PPCSubtarget.cpp31
-rw-r--r--lib/Target/PowerPC/PPCSubtarget.h11
-rw-r--r--lib/Target/PowerPC/PPCTargetMachine.cpp52
-rw-r--r--lib/Target/PowerPC/PPCTargetObjectFile.cpp4
-rw-r--r--lib/Target/PowerPC/PPCTargetTransformInfo.cpp141
-rw-r--r--lib/Target/PowerPC/PPCTargetTransformInfo.h32
-rw-r--r--lib/Target/PowerPC/PPCVSXCopy.cpp14
-rw-r--r--lib/Target/PowerPC/PPCVSXFMAMutate.cpp53
-rw-r--r--lib/Target/PowerPC/PPCVSXSwapRemoval.cpp89
-rw-r--r--lib/Target/Sparc/AsmParser/SparcAsmParser.cpp248
-rw-r--r--lib/Target/Sparc/DelaySlotFiller.cpp2
-rw-r--r--lib/Target/Sparc/Disassembler/SparcDisassembler.cpp52
-rw-r--r--lib/Target/Sparc/InstPrinter/SparcInstPrinter.h3
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h1
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h4
-rw-r--r--lib/Target/Sparc/SparcAsmPrinter.cpp8
-rw-r--r--lib/Target/Sparc/SparcCallingConv.td9
-rw-r--r--lib/Target/Sparc/SparcFrameLowering.cpp154
-rw-r--r--lib/Target/Sparc/SparcFrameLowering.h8
-rw-r--r--lib/Target/Sparc/SparcISelDAGToDAG.cpp183
-rw-r--r--lib/Target/Sparc/SparcISelLowering.cpp295
-rw-r--r--lib/Target/Sparc/SparcISelLowering.h18
-rw-r--r--lib/Target/Sparc/SparcInstrAliases.td9
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.cpp39
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.td146
-rw-r--r--lib/Target/Sparc/SparcRegisterInfo.cpp64
-rw-r--r--lib/Target/Sparc/SparcRegisterInfo.h4
-rw-r--r--lib/Target/Sparc/SparcRegisterInfo.td54
-rw-r--r--lib/Target/Sparc/SparcSubtarget.cpp6
-rw-r--r--lib/Target/Sparc/SparcSubtarget.h3
-rw-r--r--lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp19
-rw-r--r--lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp8
-rw-r--r--lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h1
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp2
-rw-r--r--lib/Target/SystemZ/README.txt6
-rw-r--r--lib/Target/SystemZ/SystemZAsmPrinter.cpp2
-rw-r--r--lib/Target/SystemZ/SystemZConstantPoolValue.cpp15
-rw-r--r--lib/Target/SystemZ/SystemZConstantPoolValue.h1
-rw-r--r--lib/Target/SystemZ/SystemZElimCompare.cpp86
-rw-r--r--lib/Target/SystemZ/SystemZFrameLowering.cpp24
-rw-r--r--lib/Target/SystemZ/SystemZFrameLowering.h3
-rw-r--r--lib/Target/SystemZ/SystemZISelDAGToDAG.cpp24
-rw-r--r--lib/Target/SystemZ/SystemZISelLowering.cpp159
-rw-r--r--lib/Target/SystemZ/SystemZISelLowering.h20
-rw-r--r--lib/Target/SystemZ/SystemZInstrBuilder.h8
-rw-r--r--lib/Target/SystemZ/SystemZInstrFP.td55
-rw-r--r--lib/Target/SystemZ/SystemZInstrFormats.td1
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.cpp77
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.h4
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.td34
-rw-r--r--lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp2
-rw-r--r--lib/Target/SystemZ/SystemZMachineFunctionInfo.h2
-rw-r--r--lib/Target/SystemZ/SystemZRegisterInfo.cpp4
-rw-r--r--lib/Target/SystemZ/SystemZRegisterInfo.td3
-rw-r--r--lib/Target/SystemZ/SystemZShortenInst.cpp128
-rw-r--r--lib/Target/SystemZ/SystemZTargetMachine.cpp24
-rw-r--r--lib/Target/SystemZ/SystemZTargetMachine.h3
-rw-r--r--lib/Target/SystemZ/SystemZTargetTransformInfo.cpp10
-rw-r--r--lib/Target/SystemZ/SystemZTargetTransformInfo.h11
-rw-r--r--lib/Target/TargetLoweringObjectFile.cpp57
-rw-r--r--lib/Target/TargetMachine.cpp20
-rw-r--r--lib/Target/TargetMachineC.cpp46
-rw-r--r--lib/Target/TargetRecip.cpp12
-rw-r--r--lib/Target/WebAssembly/CMakeLists.txt21
-rw-r--r--lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp94
-rw-r--r--lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h18
-rw-r--r--lib/Target/WebAssembly/LLVMBuild.txt2
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/CMakeLists.txt3
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp103
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp54
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp5
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h4
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp100
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp41
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h16
-rw-r--r--lib/Target/WebAssembly/Makefile10
-rw-r--r--lib/Target/WebAssembly/README.txt68
-rw-r--r--lib/Target/WebAssembly/Relooper.cpp984
-rw-r--r--lib/Target/WebAssembly/Relooper.h186
-rw-r--r--lib/Target/WebAssembly/WebAssembly.h14
-rw-r--r--lib/Target/WebAssembly/WebAssembly.td12
-rw-r--r--lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp110
-rw-r--r--lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp285
-rw-r--r--lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp468
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFastISel.cpp81
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp117
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFrameLowering.h3
-rw-r--r--lib/Target/WebAssembly/WebAssemblyISD.def25
-rw-r--r--lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp57
-rw-r--r--lib/Target/WebAssembly/WebAssemblyISelLowering.cpp602
-rw-r--r--lib/Target/WebAssembly/WebAssemblyISelLowering.h50
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrCall.td67
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrControl.td82
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrConv.td125
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrFloat.td117
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrFormats.td84
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp133
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInfo.h20
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInfo.td115
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInteger.td105
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrMemory.td529
-rw-r--r--lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp133
-rw-r--r--lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp106
-rw-r--r--lib/Target/WebAssembly/WebAssemblyMCInstLower.h45
-rw-r--r--lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp6
-rw-r--r--lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h68
-rw-r--r--lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp76
-rw-r--r--lib/Target/WebAssembly/WebAssemblyPEI.cpp1066
-rw-r--r--lib/Target/WebAssembly/WebAssemblyPeephole.cpp86
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegColoring.cpp175
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp109
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegStackify.cpp265
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp60
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegisterInfo.h6
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegisterInfo.td32
-rw-r--r--lib/Target/WebAssembly/WebAssemblyStoreResults.cpp124
-rw-r--r--lib/Target/WebAssembly/WebAssemblySubtarget.cpp1
-rw-r--r--lib/Target/WebAssembly/WebAssemblySubtarget.h8
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp96
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp24
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h43
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp5
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h4
-rw-r--r--lib/Target/WebAssembly/known_gcc_test_failures.txt311
-rw-r--r--lib/Target/X86/AsmParser/CMakeLists.txt3
-rw-r--r--lib/Target/X86/AsmParser/LLVMBuild.txt2
-rw-r--r--lib/Target/X86/AsmParser/Makefile2
-rw-r--r--lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp274
-rw-r--r--lib/Target/X86/AsmParser/X86AsmInstrumentation.h10
-rw-r--r--lib/Target/X86/AsmParser/X86AsmParser.cpp319
-rw-r--r--lib/Target/X86/AsmParser/X86AsmParserCommon.h19
-rw-r--r--lib/Target/X86/CMakeLists.txt11
-rw-r--r--lib/Target/X86/Disassembler/X86Disassembler.cpp7
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp47
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoder.h2
-rw-r--r--lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp1
-rw-r--r--lib/Target/X86/InstPrinter/X86ATTInstPrinter.h3
-rw-r--r--lib/Target/X86/InstPrinter/X86InstComments.cpp645
-rw-r--r--lib/Target/X86/InstPrinter/X86IntelInstPrinter.h2
-rw-r--r--lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp83
-rw-r--r--lib/Target/X86/MCTargetDesc/X86BaseInfo.h48
-rw-r--r--lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp11
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h69
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp11
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp184
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h13
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp99
-rw-r--r--lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp4
-rw-r--r--lib/Target/X86/Utils/X86ShuffleDecode.cpp199
-rw-r--r--lib/Target/X86/Utils/X86ShuffleDecode.h27
-rw-r--r--lib/Target/X86/X86.h59
-rw-r--r--lib/Target/X86/X86.td655
-rw-r--r--lib/Target/X86/X86AsmPrinter.cpp25
-rw-r--r--lib/Target/X86/X86AsmPrinter.h2
-rw-r--r--lib/Target/X86/X86CallFrameOptimization.cpp109
-rw-r--r--lib/Target/X86/X86CallingConv.h59
-rw-r--r--lib/Target/X86/X86CallingConv.td79
-rw-r--r--lib/Target/X86/X86CompilationCallback_Win64.asm68
-rw-r--r--lib/Target/X86/X86ExpandPseudo.cpp21
-rw-r--r--lib/Target/X86/X86FastISel.cpp168
-rw-r--r--lib/Target/X86/X86FixupLEAs.cpp89
-rw-r--r--lib/Target/X86/X86FloatingPoint.cpp76
-rw-r--r--lib/Target/X86/X86FrameLowering.cpp1162
-rw-r--r--lib/Target/X86/X86FrameLowering.h65
-rw-r--r--lib/Target/X86/X86ISelDAGToDAG.cpp505
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp6416
-rw-r--r--lib/Target/X86/X86ISelLowering.h124
-rw-r--r--lib/Target/X86/X86InstrAVX512.td3696
-rw-r--r--lib/Target/X86/X86InstrArithmetic.td69
-rw-r--r--lib/Target/X86/X86InstrBuilder.h7
-rw-r--r--lib/Target/X86/X86InstrCMovSetCC.td2
-rw-r--r--lib/Target/X86/X86InstrCompiler.td216
-rw-r--r--lib/Target/X86/X86InstrControl.td13
-rw-r--r--lib/Target/X86/X86InstrFMA.td252
-rw-r--r--lib/Target/X86/X86InstrFPStack.td136
-rw-r--r--lib/Target/X86/X86InstrFragmentsSIMD.td336
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp1514
-rw-r--r--lib/Target/X86/X86InstrInfo.h142
-rw-r--r--lib/Target/X86/X86InstrInfo.td257
-rw-r--r--lib/Target/X86/X86InstrMMX.td24
-rw-r--r--lib/Target/X86/X86InstrSSE.td761
-rw-r--r--lib/Target/X86/X86InstrShiftRotate.td140
-rw-r--r--lib/Target/X86/X86InstrSystem.td91
-rw-r--r--lib/Target/X86/X86InstrXOP.td115
-rw-r--r--lib/Target/X86/X86IntrinsicsInfo.h980
-rw-r--r--lib/Target/X86/X86MCInstLower.cpp216
-rw-r--r--lib/Target/X86/X86MachineFunctionInfo.cpp2
-rw-r--r--lib/Target/X86/X86MachineFunctionInfo.h8
-rw-r--r--lib/Target/X86/X86OptimizeLEAs.cpp326
-rw-r--r--lib/Target/X86/X86PadShortFunction.cpp5
-rw-r--r--lib/Target/X86/X86RegisterInfo.cpp318
-rw-r--r--lib/Target/X86/X86RegisterInfo.h25
-rw-r--r--lib/Target/X86/X86RegisterInfo.td41
-rw-r--r--lib/Target/X86/X86SelectionDAGInfo.cpp43
-rw-r--r--lib/Target/X86/X86Subtarget.cpp46
-rw-r--r--lib/Target/X86/X86Subtarget.h57
-rw-r--r--lib/Target/X86/X86TargetMachine.cpp17
-rw-r--r--lib/Target/X86/X86TargetObjectFile.cpp8
-rw-r--r--lib/Target/X86/X86TargetObjectFile.h2
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.cpp879
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.h64
-rw-r--r--lib/Target/X86/X86WinEHState.cpp234
-rw-r--r--lib/Target/XCore/Disassembler/XCoreDisassembler.cpp2
-rw-r--r--lib/Target/XCore/InstPrinter/XCoreInstPrinter.h2
-rw-r--r--lib/Target/XCore/XCoreAsmPrinter.cpp18
-rw-r--r--lib/Target/XCore/XCoreFrameLowering.cpp38
-rw-r--r--lib/Target/XCore/XCoreISelDAGToDAG.cpp5
-rw-r--r--lib/Target/XCore/XCoreISelLowering.cpp26
-rw-r--r--lib/Target/XCore/XCoreISelLowering.h14
-rw-r--r--lib/Target/XCore/XCoreInstrInfo.cpp18
-rw-r--r--lib/Target/XCore/XCoreLowerThreadLocal.cpp9
-rw-r--r--lib/Target/XCore/XCoreMachineFunctionInfo.cpp2
-rw-r--r--lib/Target/XCore/XCoreMachineFunctionInfo.h2
-rw-r--r--lib/Target/XCore/XCoreTargetMachine.cpp2
-rw-r--r--lib/Target/XCore/XCoreTargetObjectFile.cpp14
-rw-r--r--lib/Target/XCore/XCoreTargetObjectFile.h2
-rw-r--r--lib/Target/XCore/XCoreTargetTransformInfo.h2
680 files changed, 74334 insertions, 24223 deletions
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index 9a7d6c884db5..0bff9b592c15 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -32,6 +32,15 @@ def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
"Enable ARMv8 CRC-32 checksum instructions">;
+def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
+ "Enable ARMv8 PMUv3 Performance Monitors extension">;
+
+def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true",
+ "Full FP16", [FeatureFPARMv8]>;
+
+def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true",
+ "Enable Statistical Profiling extension">;
+
/// Cyclone has register move instructions which are "free".
def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
"Has zero-cycle register moves">;
@@ -40,6 +49,15 @@ def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
"Has zero-cycle zeroing instructions">;
+def FeatureStrictAlign : SubtargetFeature<"strict-align",
+ "StrictAlign", "true",
+ "Disallow all unaligned memory "
+ "access">;
+
+def FeatureReserveX18 : SubtargetFeature<"reserve-x18", "ReserveX18", "true",
+ "Reserve X18, making it unavailable "
+ "as a GPR">;
+
//===----------------------------------------------------------------------===//
// Architectures.
//
@@ -47,6 +65,9 @@ def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
"Support ARM v8.1a instructions", [FeatureCRC]>;
+def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
+ "Support ARM v8.2a instructions", [HasV8_1aOps]>;
+
//===----------------------------------------------------------------------===//
// Register File Description
//===----------------------------------------------------------------------===//
@@ -70,19 +91,29 @@ include "AArch64SchedA53.td"
include "AArch64SchedA57.td"
include "AArch64SchedCyclone.td"
+def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
+ "Cortex-A35 ARM processors",
+ [FeatureFPARMv8,
+ FeatureNEON,
+ FeatureCrypto,
+ FeatureCRC,
+ FeaturePerfMon]>;
+
def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
"Cortex-A53 ARM processors",
[FeatureFPARMv8,
FeatureNEON,
FeatureCrypto,
- FeatureCRC]>;
+ FeatureCRC,
+ FeaturePerfMon]>;
def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
"Cortex-A57 ARM processors",
[FeatureFPARMv8,
FeatureNEON,
FeatureCrypto,
- FeatureCRC]>;
+ FeatureCRC,
+ FeaturePerfMon]>;
def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
"Cyclone",
@@ -90,12 +121,16 @@ def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
FeatureNEON,
FeatureCrypto,
FeatureCRC,
+ FeaturePerfMon,
FeatureZCRegMove, FeatureZCZeroing]>;
def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8,
FeatureNEON,
- FeatureCRC]>;
+ FeatureCRC,
+ FeaturePerfMon]>;
+// FIXME: Cortex-A35 is currently modelled as a Cortex-A53
+def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
// FIXME: Cortex-A72 is currently modelled as an Cortex-A57.
@@ -109,11 +144,13 @@ def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
def GenericAsmParserVariant : AsmParserVariant {
int Variant = 0;
string Name = "generic";
+ string BreakCharacters = ".";
}
def AppleAsmParserVariant : AsmParserVariant {
int Variant = 1;
string Name = "apple-neon";
+ string BreakCharacters = ".";
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64A53Fix835769.cpp b/lib/Target/AArch64/AArch64A53Fix835769.cpp
index d7ef3f4ef653..d215d9e831c0 100644
--- a/lib/Target/AArch64/AArch64A53Fix835769.cpp
+++ b/lib/Target/AArch64/AArch64A53Fix835769.cpp
@@ -122,7 +122,7 @@ AArch64A53Fix835769::runOnMachineFunction(MachineFunction &F) {
static MachineBasicBlock *getBBFallenThrough(MachineBasicBlock *MBB,
const TargetInstrInfo *TII) {
// Get the previous machine basic block in the function.
- MachineFunction::iterator MBBI = *MBB;
+ MachineFunction::iterator MBBI(MBB);
// Can't go off top of function.
if (MBBI == MBB->getParent()->begin())
@@ -131,7 +131,7 @@ static MachineBasicBlock *getBBFallenThrough(MachineBasicBlock *MBB,
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 2> Cond;
- MachineBasicBlock *PrevBB = std::prev(MBBI);
+ MachineBasicBlock *PrevBB = &*std::prev(MBBI);
for (MachineBasicBlock *S : MBB->predecessors())
if (S == PrevBB && !TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond) &&
!TBB && !FBB)
@@ -151,10 +151,9 @@ static MachineInstr *getLastNonPseudo(MachineBasicBlock &MBB,
// If there is no non-pseudo in the current block, loop back around and try
// the previous block (if there is one).
while ((FMBB = getBBFallenThrough(FMBB, TII))) {
- for (auto I = FMBB->rbegin(), E = FMBB->rend(); I != E; ++I) {
- if (!I->isPseudo())
- return &*I;
- }
+ for (MachineInstr &I : make_range(FMBB->rbegin(), FMBB->rend()))
+ if (!I.isPseudo())
+ return &I;
}
// There was no previous non-pseudo in the fallen through blocks
@@ -217,8 +216,8 @@ AArch64A53Fix835769::runOnBasicBlock(MachineBasicBlock &MBB) {
++Idx;
}
- DEBUG(dbgs() << "Scan complete, "<< Sequences.size()
- << " occurences of pattern found.\n");
+ DEBUG(dbgs() << "Scan complete, " << Sequences.size()
+ << " occurrences of pattern found.\n");
// Then update the basic block, inserting nops between the detected sequences.
for (auto &MI : Sequences) {
diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
index 9d6dbd641a16..79a84ad8c6c5 100644
--- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
+++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -593,7 +593,6 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C,
if (Change) {
Substs[MO.getReg()] = Reg;
MO.setReg(Reg);
- MRI->setPhysRegUsed(Reg);
Changed = true;
}
diff --git a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
index 716e1a37b1f7..3afcdfb8b930 100644
--- a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
+++ b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
@@ -57,6 +57,8 @@ EnableMerge("aarch64-type-promotion-merge", cl::Hidden,
" the other."),
cl::init(true));
+#define AARCH64_TYPE_PROMO_NAME "AArch64 Address Type Promotion"
+
//===----------------------------------------------------------------------===//
// AArch64AddressTypePromotion
//===----------------------------------------------------------------------===//
@@ -76,7 +78,7 @@ public:
}
const char *getPassName() const override {
- return "AArch64 Address Type Promotion";
+ return AARCH64_TYPE_PROMO_NAME;
}
/// Iterate over the functions and promote the computation of interesting
@@ -143,10 +145,10 @@ private:
char AArch64AddressTypePromotion::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64AddressTypePromotion, "aarch64-type-promotion",
- "AArch64 Type Promotion Pass", false, false)
+ AARCH64_TYPE_PROMO_NAME, false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(AArch64AddressTypePromotion, "aarch64-type-promotion",
- "AArch64 Type Promotion Pass", false, false)
+ AARCH64_TYPE_PROMO_NAME, false, false)
FunctionPass *llvm::createAArch64AddressTypePromotionPass() {
return new AArch64AddressTypePromotion();
diff --git a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
index 18d21fd38618..1644d71d2821 100644
--- a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
+++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
@@ -61,6 +61,12 @@ STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used");
STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted");
STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted");
+namespace llvm {
+void initializeAArch64AdvSIMDScalarPass(PassRegistry &);
+}
+
+#define AARCH64_ADVSIMD_NAME "AdvSIMD Scalar Operation Optimization"
+
namespace {
class AArch64AdvSIMDScalar : public MachineFunctionPass {
MachineRegisterInfo *MRI;
@@ -82,12 +88,14 @@ private:
public:
static char ID; // Pass identification, replacement for typeid.
- explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {}
+ explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {
+ initializeAArch64AdvSIMDScalarPass(*PassRegistry::getPassRegistry());
+ }
bool runOnMachineFunction(MachineFunction &F) override;
const char *getPassName() const override {
- return "AdvSIMD Scalar Operation Optimization";
+ return AARCH64_ADVSIMD_NAME;
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -98,6 +106,9 @@ public:
char AArch64AdvSIMDScalar::ID = 0;
} // end anonymous namespace
+INITIALIZE_PASS(AArch64AdvSIMDScalar, "aarch64-simd-scalar",
+ AARCH64_ADVSIMD_NAME, false, false)
+
static bool isGPR64(unsigned Reg, unsigned SubReg,
const MachineRegisterInfo *MRI) {
if (SubReg)
@@ -381,7 +392,7 @@ bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
// Just check things on a one-block-at-a-time basis.
for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I)
- if (processMachineBasicBlock(I))
+ if (processMachineBasicBlock(&*I))
Changed = true;
return Changed;
}
diff --git a/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/lib/Target/AArch64/AArch64BranchRelaxation.cpp
index d973234dd86a..a614f555a4e9 100644
--- a/lib/Target/AArch64/AArch64BranchRelaxation.cpp
+++ b/lib/Target/AArch64/AArch64BranchRelaxation.cpp
@@ -45,6 +45,12 @@ BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
STATISTIC(NumSplit, "Number of basic blocks split");
STATISTIC(NumRelaxed, "Number of conditional branches relaxed");
+namespace llvm {
+void initializeAArch64BranchRelaxationPass(PassRegistry &);
+}
+
+#define AARCH64_BR_RELAX_NAME "AArch64 branch relaxation pass"
+
namespace {
class AArch64BranchRelaxation : public MachineFunctionPass {
/// BasicBlockInfo - Information about the offset and size of a single
@@ -93,17 +99,22 @@ class AArch64BranchRelaxation : public MachineFunctionPass {
public:
static char ID;
- AArch64BranchRelaxation() : MachineFunctionPass(ID) {}
+ AArch64BranchRelaxation() : MachineFunctionPass(ID) {
+ initializeAArch64BranchRelaxationPass(*PassRegistry::getPassRegistry());
+ }
bool runOnMachineFunction(MachineFunction &MF) override;
const char *getPassName() const override {
- return "AArch64 branch relaxation pass";
+ return AARCH64_BR_RELAX_NAME;
}
};
char AArch64BranchRelaxation::ID = 0;
}
+INITIALIZE_PASS(AArch64BranchRelaxation, "aarch64-branch-relax",
+ AARCH64_BR_RELAX_NAME, false, false)
+
/// verify - check BBOffsets, BBSizes, alignment of islands
void AArch64BranchRelaxation::verify() {
#ifndef NDEBUG
@@ -131,14 +142,14 @@ void AArch64BranchRelaxation::dumpBBs() {
/// into the block immediately after it.
static bool BBHasFallthrough(MachineBasicBlock *MBB) {
// Get the next machine basic block in the function.
- MachineFunction::iterator MBBI = MBB;
+ MachineFunction::iterator MBBI(MBB);
// Can't fall off end of function.
- MachineBasicBlock *NextBB = std::next(MBBI);
+ auto NextBB = std::next(MBBI);
if (NextBB == MBB->getParent()->end())
return false;
for (MachineBasicBlock *S : MBB->successors())
- if (S == NextBB)
+ if (S == &*NextBB)
return true;
return false;
@@ -216,9 +227,7 @@ AArch64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) {
// Create a new MBB for the code after the OrigBB.
MachineBasicBlock *NewBB =
MF->CreateMachineBasicBlock(OrigBB->getBasicBlock());
- MachineFunction::iterator MBBI = OrigBB;
- ++MBBI;
- MF->insert(MBBI, NewBB);
+ MF->insert(++OrigBB->getIterator(), NewBB);
// Splice the instructions starting with MI over to NewBB.
NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end());
@@ -421,7 +430,7 @@ bool AArch64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) {
MBB->replaceSuccessor(FBB, NewBB);
NewBB->addSuccessor(FBB);
}
- MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB));
+ MachineBasicBlock *NextBB = &*std::next(MachineFunction::iterator(MBB));
DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber()
<< ", invert condition and change dest. to BB#"
diff --git a/lib/Target/AArch64/AArch64CallingConvention.h b/lib/Target/AArch64/AArch64CallingConvention.h
index 1e2d1c3b93bd..bc44bc5f2461 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.h
+++ b/lib/Target/AArch64/AArch64CallingConvention.h
@@ -25,30 +25,28 @@
namespace {
using namespace llvm;
-static const uint16_t XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2,
- AArch64::X3, AArch64::X4, AArch64::X5,
- AArch64::X6, AArch64::X7};
-static const uint16_t HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2,
- AArch64::H3, AArch64::H4, AArch64::H5,
- AArch64::H6, AArch64::H7};
-static const uint16_t SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2,
- AArch64::S3, AArch64::S4, AArch64::S5,
- AArch64::S6, AArch64::S7};
-static const uint16_t DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2,
- AArch64::D3, AArch64::D4, AArch64::D5,
- AArch64::D6, AArch64::D7};
-static const uint16_t QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
- AArch64::Q3, AArch64::Q4, AArch64::Q5,
- AArch64::Q6, AArch64::Q7};
+static const MCPhysReg XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2,
+ AArch64::X3, AArch64::X4, AArch64::X5,
+ AArch64::X6, AArch64::X7};
+static const MCPhysReg HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2,
+ AArch64::H3, AArch64::H4, AArch64::H5,
+ AArch64::H6, AArch64::H7};
+static const MCPhysReg SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2,
+ AArch64::S3, AArch64::S4, AArch64::S5,
+ AArch64::S6, AArch64::S7};
+static const MCPhysReg DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2,
+ AArch64::D3, AArch64::D4, AArch64::D5,
+ AArch64::D6, AArch64::D7};
+static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
+ AArch64::Q3, AArch64::Q4, AArch64::Q5,
+ AArch64::Q6, AArch64::Q7};
static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
CCState &State, unsigned SlotAlign) {
unsigned Size = LocVT.getSizeInBits() / 8;
- unsigned StackAlign = State.getMachineFunction()
- .getTarget()
- .getDataLayout()
- ->getStackAlignment();
+ unsigned StackAlign =
+ State.getMachineFunction().getDataLayout().getStackAlignment();
unsigned Align = std::min(ArgFlags.getOrigAlign(), StackAlign);
for (auto &It : PendingMembers) {
@@ -88,7 +86,7 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
ISD::ArgFlagsTy &ArgFlags, CCState &State) {
// Try to allocate a contiguous block of registers, each of the correct
// size to hold one member.
- ArrayRef<uint16_t> RegList;
+ ArrayRef<MCPhysReg> RegList;
if (LocVT.SimpleTy == MVT::i64)
RegList = XRegList;
else if (LocVT.SimpleTy == MVT::f16)
diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td
index 815ebef177d8..388d64ec4e99 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/lib/Target/AArch64/AArch64CallingConvention.td
@@ -16,7 +16,7 @@ class CCIfAlign<string Align, CCAction A> :
CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>;
/// CCIfBigEndian - Match only if we're in big endian mode.
class CCIfBigEndian<CCAction A> :
- CCIf<"State.getMachineFunction().getTarget().getDataLayout()->isBigEndian()", A>;
+ CCIf<"State.getMachineFunction().getDataLayout().isBigEndian()", A>;
//===----------------------------------------------------------------------===//
// ARM AAPCS64 Calling Convention
@@ -279,6 +279,23 @@ def CSR_AArch64_TLS_Darwin
FP,
(sequence "Q%u", 0, 31))>;
+// We can only handle a register pair with adjacent registers, the register pair
+// should belong to the same class as well. Since the access function on the
+// fast path calls a function that follows CSR_AArch64_TLS_Darwin,
+// CSR_AArch64_CXX_TLS_Darwin should be a subset of CSR_AArch64_TLS_Darwin.
+def CSR_AArch64_CXX_TLS_Darwin
+ : CalleeSavedRegs<(add CSR_AArch64_AAPCS,
+ (sub (sequence "X%u", 1, 28), X15, X16, X17, X18),
+ (sequence "D%u", 0, 31))>;
+
+// CSRs that are handled by prologue, epilogue.
+def CSR_AArch64_CXX_TLS_Darwin_PE
+ : CalleeSavedRegs<(add LR, FP)>;
+
+// CSRs that are handled explicitly via copies.
+def CSR_AArch64_CXX_TLS_Darwin_ViaCopy
+ : CalleeSavedRegs<(sub CSR_AArch64_CXX_TLS_Darwin, LR, FP)>;
+
// The ELF stub used for TLS-descriptor access saves every feasible
// register. Only X0 and LR are clobbered.
def CSR_AArch64_TLS_ELF
diff --git a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
index 06ff9af37fd7..9310ac4a44a2 100644
--- a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
+++ b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
@@ -117,10 +117,10 @@ struct LDTLSCleanup : public MachineFunctionPass {
*TLSBaseAddrReg = RegInfo.createVirtualRegister(&AArch64::GPR64RegClass);
// Insert a copy from X0 to TLSBaseAddrReg for later.
- MachineInstr *Next = I->getNextNode();
- MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
- TII->get(TargetOpcode::COPY),
- *TLSBaseAddrReg).addReg(AArch64::X0);
+ MachineInstr *Copy =
+ BuildMI(*I->getParent(), ++I->getIterator(), I->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), *TLSBaseAddrReg)
+ .addReg(AArch64::X0);
return Copy;
}
diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp
index efdb2e33a36e..78c239b11ef3 100644
--- a/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -168,6 +168,8 @@ namespace llvm {
void initializeAArch64CollectLOHPass(PassRegistry &);
}
+#define AARCH64_COLLECT_LOH_NAME "AArch64 Collect Linker Optimization Hint (LOH)"
+
namespace {
struct AArch64CollectLOH : public MachineFunctionPass {
static char ID;
@@ -178,7 +180,7 @@ struct AArch64CollectLOH : public MachineFunctionPass {
bool runOnMachineFunction(MachineFunction &MF) override;
const char *getPassName() const override {
- return "AArch64 Collect Linker Optimization Hint (LOH)";
+ return AARCH64_COLLECT_LOH_NAME;
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -220,12 +222,10 @@ typedef SmallVector<unsigned, 32> MapIdToReg;
char AArch64CollectLOH::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64CollectLOH, "aarch64-collect-loh",
- "AArch64 Collect Linker Optimization Hint (LOH)", false,
- false)
+ AARCH64_COLLECT_LOH_NAME, false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(AArch64CollectLOH, "aarch64-collect-loh",
- "AArch64 Collect Linker Optimization Hint (LOH)", false,
- false)
+ AARCH64_COLLECT_LOH_NAME, false, false)
/// Given a couple (MBB, reg) get the corresponding set of instruction from
/// the given "sets".
@@ -353,9 +353,17 @@ static void initReachingDef(const MachineFunction &MF,
for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) {
MapRegToId::const_iterator ItRegId = RegToId.find(*AI);
- assert(ItRegId != RegToId.end() &&
- "Sub-register of an "
- "involved register, not recorded as involved!");
+ // If this alias has not been recorded, then it is not interesting
+ // for the current analysis.
+ // We can end up in this situation because of tuple registers.
+ // E.g., Let say we are interested in S1. When we register
+ // S1, we will also register its aliases and in particular
+ // the tuple Q1_Q2.
+ // Now, when we encounter Q1_Q2, we will look through its aliases
+ // and will find that S2 is not registered.
+ if (ItRegId == RegToId.end())
+ continue;
+
BBKillSet.set(ItRegId->second);
BBGen[ItRegId->second] = &MI;
}
@@ -523,6 +531,8 @@ static bool isCandidateStore(const MachineInstr *Instr) {
switch (Instr->getOpcode()) {
default:
return false;
+ case AArch64::STRBBui:
+ case AArch64::STRHHui:
case AArch64::STRBui:
case AArch64::STRHui:
case AArch64::STRWui:
@@ -884,7 +894,8 @@ static void computeOthers(const InstrToInstrs &UseToDefs,
bool IsL2Add = (ImmediateDefOpc == AArch64::ADDXri);
// If the chain is three instructions long and ldr is the second element,
// then this ldr must load from the GOT, otherwise this is not a correct chain.
- if (L2 && !IsL2Add && L2->getOperand(2).getTargetFlags() != AArch64II::MO_GOT)
+ if (L2 && !IsL2Add &&
+ !(L2->getOperand(2).getTargetFlags() & AArch64II::MO_GOT))
continue;
SmallVector<const MachineInstr *, 3> Args;
MCLOHType Kind;
@@ -1000,7 +1011,8 @@ static void collectInvolvedReg(const MachineFunction &MF, MapRegToId &RegToId,
DEBUG(dbgs() << "** Collect Involved Register\n");
for (const auto &MBB : MF) {
for (const MachineInstr &MI : MBB) {
- if (!canDefBePartOfLOH(&MI))
+ if (!canDefBePartOfLOH(&MI) &&
+ !isCandidateLoad(&MI) && !isCandidateStore(&MI))
continue;
// Process defs
diff --git a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
index b9e41c61defe..fc27bfee73d1 100644
--- a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
@@ -59,6 +59,7 @@
//===----------------------------------------------------------------------===//
#include "AArch64.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
@@ -153,13 +154,20 @@ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare(
case AArch64::SUBSXri:
// cmn is an alias for adds with a dead destination register.
case AArch64::ADDSWri:
- case AArch64::ADDSXri:
- if (MRI->use_empty(I->getOperand(0).getReg()))
- return I;
-
- DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n');
- return nullptr;
-
+ case AArch64::ADDSXri: {
+ unsigned ShiftAmt = AArch64_AM::getShiftValue(I->getOperand(3).getImm());
+ if (!I->getOperand(2).isImm()) {
+ DEBUG(dbgs() << "Immediate of cmp is symbolic, " << *I << '\n');
+ return nullptr;
+ } else if (I->getOperand(2).getImm() << ShiftAmt >= 0xfff) {
+ DEBUG(dbgs() << "Immediate of cmp may be out of range, " << *I << '\n');
+ return nullptr;
+ } else if (!MRI->use_empty(I->getOperand(0).getReg())) {
+ DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n');
+ return nullptr;
+ }
+ return I;
+ }
// Prevent false positive case like:
// cmp w19, #0
// cinc w0, w19, gt
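
The immediate guard added in this hunk keeps the pass away from compares it could not legally nudge: ADDS/SUBS (and therefore CMP/CMN) immediates are 12 bits with an optional LSL #12, and this pass later adjusts compare values by one when it rewrites conditions. A minimal standalone sketch of that headroom check follows (an editorial illustration; the helper name is invented and the +/-1 reading of the pass is this editor's assumption):

#include <cassert>
#include <cstdint>

// Reject compare immediates whose shifted value is already at or past 0xfff,
// so that a +/-1 adjustment cannot push them out of the 12-bit field.
static bool cmpImmHasHeadroom(uint64_t Imm, unsigned ShiftAmt) {
  return (Imm << ShiftAmt) < 0xfff;
}

int main() {
  assert(cmpImmHasHeadroom(0xffe, 0));   // 4094: room to become 4095
  assert(!cmpImmHasHeadroom(0xfff, 0));  // 4095: +1 would no longer encode
  assert(!cmpImmHasHeadroom(1, 12));     // any LSL #12 immediate is rejected
  return 0;
}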
diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index 2b0c92fe02d5..df1320fbd4c9 100644
--- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -353,7 +353,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
MIOperands::PhysRegInfo PRI =
MIOperands(I).analyzePhysReg(AArch64::NZCV, TRI);
- if (PRI.Reads) {
+ if (PRI.Read) {
// The ccmp doesn't produce exactly the same flags as the original
// compare, so reject the transform if there are uses of the flags
// besides the terminators.
@@ -362,7 +362,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
return nullptr;
}
- if (PRI.Clobbers) {
+ if (PRI.Defined || PRI.Clobbered) {
DEBUG(dbgs() << "Not convertible compare: " << *I);
++NumUnknNZCVDefs;
return nullptr;
@@ -567,8 +567,8 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
// All CmpBB instructions are moved into Head, and CmpBB is deleted.
// Update the CFG first.
updateTailPHIs();
- Head->removeSuccessor(CmpBB);
- CmpBB->removeSuccessor(Tail);
+ Head->removeSuccessor(CmpBB, true);
+ CmpBB->removeSuccessor(Tail, true);
Head->transferSuccessorsAndUpdatePHIs(CmpBB);
DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc();
TII->RemoveBranch(*Head);
@@ -786,13 +786,13 @@ void AArch64ConditionalCompares::updateDomTree(
// convert() removes CmpBB which was previously dominated by Head.
// CmpBB children should be transferred to Head.
MachineDomTreeNode *HeadNode = DomTree->getNode(CmpConv.Head);
- for (unsigned i = 0, e = Removed.size(); i != e; ++i) {
- MachineDomTreeNode *Node = DomTree->getNode(Removed[i]);
+ for (MachineBasicBlock *RemovedMBB : Removed) {
+ MachineDomTreeNode *Node = DomTree->getNode(RemovedMBB);
assert(Node != HeadNode && "Cannot erase the head node");
assert(Node->getIDom() == HeadNode && "CmpBB should be dominated by Head");
while (Node->getNumChildren())
DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode);
- DomTree->eraseNode(Removed[i]);
+ DomTree->eraseNode(RemovedMBB);
}
}
@@ -801,8 +801,8 @@ void
AArch64ConditionalCompares::updateLoops(ArrayRef<MachineBasicBlock *> Removed) {
if (!Loops)
return;
- for (unsigned i = 0, e = Removed.size(); i != e; ++i)
- Loops->removeBlock(Removed[i]);
+ for (MachineBasicBlock *RemovedMBB : Removed)
+ Loops->removeBlock(RemovedMBB);
}
/// Invalidate MachineTraceMetrics before if-conversion.
@@ -899,7 +899,7 @@ bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
Loops = getAnalysisIfAvailable<MachineLoopInfo>();
Traces = &getAnalysis<MachineTraceMetrics>();
MinInstr = nullptr;
- MinSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize);
+ MinSize = MF.getFunction()->optForMinSize();
bool Changed = false;
CmpConv.runOnMachineFunction(MF);
diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
index 74fc167433f6..576cf4a74167 100644
--- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
+++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -26,6 +26,12 @@ using namespace llvm;
STATISTIC(NumDeadDefsReplaced, "Number of dead definitions replaced");
+namespace llvm {
+void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry &);
+}
+
+#define AARCH64_DEAD_REG_DEF_NAME "AArch64 Dead register definitions"
+
namespace {
class AArch64DeadRegisterDefinitions : public MachineFunctionPass {
private:
@@ -35,11 +41,14 @@ private:
bool usesFrameIndex(const MachineInstr &MI);
public:
static char ID; // Pass identification, replacement for typeid.
- explicit AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) {}
+ explicit AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) {
+ initializeAArch64DeadRegisterDefinitionsPass(
+ *PassRegistry::getPassRegistry());
+ }
bool runOnMachineFunction(MachineFunction &F) override;
- const char *getPassName() const override { return "Dead register definitions"; }
+ const char *getPassName() const override { return AARCH64_DEAD_REG_DEF_NAME; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@@ -49,6 +58,9 @@ public:
char AArch64DeadRegisterDefinitions::ID = 0;
} // end anonymous namespace
+INITIALIZE_PASS(AArch64DeadRegisterDefinitions, "aarch64-dead-defs",
+ AARCH64_DEAD_REG_DEF_NAME, false, false)
+
bool AArch64DeadRegisterDefinitions::implicitlyDefinesOverlappingReg(
unsigned Reg, const MachineInstr &MI) {
for (const MachineOperand &MO : MI.implicit_operands())
diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index c2470f747a38..d24e42a93763 100644
--- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -22,18 +22,26 @@
#include "llvm/Support/MathExtras.h"
using namespace llvm;
+namespace llvm {
+void initializeAArch64ExpandPseudoPass(PassRegistry &);
+}
+
+#define AARCH64_EXPAND_PSEUDO_NAME "AArch64 pseudo instruction expansion pass"
+
namespace {
class AArch64ExpandPseudo : public MachineFunctionPass {
public:
static char ID;
- AArch64ExpandPseudo() : MachineFunctionPass(ID) {}
+ AArch64ExpandPseudo() : MachineFunctionPass(ID) {
+ initializeAArch64ExpandPseudoPass(*PassRegistry::getPassRegistry());
+ }
const AArch64InstrInfo *TII;
bool runOnMachineFunction(MachineFunction &Fn) override;
const char *getPassName() const override {
- return "AArch64 pseudo instruction expansion pass";
+ return AARCH64_EXPAND_PSEUDO_NAME;
}
private:
@@ -45,6 +53,9 @@ private:
char AArch64ExpandPseudo::ID = 0;
}
+INITIALIZE_PASS(AArch64ExpandPseudo, "aarch64-expand-pseudo",
+ AARCH64_EXPAND_PSEUDO_NAME, false, false)
+
/// \brief Transfer implicit operands on the pseudo instruction to the
/// instructions created from the expansion.
static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI,
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index 072819836bb3..0ac4b39b0357 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -523,7 +523,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)
U = C;
}
- if (const PointerType *Ty = dyn_cast<PointerType>(Obj->getType()))
+ if (auto *Ty = dyn_cast<PointerType>(Obj->getType()))
if (Ty->getAddressSpace() > 255)
// Fast instruction selection doesn't support the special
// address spaces.
@@ -969,7 +969,7 @@ bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) {
// Cannot encode an offset register and an immediate offset in the same
// instruction. Fold the immediate offset into the load/store instruction and
- // emit an additonal add to take care of the offset register.
+ // emit an additional add to take care of the offset register.
if (!ImmediateOffsetNeedsLowering && Addr.getOffset() && Addr.getOffsetReg())
RegisterOffsetNeedsLowering = true;
@@ -1058,8 +1058,8 @@ void AArch64FastISel::addLoadStoreOperands(Address &Addr,
// FIXME: We shouldn't be using getObjectSize/getObjectAlignment. The size
// and alignment should be based on the VT.
MMO = FuncInfo.MF->getMachineMemOperand(
- MachinePointerInfo::getFixedStack(FI, Offset), Flags,
- MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+ MachinePointerInfo::getFixedStack(*FuncInfo.MF, FI, Offset), Flags,
+ MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
// Now add the rest of the operands.
MIB.addFrameIndex(FI).addImm(Offset);
} else {
@@ -1178,7 +1178,7 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS,
}
// Check if the mul can be folded into the instruction.
- if (RHS->hasOneUse() && isValueAvailable(RHS))
+ if (RHS->hasOneUse() && isValueAvailable(RHS)) {
if (isMulPowOf2(RHS)) {
const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0);
const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1);
@@ -1193,12 +1193,16 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS,
if (!RHSReg)
return 0;
bool RHSIsKill = hasTrivialKill(MulLHS);
- return emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
- AArch64_AM::LSL, ShiftVal, SetFlags, WantResult);
+ ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg,
+ RHSIsKill, AArch64_AM::LSL, ShiftVal, SetFlags,
+ WantResult);
+ if (ResultReg)
+ return ResultReg;
}
+ }
// Check if the shift can be folded into the instruction.
- if (RHS->hasOneUse() && isValueAvailable(RHS))
+ if (RHS->hasOneUse() && isValueAvailable(RHS)) {
if (const auto *SI = dyn_cast<BinaryOperator>(RHS)) {
if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) {
AArch64_AM::ShiftExtendType ShiftType = AArch64_AM::InvalidShiftExtend;
@@ -1214,12 +1218,15 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS,
if (!RHSReg)
return 0;
bool RHSIsKill = hasTrivialKill(SI->getOperand(0));
- return emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg,
- RHSIsKill, ShiftType, ShiftVal, SetFlags,
- WantResult);
+ ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg,
+ RHSIsKill, ShiftType, ShiftVal, SetFlags,
+ WantResult);
+ if (ResultReg)
+ return ResultReg;
}
}
}
+ }
unsigned RHSReg = getRegForValue(RHS);
if (!RHSReg)
@@ -1323,6 +1330,10 @@ unsigned AArch64FastISel::emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg,
if (RetVT != MVT::i32 && RetVT != MVT::i64)
return 0;
+ // Don't deal with undefined shifts.
+ if (ShiftImm >= RetVT.getSizeInBits())
+ return 0;
+
static const unsigned OpcTable[2][2][2] = {
{ { AArch64::SUBWrs, AArch64::SUBXrs },
{ AArch64::ADDWrs, AArch64::ADDXrs } },
@@ -1360,6 +1371,9 @@ unsigned AArch64FastISel::emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg,
if (RetVT != MVT::i32 && RetVT != MVT::i64)
return 0;
+ if (ShiftImm >= 4)
+ return 0;
+
static const unsigned OpcTable[2][2][2] = {
{ { AArch64::SUBWrx, AArch64::SUBXrx },
{ AArch64::ADDWrx, AArch64::ADDXrx } },
@@ -1542,7 +1556,7 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
return ResultReg;
// Check if the mul can be folded into the instruction.
- if (RHS->hasOneUse() && isValueAvailable(RHS))
+ if (RHS->hasOneUse() && isValueAvailable(RHS)) {
if (isMulPowOf2(RHS)) {
const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0);
const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1);
@@ -1558,12 +1572,15 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
if (!RHSReg)
return 0;
bool RHSIsKill = hasTrivialKill(MulLHS);
- return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg,
- RHSIsKill, ShiftVal);
+ ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg,
+ RHSIsKill, ShiftVal);
+ if (ResultReg)
+ return ResultReg;
}
+ }
// Check if the shift can be folded into the instruction.
- if (RHS->hasOneUse() && isValueAvailable(RHS))
+ if (RHS->hasOneUse() && isValueAvailable(RHS)) {
if (const auto *SI = dyn_cast<ShlOperator>(RHS))
if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) {
uint64_t ShiftVal = C->getZExtValue();
@@ -1571,9 +1588,12 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
if (!RHSReg)
return 0;
bool RHSIsKill = hasTrivialKill(SI->getOperand(0));
- return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg,
- RHSIsKill, ShiftVal);
+ ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg,
+ RHSIsKill, ShiftVal);
+ if (ResultReg)
+ return ResultReg;
}
+ }
unsigned RHSReg = getRegForValue(RHS);
if (!RHSReg)
@@ -1646,6 +1666,11 @@ unsigned AArch64FastISel::emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT,
{ AArch64::ORRWrs, AArch64::ORRXrs },
{ AArch64::EORWrs, AArch64::EORXrs }
};
+
+ // Don't deal with undefined shifts.
+ if (ShiftImm >= RetVT.getSizeInBits())
+ return 0;
+
const TargetRegisterClass *RC;
unsigned Opc;
switch (RetVT.SimpleTy) {
@@ -2235,14 +2260,7 @@ bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) {
MIB.addImm(TestBit);
MIB.addMBB(TBB);
- // Obtain the branch weight and add the TrueBB to the successor list.
- uint32_t BranchWeight = 0;
- if (FuncInfo.BPI)
- BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
- TBB->getBasicBlock());
- FuncInfo.MBB->addSuccessor(TBB, BranchWeight);
- fastEmitBranch(FBB, DbgLoc);
-
+ finishCondBranch(BI->getParent(), TBB, FBB);
return true;
}
@@ -2257,7 +2275,6 @@ bool AArch64FastISel::selectBranch(const Instruction *I) {
MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
- AArch64CC::CondCode CC = AArch64CC::NE;
if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
if (CI->hasOneUse() && isValueAvailable(CI)) {
// Try to optimize or fold the cmp.
@@ -2289,7 +2306,7 @@ bool AArch64FastISel::selectBranch(const Instruction *I) {
// FCMP_UEQ and FCMP_ONE cannot be checked with a single branch
// instruction.
- CC = getCompareCC(Predicate);
+ AArch64CC::CondCode CC = getCompareCC(Predicate);
AArch64CC::CondCode ExtraCC = AArch64CC::AL;
switch (Predicate) {
default:
@@ -2317,52 +2334,7 @@ bool AArch64FastISel::selectBranch(const Instruction *I) {
.addImm(CC)
.addMBB(TBB);
- // Obtain the branch weight and add the TrueBB to the successor list.
- uint32_t BranchWeight = 0;
- if (FuncInfo.BPI)
- BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
- TBB->getBasicBlock());
- FuncInfo.MBB->addSuccessor(TBB, BranchWeight);
-
- fastEmitBranch(FBB, DbgLoc);
- return true;
- }
- } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
- MVT SrcVT;
- if (TI->hasOneUse() && isValueAvailable(TI) &&
- isTypeSupported(TI->getOperand(0)->getType(), SrcVT)) {
- unsigned CondReg = getRegForValue(TI->getOperand(0));
- if (!CondReg)
- return false;
- bool CondIsKill = hasTrivialKill(TI->getOperand(0));
-
- // Issue an extract_subreg to get the lower 32-bits.
- if (SrcVT == MVT::i64) {
- CondReg = fastEmitInst_extractsubreg(MVT::i32, CondReg, CondIsKill,
- AArch64::sub_32);
- CondIsKill = true;
- }
-
- unsigned ANDReg = emitAnd_ri(MVT::i32, CondReg, CondIsKill, 1);
- assert(ANDReg && "Unexpected AND instruction emission failure.");
- emitICmp_ri(MVT::i32, ANDReg, /*IsKill=*/true, 0);
-
- if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
- std::swap(TBB, FBB);
- CC = AArch64CC::EQ;
- }
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
- .addImm(CC)
- .addMBB(TBB);
-
- // Obtain the branch weight and add the TrueBB to the successor list.
- uint32_t BranchWeight = 0;
- if (FuncInfo.BPI)
- BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
- TBB->getBasicBlock());
- FuncInfo.MBB->addSuccessor(TBB, BranchWeight);
-
- fastEmitBranch(FBB, DbgLoc);
+ finishCondBranch(BI->getParent(), TBB, FBB);
return true;
}
} else if (const auto *CI = dyn_cast<ConstantInt>(BI->getCondition())) {
@@ -2371,34 +2343,31 @@ bool AArch64FastISel::selectBranch(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::B))
.addMBB(Target);
- // Obtain the branch weight and add the target to the successor list.
- uint32_t BranchWeight = 0;
- if (FuncInfo.BPI)
- BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
- Target->getBasicBlock());
- FuncInfo.MBB->addSuccessor(Target, BranchWeight);
+ // Obtain the branch probability and add the target to the successor list.
+ if (FuncInfo.BPI) {
+ auto BranchProbability = FuncInfo.BPI->getEdgeProbability(
+ BI->getParent(), Target->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(Target, BranchProbability);
+ } else
+ FuncInfo.MBB->addSuccessorWithoutProb(Target);
return true;
- } else if (foldXALUIntrinsic(CC, I, BI->getCondition())) {
- // Fake request the condition, otherwise the intrinsic might be completely
- // optimized away.
- unsigned CondReg = getRegForValue(BI->getCondition());
- if (!CondReg)
- return false;
-
- // Emit the branch.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
- .addImm(CC)
- .addMBB(TBB);
+ } else {
+ AArch64CC::CondCode CC = AArch64CC::NE;
+ if (foldXALUIntrinsic(CC, I, BI->getCondition())) {
+ // Fake request the condition, otherwise the intrinsic might be completely
+ // optimized away.
+ unsigned CondReg = getRegForValue(BI->getCondition());
+ if (!CondReg)
+ return false;
- // Obtain the branch weight and add the TrueBB to the successor list.
- uint32_t BranchWeight = 0;
- if (FuncInfo.BPI)
- BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
- TBB->getBasicBlock());
- FuncInfo.MBB->addSuccessor(TBB, BranchWeight);
+ // Emit the branch.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
+ .addImm(CC)
+ .addMBB(TBB);
- fastEmitBranch(FBB, DbgLoc);
- return true;
+ finishCondBranch(BI->getParent(), TBB, FBB);
+ return true;
+ }
}
unsigned CondReg = getRegForValue(BI->getCondition());
@@ -2406,32 +2375,22 @@ bool AArch64FastISel::selectBranch(const Instruction *I) {
return false;
bool CondRegIsKill = hasTrivialKill(BI->getCondition());
- // We've been divorced from our compare! Our block was split, and
- // now our compare lives in a predecessor block. We musn't
- // re-compare here, as the children of the compare aren't guaranteed
- // live across the block boundary (we *could* check for this).
- // Regardless, the compare has been done in the predecessor block,
- // and it left a value for us in a virtual register. Ergo, we test
- // the one-bit value left in the virtual register.
- emitICmp_ri(MVT::i32, CondReg, CondRegIsKill, 0);
-
+ // i1 conditions come as i32 values, test the lowest bit with tb(n)z.
+ unsigned Opcode = AArch64::TBNZW;
if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
std::swap(TBB, FBB);
- CC = AArch64CC::EQ;
+ Opcode = AArch64::TBZW;
}
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
- .addImm(CC)
+ const MCInstrDesc &II = TII.get(Opcode);
+ unsigned ConstrainedCondReg
+ = constrainOperandRegClass(II, CondReg, II.getNumDefs());
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+ .addReg(ConstrainedCondReg, getKillRegState(CondRegIsKill))
+ .addImm(0)
.addMBB(TBB);
- // Obtain the branch weight and add the TrueBB to the successor list.
- uint32_t BranchWeight = 0;
- if (FuncInfo.BPI)
- BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
- TBB->getBasicBlock());
- FuncInfo.MBB->addSuccessor(TBB, BranchWeight);
-
- fastEmitBranch(FBB, DbgLoc);
+ finishCondBranch(BI->getParent(), TBB, FBB);
return true;
}
@@ -2447,8 +2406,8 @@ bool AArch64FastISel::selectIndirectBr(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(AddrReg);
// Make sure the CFG is up-to-date.
- for (unsigned i = 0, e = BI->getNumSuccessors(); i != e; ++i)
- FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[BI->getSuccessor(i)]);
+ for (auto *Succ : BI->successors())
+ FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[Succ]);
return true;
}
@@ -2456,6 +2415,10 @@ bool AArch64FastISel::selectIndirectBr(const Instruction *I) {
bool AArch64FastISel::selectCmp(const Instruction *I) {
const CmpInst *CI = cast<CmpInst>(I);
+ // Vectors of i1 are weird: bail out.
+ if (CI->getType()->isVectorTy())
+ return false;
+
// Try to optimize or fold the cmp.
CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
unsigned ResultReg = 0;
@@ -2954,8 +2917,7 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI,
.addImm(NumBytes);
// Process the args.
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
- CCValAssign &VA = ArgLocs[i];
+ for (CCValAssign &VA : ArgLocs) {
const Value *ArgVal = CLI.OutVals[VA.getValNo()];
MVT ArgVT = OutVTs[VA.getValNo()];
@@ -3018,8 +2980,8 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI,
unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
- MachinePointerInfo::getStack(Addr.getOffset()),
- MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
+ MachinePointerInfo::getStack(*FuncInfo.MF, Addr.getOffset()),
+ MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
if (!emitStore(ArgVT, ArgReg, Addr, MMO))
return false;
@@ -3318,8 +3280,8 @@ bool AArch64FastISel::foldXALUIntrinsic(AArch64CC::CondCode &CC,
return false;
// Make sure nothing is in the way
- BasicBlock::const_iterator Start = I;
- BasicBlock::const_iterator End = II;
+ BasicBlock::const_iterator Start(I);
+ BasicBlock::const_iterator End(II);
for (auto Itr = std::prev(Start); Itr != End; --Itr) {
// We only expect extractvalue instructions between the intrinsic and the
// instruction to be selected.
@@ -3684,6 +3646,9 @@ bool AArch64FastISel::selectRet(const Instruction *I) {
if (F.isVarArg())
return false;
+ if (TLI.supportSplitCSR(FuncInfo.MF))
+ return false;
+
// Build a list of return value registers.
SmallVector<unsigned, 4> RetRegs;
@@ -3763,8 +3728,8 @@ bool AArch64FastISel::selectRet(const Instruction *I) {
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AArch64::RET_ReallyLR));
- for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
- MIB.addReg(RetRegs[i], RegState::Implicit);
+ for (unsigned RetReg : RetRegs)
+ MIB.addReg(RetReg, RegState::Implicit);
return true;
}
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index a76473f7e539..11ae8005370d 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -72,9 +72,9 @@
//
// For most functions, some of the frame areas are empty. For those functions,
// it may not be necessary to set up fp or bp:
-// * A base pointer is definitly needed when there are both VLAs and local
+// * A base pointer is definitely needed when there are both VLAs and local
// variables with more-than-default alignment requirements.
-// * A frame pointer is definitly needed when there are local variables with
+// * A frame pointer is definitely needed when there are local variables with
// more-than-default alignment requirements.
//
// In some cases when a base pointer is not strictly needed, it is generated
@@ -216,11 +216,11 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves(
if (CSI.empty())
return;
- const DataLayout *TD = MF.getTarget().getDataLayout();
+ const DataLayout &TD = MF.getDataLayout();
bool HasFP = hasFP(MF);
// Calculate the number of bytes used for storing the return address.
- int stackGrowth = -TD->getPointerSize(0);
+ int stackGrowth = -TD.getPointerSize(0);
// Calculate offsets.
int64_t saveAreaOffset = (HasFP ? 2 : 1) * stackGrowth;
@@ -280,14 +280,17 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock::iterator MBBI = MBB.begin();
const MachineFrameInfo *MFI = MF.getFrameInfo();
const Function *Fn = MF.getFunction();
- const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
- MF.getSubtarget().getRegisterInfo());
- const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineModuleInfo &MMI = MF.getMMI();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry();
bool HasFP = hasFP(MF);
- DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+ // Debug location must be unknown since the first debug location is used
+ // to determine the end of the prologue.
+ DebugLoc DL;
// All calls are tail calls in GHC calling conv, and functions have no
// prologue/epilogue.
@@ -354,7 +357,6 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
if (NumBytes && NeedsRealignment) {
// Use the first callee-saved register as a scratch register.
scratchSPReg = AArch64::X9;
- MF.getRegInfo().setPhysRegUsed(scratchSPReg);
}
// If we're a leaf function, try using the red zone.
@@ -400,8 +402,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
}
if (needsFrameMoves) {
- const DataLayout *TD = MF.getTarget().getDataLayout();
- const int StackGrowth = -TD->getPointerSize(0);
+ const DataLayout &TD = MF.getDataLayout();
+ const int StackGrowth = -TD.getPointerSize(0);
unsigned FramePtr = RegInfo->getFrameRegister(MF);
// An example of the prologue:
//
@@ -513,33 +515,33 @@ static bool isCalleeSavedRegister(unsigned Reg, const MCPhysReg *CSRegs) {
return false;
}
-static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) {
+/// Checks whether the given instruction restores callee save registers
+/// and if so returns how many.
+static unsigned getNumCSRestores(MachineInstr &MI, const MCPhysReg *CSRegs) {
unsigned RtIdx = 0;
- if (MI->getOpcode() == AArch64::LDPXpost ||
- MI->getOpcode() == AArch64::LDPDpost)
+ switch (MI.getOpcode()) {
+ case AArch64::LDPXpost:
+ case AArch64::LDPDpost:
RtIdx = 1;
-
- if (MI->getOpcode() == AArch64::LDPXpost ||
- MI->getOpcode() == AArch64::LDPDpost ||
- MI->getOpcode() == AArch64::LDPXi || MI->getOpcode() == AArch64::LDPDi) {
- if (!isCalleeSavedRegister(MI->getOperand(RtIdx).getReg(), CSRegs) ||
- !isCalleeSavedRegister(MI->getOperand(RtIdx + 1).getReg(), CSRegs) ||
- MI->getOperand(RtIdx + 2).getReg() != AArch64::SP)
- return false;
- return true;
+ // FALLTHROUGH
+ case AArch64::LDPXi:
+ case AArch64::LDPDi:
+ if (!isCalleeSavedRegister(MI.getOperand(RtIdx).getReg(), CSRegs) ||
+ !isCalleeSavedRegister(MI.getOperand(RtIdx + 1).getReg(), CSRegs) ||
+ MI.getOperand(RtIdx + 2).getReg() != AArch64::SP)
+ return 0;
+ return 2;
}
-
- return false;
+ return 0;
}
void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
MachineFrameInfo *MFI = MF.getFrameInfo();
- const AArch64InstrInfo *TII =
- static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
- const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
- MF.getSubtarget().getRegisterInfo());
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL;
bool IsTailCallReturn = false;
if (MBB.end() != MBBI) {
@@ -585,7 +587,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// ---------------------| --- |
// | | | |
// | CalleeSavedReg | | |
- // | (NumRestores * 16) | | |
+ // | (NumRestores * 8) | | |
// | | | |
// ---------------------| | NumBytes
// | | StackSize (StackAdjustUp)
@@ -606,17 +608,17 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// Move past the restores of the callee-saved registers.
MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
- if (LastPopI != MBB.begin()) {
- do {
- ++NumRestores;
- --LastPopI;
- } while (LastPopI != MBB.begin() && isCSRestore(LastPopI, CSRegs));
- if (!isCSRestore(LastPopI, CSRegs)) {
+ MachineBasicBlock::iterator Begin = MBB.begin();
+ while (LastPopI != Begin) {
+ --LastPopI;
+ unsigned Restores = getNumCSRestores(*LastPopI, CSRegs);
+ NumRestores += Restores;
+ if (Restores == 0) {
++LastPopI;
- --NumRestores;
+ break;
}
}
- NumBytes -= NumRestores * 16;
+ NumBytes -= NumRestores * 8;
assert(NumBytes >= 0 && "Negative stack allocation size!?");
if (!hasFP(MF)) {
@@ -634,15 +636,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// be able to save any instructions.
if (NumBytes || MFI->hasVarSizedObjects())
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
- -(NumRestores - 1) * 16, TII, MachineInstr::NoFlags);
-}
-
-/// getFrameIndexOffset - Returns the displacement from the frame register to
-/// the stack frame of the specified index.
-int AArch64FrameLowering::getFrameIndexOffset(const MachineFunction &MF,
- int FI) const {
- unsigned FrameReg;
- return getFrameIndexReference(MF, FI, FrameReg);
+ -(NumRestores - 2) * 8, TII, MachineInstr::NoFlags);
}
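
The restore bookkeeping above changes units, not amounts: the old code counted one restore per LDP and subtracted 16 bytes each, the new code counts one per restored register and subtracts 8, and the FP-relative offset is rewritten to match. A small sketch of that equivalence, under this editor's reading of the two hunks:

#include <cassert>

int main() {
  // N = number of callee-saved LDP instructions in the epilogue.
  for (int N = 1; N <= 8; ++N) {
    int OldRestores = N;      // old: counted per LDP instruction
    int NewRestores = 2 * N;  // new: counted per restored register
    // Bytes popped off the callee-save area are unchanged.
    assert(OldRestores * 16 == NewRestores * 8);
    // So is the FP-relative offset used when resetting SP from FP.
    assert(-(OldRestores - 1) * 16 == -(NewRestores - 2) * 8);
  }
  return 0;
}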
/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
@@ -739,9 +733,6 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
DebugLoc DL;
assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
- if (MI != MBB.end())
- DL = MI->getDebugLoc();
-
for (unsigned i = 0; i < Count; i += 2) {
unsigned idx = Count - i - 2;
unsigned Reg1 = CSI[idx].getReg();
@@ -911,7 +902,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
unsigned NumFPRSpilled = 0;
bool ExtraCSSpill = false;
bool CanEliminateFrame = true;
- DEBUG(dbgs() << "*** processFunctionBeforeCalleeSavedScan\nUsed CSRs:");
+ DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:");
const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
// Check pairs of consecutive callee-saved registers.
diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h
index 731f031ff855..427afdf4acbf 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/lib/Target/AArch64/AArch64FrameLowering.h
@@ -37,7 +37,6 @@ public:
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
- int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
int getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const override;
int resolveFrameIndexReference(const MachineFunction &MF, int FI,
@@ -61,6 +60,11 @@ public:
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS) const override;
+
+ /// Returns true if the target will correctly handle shrink wrapping.
+ bool enableShrinkWrapping(const MachineFunction &MF) const override {
+ return true;
+ }
};
} // End llvm namespace
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 772e894f4f0a..6c868880bcac 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -34,7 +34,6 @@ using namespace llvm;
namespace {
class AArch64DAGToDAGISel : public SelectionDAGISel {
- AArch64TargetMachine &TM;
/// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
/// make the right decision when generating code for different targets.
@@ -45,7 +44,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
public:
explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm,
CodeGenOpt::Level OptLevel)
- : SelectionDAGISel(tm, OptLevel), TM(tm), Subtarget(nullptr),
+ : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
ForCodeSize(false) {}
const char *getPassName() const override {
@@ -53,9 +52,7 @@ public:
}
bool runOnMachineFunction(MachineFunction &MF) override {
- ForCodeSize =
- MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) ||
- MF.getFunction()->hasFnAttribute(Attribute::MinSize);
+ ForCodeSize = MF.getFunction()->optForSize();
Subtarget = &MF.getSubtarget<AArch64Subtarget>();
return SelectionDAGISel::runOnMachineFunction(MF);
}
@@ -79,6 +76,21 @@ public:
bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
return SelectShiftedRegister(N, true, Reg, Shift);
}
+ bool SelectAddrModeIndexed7S8(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed7S(N, 1, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed7S16(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed7S(N, 2, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed7S32(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed7S(N, 4, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed7S64(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed7S(N, 8, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed7S128(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed7S(N, 16, Base, OffImm);
+ }
bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed(N, 1, Base, OffImm);
}
@@ -153,8 +165,7 @@ public:
SDNode *SelectBitfieldExtractOp(SDNode *N);
SDNode *SelectBitfieldInsertOp(SDNode *N);
-
- SDNode *SelectLIBM(SDNode *N);
+ SDNode *SelectBitfieldInsertInZeroOp(SDNode *N);
SDNode *SelectReadRegister(SDNode *N);
SDNode *SelectWriteRegister(SDNode *N);
@@ -165,6 +176,8 @@ public:
private:
bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg,
SDValue &Shift);
+ bool SelectAddrModeIndexed7S(SDValue N, unsigned Size, SDValue &Base,
+ SDValue &OffImm);
bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base,
SDValue &OffImm);
bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base,
@@ -422,7 +435,7 @@ static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) {
return true;
}
-// Helper for SelectOpcV64LaneV128 - Recogzine operatinos where one operand is a
+// Helper for SelectOpcV64LaneV128 - Recognize operations where one operand is a
// high lane extract.
static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp,
SDValue &LaneOp, int &LaneIdx) {
@@ -572,7 +585,7 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
}
// AArch64 mandates that the RHS of the operation must use the smallest
- // register classs that could contain the size being extended from. Thus,
+ // register class that could contain the size being extended from. Thus,
// if we're folding a (sext i8), we need the RHS to be a GPR32, even though
// there might not be an actual 32-bit value in the program. We can
// (harmlessly) synthesize one by injecting an EXTRACT_SUBREG here.
@@ -587,7 +600,7 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
/// need to create a real ADD instruction from it anyway and there's no point in
/// folding it into the mem op. Theoretically, it shouldn't matter, but there's
/// a single pseudo-instruction for an ADRP/ADD pair so over-aggressive folding
-/// leads to duplaicated ADRP instructions.
+/// leads to duplicated ADRP instructions.
static bool isWorthFoldingADDlow(SDValue N) {
for (auto Use : N->uses()) {
if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE &&
@@ -604,6 +617,51 @@ static bool isWorthFoldingADDlow(SDValue N) {
return true;
}
+/// SelectAddrModeIndexed7S - Select a "register plus scaled signed 7-bit
+/// immediate" address. The "Size" argument is the size in bytes of the memory
+/// reference, which determines the scale.
+bool AArch64DAGToDAGISel::SelectAddrModeIndexed7S(SDValue N, unsigned Size,
+ SDValue &Base,
+ SDValue &OffImm) {
+ SDLoc dl(N);
+ const DataLayout &DL = CurDAG->getDataLayout();
+ const TargetLowering *TLI = getTargetLowering();
+ if (N.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+ OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
+ return true;
+ }
+
+ // As opposed to the (12-bit) Indexed addressing mode below, the 7-bit signed
+ // mode selected here doesn't support labels/immediates, only base+offset.
+
+ if (CurDAG->isBaseWithConstantOffset(N)) {
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ int64_t RHSC = RHS->getSExtValue();
+ unsigned Scale = Log2_32(Size);
+ if ((RHSC & (Size - 1)) == 0 && RHSC >= -(0x40 << Scale) &&
+ RHSC < (0x40 << Scale)) {
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+ }
+ OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
+ return true;
+ }
+ }
+ }
+
+ // Base only. The address will be materialized into a register before
+ // the memory is accessed.
+ // add x0, Xbase, #offset
+ // stp x1, x2, [x0]
+ Base = N;
+ OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
+ return true;
+}
+
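
The new addressing-mode selector above accepts the register-pair form of offset: a signed 7-bit immediate scaled by the access size, which is what LDP/STP encode. As a hedged standalone sketch of the same range test (editorial illustration; the helper and the checks in main() are not from this patch):

#include <cassert>
#include <cstdint>

// True if Offset is a multiple of Size within the scaled signed 7-bit
// window [-64 * Size, 63 * Size], mirroring the RHSC test above.
static bool isScaledSigned7BitOffset(int64_t Offset, unsigned Size) {
  unsigned Scale = 0;
  while ((1u << Scale) < Size)
    ++Scale;                                  // Scale = log2(Size)
  if (Offset & (int64_t(Size) - 1))           // must be Size-aligned
    return false;
  return Offset >= -(int64_t(0x40) << Scale) &&
         Offset < (int64_t(0x40) << Scale);
}

int main() {
  assert(isScaledSigned7BitOffset(504, 8));    // stp x1, x2, [x0, #504]
  assert(!isScaledSigned7BitOffset(512, 8));   // one slot past 63 * 8
  assert(isScaledSigned7BitOffset(-512, 8));   // -64 * 8
  assert(!isScaledSigned7BitOffset(4, 8));     // not 8-byte aligned
  return 0;
}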
/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
/// immediate" address. The "Size" argument is the size in bytes of the memory
/// reference, which determines the scale.
@@ -867,7 +925,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
if (isa<ConstantSDNode>(RHS)) {
int64_t ImmOff = (int64_t)cast<ConstantSDNode>(RHS)->getZExtValue();
unsigned Scale = Log2_32(Size);
- // Skip the immediate can be seleced by load/store addressing mode.
+ // Skip the immediate that can be selected by a load/store addressing mode.
// Also skip the immediate that can be encoded by a single ADD (SUB is also
// checked by using -ImmOff).
if ((ImmOff % Size == 0 && ImmOff >= 0 && ImmOff < (0x1000 << Scale)) ||
@@ -1034,6 +1092,8 @@ SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) {
// it into an i64.
DstVT = MVT::i32;
}
+ } else if (VT == MVT::f16) {
+ Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
} else if (VT == MVT::f32) {
Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost;
} else if (VT == MVT::f64 || VT.is64BitVector()) {
@@ -1222,8 +1282,8 @@ SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
SDValue SuperReg = SDValue(Ld, 0);
EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
- static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2,
- AArch64::qsub3 };
+ static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1,
+ AArch64::qsub2, AArch64::qsub3 };
for (unsigned i = 0; i < NumVecs; ++i) {
SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg);
if (Narrow)
@@ -1275,8 +1335,8 @@ SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs,
Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg);
} else {
EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
- static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2,
- AArch64::qsub3 };
+ static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1,
+ AArch64::qsub2, AArch64::qsub3 };
for (unsigned i = 0; i < NumVecs; ++i) {
SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT,
SuperReg);
@@ -1420,7 +1480,7 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
// The resulting code will be at least as good as the original one
// plus it may expose more opportunities for bitfield insert pattern.
// FIXME: Currently we limit this to the bigger pattern, because
- // some optimizations expect AND and not UBFM
+ // some optimizations expect AND and not UBFM.
Opd0 = N->getOperand(0);
} else
return false;
@@ -1852,6 +1912,7 @@ static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) {
/// Does this tree qualify as an attempt to move a bitfield into position,
/// essentially "(and (shl VAL, N), Mask)".
static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
+ bool BiggerPattern,
SDValue &Src, int &ShiftAmount,
int &MaskWidth) {
EVT VT = Op.getValueType();
@@ -1874,6 +1935,11 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
Op = Op.getOperand(0);
}
+ // Don't match if the SHL has more than one use, since then we'll end up
+ // generating SHL+UBFIZ instead of just keeping SHL+AND.
+ if (!BiggerPattern && !Op.hasOneUse())
+ return false;
+
uint64_t ShlImm;
if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm))
return false;
@@ -1887,7 +1953,11 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
// BFI encompasses sufficiently many nodes that it's worth inserting an extra
// LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL
- // amount.
+ // amount. BiggerPattern is true when this pattern is being matched for BFI,
+ // BiggerPattern is false when this pattern is being matched for UBFIZ, in
+ // which case it is not profitable to insert an extra shift.
+ if (ShlImm - ShiftAmount != 0 && !BiggerPattern)
+ return false;
Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount);
return true;
@@ -1904,7 +1974,8 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
// f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2
static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
SDValue &Src, unsigned &ImmR,
- unsigned &ImmS, SelectionDAG *CurDAG) {
+ unsigned &ImmS, const APInt &UsefulBits,
+ SelectionDAG *CurDAG) {
assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
// Set Opc
@@ -1918,23 +1989,30 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
// Because of simplify-demanded-bits in DAGCombine, involved masks may not
// have the expected shape. Try to undo that.
- APInt UsefulBits;
- getUsefulBits(SDValue(N, 0), UsefulBits);
unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros();
unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros();
- // OR is commutative, check both possibilities (does llvm provide a
- // way to do that directely, e.g., via code matcher?)
- SDValue OrOpd1Val = N->getOperand(1);
- SDNode *OrOpd0 = N->getOperand(0).getNode();
- SDNode *OrOpd1 = N->getOperand(1).getNode();
- for (int i = 0; i < 2;
- ++i, std::swap(OrOpd0, OrOpd1), OrOpd1Val = N->getOperand(0)) {
+ // OR is commutative, check all combinations of operand order and values of
+ // BiggerPattern, i.e.
+ // Opd0, Opd1, BiggerPattern=false
+ // Opd1, Opd0, BiggerPattern=false
+ // Opd0, Opd1, BiggerPattern=true
+ // Opd1, Opd0, BiggerPattern=true
+ // Several of these combinations may match, so check with BiggerPattern=false
+ // first since that will produce better results by matching more instructions
+ // and/or inserting fewer extra instructions.
+ for (int I = 0; I < 4; ++I) {
+
+ bool BiggerPattern = I / 2;
+ SDNode *OrOpd0 = N->getOperand(I % 2).getNode();
+ SDValue OrOpd1Val = N->getOperand((I + 1) % 2);
+ SDNode *OrOpd1 = OrOpd1Val.getNode();
+
unsigned BFXOpc;
int DstLSB, Width;
if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS,
- NumberOfIgnoredLowBits, true)) {
+ NumberOfIgnoredLowBits, BiggerPattern)) {
// Check that the returned opcode is compatible with the pattern,
// i.e., same type and zero extended (U and not S)
if ((BFXOpc != AArch64::UBFMXri && VT == MVT::i64) ||
@@ -1952,8 +2030,9 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
// If the mask on the insertee is correct, we have a BFXIL operation. We
// can share the ImmR and ImmS values from the already-computed UBFM.
- } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), Src,
- DstLSB, Width)) {
+ } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0),
+ BiggerPattern,
+ Src, DstLSB, Width)) {
ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
ImmS = Width - 1;
} else
@@ -2003,11 +2082,18 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) {
unsigned Opc;
unsigned LSB, MSB;
SDValue Opd0, Opd1;
+ EVT VT = N->getValueType(0);
+ APInt NUsefulBits;
+ getUsefulBits(SDValue(N, 0), NUsefulBits);
+
+ // If none of the bits are useful, just return UNDEF.
+ if (!NUsefulBits)
+ return CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, VT);
- if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, CurDAG))
+ if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, NUsefulBits,
+ CurDAG))
return nullptr;
- EVT VT = N->getValueType(0);
SDLoc dl(N);
SDValue Ops[] = { Opd0,
Opd1,
@@ -2016,58 +2102,37 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) {
return CurDAG->SelectNodeTo(N, Opc, VT, Ops);
}
-SDNode *AArch64DAGToDAGISel::SelectLIBM(SDNode *N) {
+/// SelectBitfieldInsertInZeroOp - Match a UBFIZ instruction that is the
+/// equivalent of a left shift by a constant amount followed by an and masking
+/// out a contiguous set of bits.
+SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertInZeroOp(SDNode *N) {
+ if (N->getOpcode() != ISD::AND)
+ return nullptr;
+
EVT VT = N->getValueType(0);
- unsigned Variant;
unsigned Opc;
- unsigned FRINTXOpcs[] = { AArch64::FRINTXSr, AArch64::FRINTXDr };
-
- if (VT == MVT::f32) {
- Variant = 0;
- } else if (VT == MVT::f64) {
- Variant = 1;
- } else
- return nullptr; // Unrecognized argument type. Fall back on default codegen.
-
- // Pick the FRINTX variant needed to set the flags.
- unsigned FRINTXOpc = FRINTXOpcs[Variant];
-
- switch (N->getOpcode()) {
- default:
- return nullptr; // Unrecognized libm ISD node. Fall back on default codegen.
- case ISD::FCEIL: {
- unsigned FRINTPOpcs[] = { AArch64::FRINTPSr, AArch64::FRINTPDr };
- Opc = FRINTPOpcs[Variant];
- break;
- }
- case ISD::FFLOOR: {
- unsigned FRINTMOpcs[] = { AArch64::FRINTMSr, AArch64::FRINTMDr };
- Opc = FRINTMOpcs[Variant];
- break;
- }
- case ISD::FTRUNC: {
- unsigned FRINTZOpcs[] = { AArch64::FRINTZSr, AArch64::FRINTZDr };
- Opc = FRINTZOpcs[Variant];
- break;
- }
- case ISD::FROUND: {
- unsigned FRINTAOpcs[] = { AArch64::FRINTASr, AArch64::FRINTADr };
- Opc = FRINTAOpcs[Variant];
- break;
- }
- }
+ if (VT == MVT::i32)
+ Opc = AArch64::UBFMWri;
+ else if (VT == MVT::i64)
+ Opc = AArch64::UBFMXri;
+ else
+ return nullptr;
- SDLoc dl(N);
- SDValue In = N->getOperand(0);
- SmallVector<SDValue, 2> Ops;
- Ops.push_back(In);
+ SDValue Op0;
+ int DstLSB, Width;
+ if (!isBitfieldPositioningOp(CurDAG, SDValue(N, 0), /*BiggerPattern=*/false,
+ Op0, DstLSB, Width))
+ return nullptr;
- if (!TM.Options.UnsafeFPMath) {
- SDNode *FRINTX = CurDAG->getMachineNode(FRINTXOpc, dl, VT, MVT::Glue, In);
- Ops.push_back(SDValue(FRINTX, 1));
- }
+ // ImmR is the rotate right amount.
+ unsigned ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
+ // ImmS is the most significant bit of the source to be moved.
+ unsigned ImmS = Width - 1;
- return CurDAG->getMachineNode(Opc, dl, VT, Ops);
+ SDLoc DL(N);
+ SDValue Ops[] = {Op0, CurDAG->getTargetConstant(ImmR, DL, VT),
+ CurDAG->getTargetConstant(ImmS, DL, VT)};
+ return CurDAG->SelectNodeTo(N, Opc, VT, Ops);
}
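
The ImmR/ImmS values computed above are the standard UBFM encoding behind the UBFIZ alias: ImmR is the right-rotate amount (RegSize - DstLSB, modulo RegSize) and ImmS marks the top source bit of the inserted field. A brief sketch of that arithmetic (editorial illustration; the struct and helper names are invented):

#include <cassert>

struct UBFMImms { unsigned ImmR, ImmS; };

// UBFIZ Rd, Rn, #DstLSB, #Width on a RegSize-bit register is an alias for
// UBFM Rd, Rn, #((RegSize - DstLSB) % RegSize), #(Width - 1).
static UBFMImms encodeUBFIZ(unsigned RegSize, unsigned DstLSB, unsigned Width) {
  return {(RegSize - DstLSB) % RegSize, Width - 1};
}

int main() {
  // (w1 << 3) & 0xf8, i.e. ubfiz w0, w1, #3, #5:
  UBFMImms I = encodeUBFIZ(32, 3, 5);
  assert(I.ImmR == 29 && I.ImmS == 4);
  // With DstLSB == 0 the rotate amount collapses to 0.
  assert(encodeUBFIZ(64, 0, 12).ImmR == 0);
  return 0;
}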
bool
@@ -2119,7 +2184,7 @@ AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
// into a single value to be used in the MRS/MSR instruction.
static int getIntOperandFromRegisterString(StringRef RegString) {
SmallVector<StringRef, 5> Fields;
- RegString.split(Fields, ":");
+ RegString.split(Fields, ':');
if (Fields.size() == 1)
return -1;
@@ -2206,7 +2271,15 @@ SDNode *AArch64DAGToDAGISel::SelectWriteRegister(SDNode *N) {
assert (isa<ConstantSDNode>(N->getOperand(2))
&& "Expected a constant integer expression.");
uint64_t Immed = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
- return CurDAG->getMachineNode(AArch64::MSRpstate, DL, MVT::Other,
+ unsigned State;
+ if (Reg == AArch64PState::PAN || Reg == AArch64PState::UAO) {
+ assert(Immed < 2 && "Bad imm");
+ State = AArch64::MSRpstateImm1;
+ } else {
+ assert(Immed < 16 && "Bad imm");
+ State = AArch64::MSRpstateImm4;
+ }
+ return CurDAG->getMachineNode(State, DL, MVT::Other,
CurDAG->getTargetConstant(Reg, DL, MVT::i32),
CurDAG->getTargetConstant(Immed, DL, MVT::i16),
N->getOperand(0));
@@ -2279,6 +2352,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
case ISD::SRA:
if (SDNode *I = SelectBitfieldExtractOp(Node))
return I;
+ if (SDNode *I = SelectBitfieldInsertInZeroOp(Node))
+ return I;
break;
case ISD::OR:
@@ -2802,6 +2877,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
}
}
+ break;
}
case AArch64ISD::LD2post: {
if (VT == MVT::v8i8)
@@ -3214,14 +3290,6 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST);
break;
}
-
- case ISD::FCEIL:
- case ISD::FFLOOR:
- case ISD::FTRUNC:
- case ISD::FROUND:
- if (SDNode *I = SelectLIBM(Node))
- return I;
- break;
}
// Select the default instruction
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3e8f46cf1ecd..9f5beff12100 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -40,23 +40,6 @@ using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
-namespace {
-enum AlignMode {
- StrictAlign,
- NoStrictAlign
-};
-}
-
-static cl::opt<AlignMode>
-Align(cl::desc("Load/store alignment support"),
- cl::Hidden, cl::init(NoStrictAlign),
- cl::values(
- clEnumValN(StrictAlign, "aarch64-strict-align",
- "Disallow all unaligned memory accesses"),
- clEnumValN(NoStrictAlign, "aarch64-no-strict-align",
- "Allow unaligned memory accesses"),
- clEnumValEnd));
-
// Place holder until extr generation is tested fully.
static cl::opt<bool>
EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden,
@@ -76,6 +59,9 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
cl::init(false));
+/// Value type used for condition codes.
+static const MVT MVT_CC = MVT::i32;
+
AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
const AArch64Subtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
@@ -210,11 +196,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
- // Exception handling.
- // FIXME: These are guesses. Has this been defined yet?
- setExceptionPointerRegister(AArch64::X0);
- setExceptionSelectorRegister(AArch64::X1);
-
// Constant pool entries
setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
@@ -234,6 +215,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// AArch64 lacks both left-rotate and popcount instructions.
setOperationAction(ISD::ROTL, MVT::i32, Expand);
setOperationAction(ISD::ROTL, MVT::i64, Expand);
+ for (MVT VT : MVT::vector_valuetypes()) {
+ setOperationAction(ISD::ROTL, VT, Expand);
+ setOperationAction(ISD::ROTR, VT, Expand);
+ }
// AArch64 doesn't have {U|S}MUL_LOHI.
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
@@ -252,6 +237,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+ for (MVT VT : MVT::vector_valuetypes()) {
+ setOperationAction(ISD::SDIVREM, VT, Expand);
+ setOperationAction(ISD::UDIVREM, VT, Expand);
+ }
setOperationAction(ISD::SREM, MVT::i32, Expand);
setOperationAction(ISD::SREM, MVT::i64, Expand);
setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
@@ -315,6 +304,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
+ setOperationAction(ISD::FMINNAN, MVT::f16, Promote);
+ setOperationAction(ISD::FMAXNAN, MVT::f16, Promote);
// v4f16 is also a storage-only type, so promote it to v4f32 when that is
// known to be safe.
@@ -403,10 +394,19 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FRINT, Ty, Legal);
setOperationAction(ISD::FTRUNC, Ty, Legal);
setOperationAction(ISD::FROUND, Ty, Legal);
+ setOperationAction(ISD::FMINNUM, Ty, Legal);
+ setOperationAction(ISD::FMAXNUM, Ty, Legal);
+ setOperationAction(ISD::FMINNAN, Ty, Legal);
+ setOperationAction(ISD::FMAXNAN, Ty, Legal);
}
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+ // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
+ // This requires the Performance Monitors extension.
+ if (Subtarget->hasPerfMon())
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
+
if (Subtarget->isTargetMachO()) {
// For iOS, we don't want the normal expansion of a libcall to
// sincos. We want to issue a libcall to __sincos_stret to avoid memory
@@ -456,12 +456,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setIndexedLoadAction(im, MVT::i64, Legal);
setIndexedLoadAction(im, MVT::f64, Legal);
setIndexedLoadAction(im, MVT::f32, Legal);
+ setIndexedLoadAction(im, MVT::f16, Legal);
setIndexedStoreAction(im, MVT::i8, Legal);
setIndexedStoreAction(im, MVT::i16, Legal);
setIndexedStoreAction(im, MVT::i32, Legal);
setIndexedStoreAction(im, MVT::i64, Legal);
setIndexedStoreAction(im, MVT::f64, Legal);
setIndexedStoreAction(im, MVT::f32, Legal);
+ setIndexedStoreAction(im, MVT::f16, Legal);
}
// Trap.
@@ -479,6 +481,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
+ setTargetDAGCombine(ISD::FP_TO_SINT);
+ setTargetDAGCombine(ISD::FP_TO_UINT);
+ setTargetDAGCombine(ISD::FDIV);
+
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::ANY_EXTEND);
@@ -487,16 +493,18 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::CONCAT_VECTORS);
setTargetDAGCombine(ISD::STORE);
+ if (Subtarget->supportsAddressTopByteIgnored())
+ setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::VSELECT);
- setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::INTRINSIC_VOID);
setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
@@ -512,10 +520,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setMinFunctionAlignment(2);
- RequireStrictAlign = (Align == StrictAlign);
-
setHasExtractBitsInsn(true);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
if (Subtarget->hasNEON()) {
// FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
// silliness like this:
@@ -646,6 +654,9 @@ void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand);
setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand);
setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand);
+
+ // But we do support custom-lowering for FCOPYSIGN.
+ setOperationAction(ISD::FCOPYSIGN, VT.getSimpleVT(), Custom);
}
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
@@ -686,6 +697,12 @@ void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
setOperationAction(Opcode, VT.getSimpleVT(), Legal);
+ // F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!).
+ if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16)
+ for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN,
+ ISD::FMINNUM, ISD::FMAXNUM})
+ setOperationAction(Opcode, VT.getSimpleVT(), Legal);
+
if (Subtarget->isLittleEndian()) {
for (unsigned im = (unsigned)ISD::PRE_INC;
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
@@ -730,7 +747,7 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
break;
}
case ISD::INTRINSIC_W_CHAIN: {
- ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
+ ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
switch (IntID) {
default: return;
@@ -780,6 +797,34 @@ MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
return MVT::i64;
}
+bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+ unsigned AddrSpace,
+ unsigned Align,
+ bool *Fast) const {
+ if (Subtarget->requiresStrictAlign())
+ return false;
+
+ // FIXME: This is mostly true for Cyclone, but not necessarily others.
+ if (Fast) {
+ // FIXME: Define an attribute for slow unaligned accesses instead of
+ // relying on the CPU type as a proxy.
+ // On Cyclone, unaligned 128-bit stores are slow.
+ *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 ||
+ // See comments in performSTORECombine() for more details about
+ // these conditions.
+
+ // Code that uses clang vector extensions can mark that it
+ // wants unaligned accesses to be treated as fast by
+ // underspecifying alignment to be 1 or 2.
+ Align <= 2 ||
+
+ // Disregard v2i64. Memcpy lowering produces those and splitting
+ // them regresses performance on micro-benchmarks and olden/bh.
+ VT == MVT::v2i64;
+ }
+ return true;
+}
+
FastISel *
AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) const {
@@ -809,9 +854,10 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
+ case AArch64ISD::CCMP: return "AArch64ISD::CCMP";
+ case AArch64ISD::CCMN: return "AArch64ISD::CCMN";
+ case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP";
case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
- case AArch64ISD::FMIN: return "AArch64ISD::FMIN";
- case AArch64ISD::FMAX: return "AArch64ISD::FMAX";
case AArch64ISD::DUP: return "AArch64ISD::DUP";
case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
@@ -931,8 +977,7 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
DebugLoc DL = MI->getDebugLoc();
- MachineFunction::iterator It = MBB;
- ++It;
+ MachineFunction::iterator It = ++MBB->getIterator();
unsigned DestReg = MI->getOperand(0).getReg();
unsigned IfTrueReg = MI->getOperand(1).getReg();
@@ -1141,8 +1186,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
// register to WZR/XZR if it ends up being unused.
unsigned Opcode = AArch64ISD::SUBS;
- if (RHS.getOpcode() == ISD::SUB && isa<ConstantSDNode>(RHS.getOperand(0)) &&
- cast<ConstantSDNode>(RHS.getOperand(0))->getZExtValue() == 0 &&
+ if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
// We'd like to combine a (CMP op1, (sub 0, op2)) into a CMN instruction on
// the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
@@ -1156,8 +1200,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
// the absence of information about op2.
Opcode = AArch64ISD::ADDS;
RHS = RHS.getOperand(1);
- } else if (LHS.getOpcode() == ISD::AND && isa<ConstantSDNode>(RHS) &&
- cast<ConstantSDNode>(RHS)->getZExtValue() == 0 &&
+ } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
!isUnsignedIntSetCC(CC)) {
// Similarly, (CMP (and X, Y), 0) can be implemented with a TST
// (a.k.a. ANDS) except that the flags are only guaranteed to work for one
@@ -1167,14 +1210,230 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
LHS = LHS.getOperand(0);
}
- return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS)
+ return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
.getValue(1);
}
+/// \defgroup AArch64CCMP CMP;CCMP matching
+///
+/// These functions deal with the formation of CMP;CCMP;... sequences.
+/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
+/// a comparison. They set the NZCV flags to a predefined value if their
+/// predicate is false. This allows expressing arbitrary conjunctions, for
+/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
+/// expressed as:
+/// cmp A
+/// ccmp B, inv(CB), CA
+/// check for CB flags
+///
+/// In general we can create code for arbitrary "... (and (and A B) C)"
+/// sequences. We can also implement some "or" expressions, because "(or A B)"
+/// is equivalent to "not (and (not A) (not B))" and we can implement some
+/// negation operations:
+/// We can negate the results of a single comparison by inverting the flags
+/// used when the predicate fails and inverting the flags tested in the next
+/// instruction; We can also negate the results of the whole previous
+/// conditional compare sequence by inverting the flags tested in the next
+/// instruction. However there is no way to negate the result of a partial
+/// sequence.
+///
+/// Therefore on encountering an "or" expression we can negate the subtree on
+/// one side and have to be able to push the negate to the leaves of the subtree
+/// on the other side (see also the comments in code). As complete example:
+/// "or (or (setCA (cmp A)) (setCB (cmp B)))
+/// (and (setCC (cmp C)) (setCD (cmp D)))"
+/// is transformed to
+/// "not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
+/// (and (not (setCA (cmp A))) (not (setCB (cmp B))))))"
+/// and implemented as:
+/// cmp C
+/// ccmp D, inv(CD), CC
+/// ccmp A, CA, inv(CD)
+/// ccmp B, CB, inv(CA)
+/// check for CB flags
+/// A counterexample is "or (and A B) (and C D)" which cannot be implemented
+/// by conditional compare sequences.
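+///
+/// As a concrete sketch (the registers and condition codes below are
+/// arbitrary illustrations, not generated output), a simple conjunction such
+/// as "(and (setLT (cmp w0, w1)) (setEQ (cmp w2, w3)))" can be emitted as:
+/// cmp w0, w1
+/// ccmp w2, w3, #0, lt
+/// cset w8, eq
+/// where the #0 immediate clears NZCV when "lt" fails, so the final "eq"
+/// test also fails.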
+/// @{
+
+/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
+static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
+ ISD::CondCode CC, SDValue CCOp,
+ SDValue Condition, unsigned NZCV,
+ SDLoc DL, SelectionDAG &DAG) {
+ unsigned Opcode = 0;
+ if (LHS.getValueType().isFloatingPoint())
+ Opcode = AArch64ISD::FCCMP;
+ else if (RHS.getOpcode() == ISD::SUB) {
+ SDValue SubOp0 = RHS.getOperand(0);
+ if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ // See emitComparison() on why we can only do this for SETEQ and SETNE.
+ Opcode = AArch64ISD::CCMN;
+ RHS = RHS.getOperand(1);
+ }
+ }
+ if (Opcode == 0)
+ Opcode = AArch64ISD::CCMP;
+
+ SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
+ return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
+}
+
+/// Returns true if @p Val is a tree of AND/OR/SETCC operations.
+/// CanPushNegate is set to true if we can push a negate operation through
+/// the tree in a way that we are left with AND operations and negate operations
+/// at the leaves only; i.e. "not (or (or x y) z)" can be changed to
+/// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be
+/// brought into such a form.
+static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate,
+ unsigned Depth = 0) {
+ if (!Val.hasOneUse())
+ return false;
+ unsigned Opcode = Val->getOpcode();
+ if (Opcode == ISD::SETCC) {
+ CanPushNegate = true;
+ return true;
+ }
+ // Protect against stack overflow.
+ if (Depth > 15)
+ return false;
+ if (Opcode == ISD::AND || Opcode == ISD::OR) {
+ SDValue O0 = Val->getOperand(0);
+ SDValue O1 = Val->getOperand(1);
+ bool CanPushNegateL;
+ if (!isConjunctionDisjunctionTree(O0, CanPushNegateL, Depth+1))
+ return false;
+ bool CanPushNegateR;
+ if (!isConjunctionDisjunctionTree(O1, CanPushNegateR, Depth+1))
+ return false;
+ // We cannot push a negate through an AND operation (it would become an OR);
+ // we can, however, change a (not (or x y)) to (and (not x) (not y)) if we can
+ // push the negate through the x/y subtrees.
+ CanPushNegate = (Opcode == ISD::OR) && CanPushNegateL && CanPushNegateR;
+ return true;
+ }
+ return false;
+}
+
+/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
+/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
+/// Tries to transform the given i1 producing node @p Val to a series of compare
+/// and conditional compare operations. @returns an NZCV flags producing node
+/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
+/// the transformation was not possible.
+/// On recursive invocations @p PushNegate may be set to true to have negation
+/// effects pushed to the tree leaves; @p Predicate is an NZCV flag predicate
+/// for the comparisons in the current subtree; @p Depth limits the search
+/// depth to avoid stack overflow.
+static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
+ AArch64CC::CondCode &OutCC, bool PushNegate = false,
+ SDValue CCOp = SDValue(), AArch64CC::CondCode Predicate = AArch64CC::AL,
+ unsigned Depth = 0) {
+ // We're at a tree leaf, produce a conditional comparison operation.
+ unsigned Opcode = Val->getOpcode();
+ if (Opcode == ISD::SETCC) {
+ SDValue LHS = Val->getOperand(0);
+ SDValue RHS = Val->getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
+ bool isInteger = LHS.getValueType().isInteger();
+ if (PushNegate)
+ CC = getSetCCInverse(CC, isInteger);
+ SDLoc DL(Val);
+ // Determine OutCC and handle FP special case.
+ if (isInteger) {
+ OutCC = changeIntCCToAArch64CC(CC);
+ } else {
+ assert(LHS.getValueType().isFloatingPoint());
+ AArch64CC::CondCode ExtraCC;
+ changeFPCCToAArch64CC(CC, OutCC, ExtraCC);
+ // Surprisingly some floating point conditions can't be tested with a
+ // single condition code. Construct an additional comparison in this case.
+ // See comment below on how we deal with OR conditions.
+ if (ExtraCC != AArch64CC::AL) {
+ SDValue ExtraCmp;
+ if (!CCOp.getNode())
+ ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
+ else {
+ SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC);
+ // Note that we want the inverse of ExtraCC, so NZCV is not inverted.
+ unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(ExtraCC);
+ ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp,
+ NZCV, DL, DAG);
+ }
+ CCOp = ExtraCmp;
+ Predicate = AArch64CC::getInvertedCondCode(ExtraCC);
+ OutCC = AArch64CC::getInvertedCondCode(OutCC);
+ }
+ }
+
+ // Produce a normal comparison if we are first in the chain
+ if (!CCOp.getNode())
+ return emitComparison(LHS, RHS, CC, DL, DAG);
+ // Otherwise produce a ccmp.
+ SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC);
+ AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
+ unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
+ return emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, NZCV, DL,
+ DAG);
+ } else if ((Opcode != ISD::AND && Opcode != ISD::OR) || !Val->hasOneUse())
+ return SDValue();
+
+ assert((Opcode == ISD::OR || !PushNegate)
+ && "Can only push negate through OR operation");
+
+ // Check if both sides can be transformed.
+ SDValue LHS = Val->getOperand(0);
+ SDValue RHS = Val->getOperand(1);
+ bool CanPushNegateL;
+ if (!isConjunctionDisjunctionTree(LHS, CanPushNegateL, Depth+1))
+ return SDValue();
+ bool CanPushNegateR;
+ if (!isConjunctionDisjunctionTree(RHS, CanPushNegateR, Depth+1))
+ return SDValue();
+
+ // Do we need to negate our operands?
+ bool NegateOperands = Opcode == ISD::OR;
+ // We can negate the results of all previous operations by inverting the
+ // predicate flags giving us a free negation for one side. For the other side
+ // we need to be able to push the negation to the leaves of the tree.
+ if (NegateOperands) {
+ if (!CanPushNegateL && !CanPushNegateR)
+ return SDValue();
+ // Order the side where we can push the negate through to LHS.
+ if (!CanPushNegateL && CanPushNegateR)
+ std::swap(LHS, RHS);
+ } else {
+ bool NeedsNegOutL = LHS->getOpcode() == ISD::OR;
+ bool NeedsNegOutR = RHS->getOpcode() == ISD::OR;
+ if (NeedsNegOutL && NeedsNegOutR)
+ return SDValue();
+ // Order the side where we need to negate the output flags to RHS so it
+ // gets emitted first.
+ if (NeedsNegOutL)
+ std::swap(LHS, RHS);
+ }
+
+ // Emit RHS. If we want to negate the tree we only need to push a negate
+ // through if we are already in a PushNegate case, otherwise we can negate
+ // the "flags to test" afterwards.
+ AArch64CC::CondCode RHSCC;
+ SDValue CmpR = emitConjunctionDisjunctionTree(DAG, RHS, RHSCC, PushNegate,
+ CCOp, Predicate, Depth+1);
+ if (NegateOperands && !PushNegate)
+ RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
+ // Emit LHS. We must push the negate through if we need to negate it.
+ SDValue CmpL = emitConjunctionDisjunctionTree(DAG, LHS, OutCC, NegateOperands,
+ CmpR, RHSCC, Depth+1);
+ // If we transformed an OR to an AND then we have to negate the result
+ // (or absorb a PushNegate resulting in a double negation).
+ if (Opcode == ISD::OR && !PushNegate)
+ OutCC = AArch64CC::getInvertedCondCode(OutCC);
+ return CmpL;
+}
+
+/// @}
+
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) {
- SDValue Cmp;
- AArch64CC::CondCode AArch64CC;
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
EVT VT = RHS.getValueType();
uint64_t C = RHSC->getZExtValue();
@@ -1229,47 +1488,56 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
}
}
}
- // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
- // For the i8 operand, the largest immediate is 255, so this can be easily
- // encoded in the compare instruction. For the i16 operand, however, the
- // largest immediate cannot be encoded in the compare.
- // Therefore, use a sign extending load and cmn to avoid materializing the -1
- // constant. For example,
- // movz w1, #65535
- // ldrh w0, [x0, #0]
- // cmp w0, w1
- // >
- // ldrsh w0, [x0, #0]
- // cmn w0, #1
- // Fundamental, we're relying on the property that (zext LHS) == (zext RHS)
- // if and only if (sext LHS) == (sext RHS). The checks are in place to ensure
- // both the LHS and RHS are truely zero extended and to make sure the
- // transformation is profitable.
+ SDValue Cmp;
+ AArch64CC::CondCode AArch64CC;
if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
- if ((cast<ConstantSDNode>(RHS)->getZExtValue() >> 16 == 0) &&
- isa<LoadSDNode>(LHS)) {
- if (cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
- cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
- LHS.getNode()->hasNUsesOfValue(1, 0)) {
- int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
- if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
- SDValue SExt =
- DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
- DAG.getValueType(MVT::i16));
- Cmp = emitComparison(SExt,
- DAG.getConstant(ValueofRHS, dl,
- RHS.getValueType()),
- CC, dl, DAG);
- AArch64CC = changeIntCCToAArch64CC(CC);
- AArch64cc = DAG.getConstant(AArch64CC, dl, MVT::i32);
- return Cmp;
- }
+ const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
+
+ // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
+ // For the i8 operand, the largest immediate is 255, so this can be easily
+ // encoded in the compare instruction. For the i16 operand, however, the
+ // largest immediate cannot be encoded in the compare.
+ // Therefore, use a sign extending load and cmn to avoid materializing the
+ // -1 constant. For example,
+ // movz w1, #65535
+ // ldrh w0, [x0, #0]
+ // cmp w0, w1
+ // >
+ // ldrsh w0, [x0, #0]
+ // cmn w0, #1
+ // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
+ // if and only if (sext LHS) == (sext RHS). The checks are in place to
+ // ensure both the LHS and RHS are truly zero extended and to make sure the
+ // transformation is profitable.
+ if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
+ cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
+ cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
+ LHS.getNode()->hasNUsesOfValue(1, 0)) {
+ int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
+ if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
+ SDValue SExt =
+ DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
+ DAG.getValueType(MVT::i16));
+ Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
+ RHS.getValueType()),
+ CC, dl, DAG);
+ AArch64CC = changeIntCCToAArch64CC(CC);
}
}
+
+ if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
+ if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) {
+ if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
+ AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
+ }
+ }
+ }
+
+ if (!Cmp) {
+ Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+ AArch64CC = changeIntCCToAArch64CC(CC);
}
- Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
- AArch64CC = changeIntCCToAArch64CC(CC);
- AArch64cc = DAG.getConstant(AArch64CC, dl, MVT::i32);
+ AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
return Cmp;
}
@@ -1391,8 +1659,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
RTLIB::Libcall Call) const {
SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
- return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false,
- SDLoc(Op)).first;
+ return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first;
}
static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
@@ -1571,8 +1838,8 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
// precise. That doesn't take part in the LibCall so we can't directly use
// LowerF128Call.
SDValue SrcVal = Op.getOperand(0);
- return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
- /*isSigned*/ false, SDLoc(Op)).first;
+ return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
+ SDLoc(Op)).first;
}
static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
@@ -1581,6 +1848,16 @@ static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
// in the cost tables.
EVT InVT = Op.getOperand(0).getValueType();
EVT VT = Op.getValueType();
+ unsigned NumElts = InVT.getVectorNumElements();
+
+ // f16 vectors are promoted to f32 before a conversion.
+ if (InVT.getVectorElementType() == MVT::f16) {
+ MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
+ SDLoc dl(Op);
+ return DAG.getNode(
+ Op.getOpcode(), dl, Op.getValueType(),
+ DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
+ }
if (VT.getSizeInBits() < InVT.getSizeInBits()) {
SDLoc dl(Op);
@@ -1628,8 +1905,7 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
- return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false,
- SDLoc(Op)).first;
+ return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first;
}
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
@@ -1931,6 +2207,31 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
}
+SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ SDLoc dl(Op);
+ switch (IntNo) {
+ default: return SDValue(); // Don't custom lower most intrinsics.
+ case Intrinsic::aarch64_thread_pointer: {
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
+ }
+ case Intrinsic::aarch64_neon_smax:
+ return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::aarch64_neon_umax:
+ return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::aarch64_neon_smin:
+ return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::aarch64_neon_umin:
+ return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ }
+}
+
SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -2032,14 +2333,11 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerFSINCOS(Op, DAG);
case ISD::MUL:
return LowerMUL(Op, DAG);
+ case ISD::INTRINSIC_WO_CHAIN:
+ return LowerINTRINSIC_WO_CHAIN(Op, DAG);
}
}
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const {
- return 2;
-}
-
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
@@ -2214,9 +2512,10 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
break;
}
- ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN,
- MachinePointerInfo::getFixedStack(FI),
- MemVT, false, false, false, 0);
+ ArgValue = DAG.getExtLoad(
+ ExtType, DL, VA.getLocVT(), Chain, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
+ MemVT, false, false, false, 0);
InVals.push_back(ArgValue);
}
@@ -2289,9 +2588,10 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
- SDValue Store =
- DAG.getStore(Val.getValue(1), DL, Val, FIN,
- MachinePointerInfo::getStack(i * 8), false, false, 0);
+ SDValue Store = DAG.getStore(
+ Val.getValue(1), DL, Val, FIN,
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8), false,
+ false, 0);
MemOps.push_back(Store);
FIN =
DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
@@ -2318,9 +2618,10 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
- SDValue Store =
- DAG.getStore(Val.getValue(1), DL, Val, FIN,
- MachinePointerInfo::getStack(i * 16), false, false, 0);
+ SDValue Store = DAG.getStore(
+ Val.getValue(1), DL, Val, FIN,
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16),
+ false, false, 0);
MemOps.push_back(Store);
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
DAG.getConstant(16, DL, PtrVT));
@@ -2453,8 +2754,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
*DAG.getContext());
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
- if (!ArgLocs[i].isRegLoc())
+ for (const CCValAssign &ArgLoc : ArgLocs)
+ if (!ArgLoc.isRegLoc())
return false;
}
@@ -2758,7 +3059,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
DstAddr = DAG.getFrameIndex(FI, PtrVT);
- DstInfo = MachinePointerInfo::getFixedStack(FI);
+ DstInfo =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
// Make sure any stack arguments overlapping with where we're storing
// are loaded before this eventual operation. Otherwise they'll be
@@ -2768,7 +3070,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
- DstInfo = MachinePointerInfo::getStack(LocMemOffset);
+ DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
+ LocMemOffset);
}
if (Outs[i].Flags.isByVal()) {
@@ -2802,9 +3105,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InFlag;
- for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
- Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
- RegsToPass[i].second, InFlag);
+ for (auto &RegToPass : RegsToPass) {
+ Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
+ RegToPass.second, InFlag);
InFlag = Chain.getValue(1);
}
@@ -2860,9 +3163,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Add argument registers to the end of the list so that they are known live
// into the call.
- for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
- Ops.push_back(DAG.getRegister(RegsToPass[i].first,
- RegsToPass[i].second.getValueType()));
+ for (auto &RegToPass : RegsToPass)
+ Ops.push_back(DAG.getRegister(RegToPass.first,
+ RegToPass.second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
const uint32_t *Mask;
@@ -2968,6 +3271,19 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
+ const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const MCPhysReg *I =
+ TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
+ if (I) {
+ for (; *I; ++I) {
+ if (AArch64::GPR64RegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::i64));
+ else if (AArch64::FPR64RegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
+ else
+ llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+ }
+ }
RetOps[0] = Chain; // Update chain.
@@ -3010,11 +3326,12 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags);
SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
- SDValue GlobalAddr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), PoolAddr,
- MachinePointerInfo::getConstantPool(),
- /*isVolatile=*/ false,
- /*isNonTemporal=*/ true,
- /*isInvariant=*/ true, 8);
+ SDValue GlobalAddr = DAG.getLoad(
+ PtrVT, DL, DAG.getEntryNode(), PoolAddr,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ /*isVolatile=*/false,
+ /*isNonTemporal=*/true,
+ /*isInvariant=*/true, 8);
if (GN->getOffset() != 0)
return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr,
DAG.getConstant(GN->getOffset(), DL, PtrVT));
@@ -3087,8 +3404,9 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
// to obtain the address of the variable.
SDValue Chain = DAG.getEntryNode();
SDValue FuncTLVGet =
- DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(),
- false, true, true, 8);
+ DAG.getLoad(MVT::i64, DL, Chain, DescAddr,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()), false,
+ true, true, 8);
Chain = FuncTLVGet.getValue(1);
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
@@ -3160,6 +3478,10 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
+
+ if (DAG.getTarget().Options.EmulatedTLS)
+ return LowerToTLSEmulatedModel(GA, DAG);
+
if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
if (Model == TLSModel::LocalDynamic)
Model = TLSModel::GeneralDynamic;
@@ -3277,8 +3599,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
// Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
// instruction.
unsigned Opc = LHS.getOpcode();
- if (LHS.getResNo() == 1 && isa<ConstantSDNode>(RHS) &&
- cast<ConstantSDNode>(RHS)->isOne() &&
+ if (LHS.getResNo() == 1 && isOneConstant(RHS) &&
(Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
@@ -3392,17 +3713,11 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
SDValue In1 = Op.getOperand(0);
SDValue In2 = Op.getOperand(1);
EVT SrcVT = In2.getValueType();
- if (SrcVT != VT) {
- if (SrcVT == MVT::f32 && VT == MVT::f64)
- In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
- else if (SrcVT == MVT::f64 && VT == MVT::f32)
- In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2,
- DAG.getIntPtrConstant(0, DL));
- else
- // FIXME: Src type is different, bail out for now. Can VT really be a
- // vector type?
- return SDValue();
- }
+
+ if (SrcVT.bitsLT(VT))
+ In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
+ else if (SrcVT.bitsGT(VT))
+ In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL));
EVT VecVT;
EVT EltVT;
@@ -3410,7 +3725,7 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
SDValue VecVal1, VecVal2;
if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
EltVT = MVT::i32;
- VecVT = MVT::v4i32;
+ VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32);
EltMask = 0x80000000ULL;
if (!VT.isVector()) {
@@ -3571,32 +3886,6 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
}
}
-/// A SELECT_CC operation is really some kind of max or min if both values being
-/// compared are, in some sense, equal to the results in either case. However,
-/// it is permissible to compare f32 values and produce directly extended f64
-/// values.
-///
-/// Extending the comparison operands would also be allowed, but is less likely
-/// to happen in practice since their use is right here. Note that truncate
-/// operations would *not* be semantically equivalent.
-static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) {
- if (Cmp == Result)
- return (Cmp.getValueType() == MVT::f32 ||
- Cmp.getValueType() == MVT::f64);
-
- ConstantFPSDNode *CCmp = dyn_cast<ConstantFPSDNode>(Cmp);
- ConstantFPSDNode *CResult = dyn_cast<ConstantFPSDNode>(Result);
- if (CCmp && CResult && Cmp.getValueType() == MVT::f32 &&
- Result.getValueType() == MVT::f64) {
- bool Lossy;
- APFloat CmpVal = CCmp->getValueAPF();
- CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy);
- return CResult->getValueAPF().bitwiseIsEqual(CmpVal);
- }
-
- return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp;
-}
-
SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
SDValue RHS, SDValue TVal,
SDValue FVal, SDLoc dl,
@@ -3614,7 +3903,13 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
}
}
- // Handle integers first.
+ // Also handle f16, for which we need to do an f32 comparison.
+ if (LHS.getValueType() == MVT::f16) {
+ LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
+ RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
+ }
+
+ // Next, handle integers.
if (LHS.getValueType().isInteger()) {
assert((LHS.getValueType() == RHS.getValueType()) &&
(LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
@@ -3637,9 +3932,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
} else if (TVal.getOpcode() == ISD::XOR) {
// If TVal is a NOT we want to swap TVal and FVal so that we can match
// with a CSINV rather than a CSEL.
- ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(1));
-
- if (CVal && CVal->isAllOnesValue()) {
+ if (isAllOnesConstant(TVal.getOperand(1))) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, true);
@@ -3647,9 +3940,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
} else if (TVal.getOpcode() == ISD::SUB) {
// If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
// that we can match with a CSNEG rather than a CSEL.
- ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(0));
-
- if (CVal && CVal->isNullValue()) {
+ if (isNullConstant(TVal.getOperand(0))) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, true);
@@ -4109,46 +4400,57 @@ SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
- SDValue ARMcc;
unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
- SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+ SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
+
+ // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which
+ // is "undef". We wanted 0, so CSEL it directly.
+ SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
+ ISD::SETEQ, dl, DAG);
+ SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
+ HiBitsForLo =
+ DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
+ HiBitsForLo, CCVal, Cmp);
+
SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
DAG.getConstant(VTBits, dl, MVT::i64));
- SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
- SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64),
- ISD::SETGE, dl, DAG);
- SDValue CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
+ SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+ SDValue LoForNormalShift =
+ DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo);
- SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
- SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
- SDValue Lo =
- DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
+ Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
+ dl, DAG);
+ CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
+ SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
+ SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
+ LoForNormalShift, CCVal, Cmp);
// AArch64 shifts larger than the register width are wrapped rather than
// clamped, so we can't just emit "hi >> x".
- SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
- SDValue TrueValHi = Opc == ISD::SRA
- ? DAG.getNode(Opc, dl, VT, ShOpHi,
- DAG.getConstant(VTBits - 1, dl,
- MVT::i64))
- : DAG.getConstant(0, dl, VT);
- SDValue Hi =
- DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp);
+ SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+ SDValue HiForBigShift =
+ Opc == ISD::SRA
+ ? DAG.getNode(Opc, dl, VT, ShOpHi,
+ DAG.getConstant(VTBits - 1, dl, MVT::i64))
+ : DAG.getConstant(0, dl, VT);
+ SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
+ HiForNormalShift, CCVal, Cmp);
SDValue Ops[2] = { Lo, Hi };
return DAG.getMergeValues(Ops, dl);
}
+
/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
/// i64 values and takes a 2 x i64 value to shift plus a shift amount.
SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
- SelectionDAG &DAG) const {
+ SelectionDAG &DAG) const {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
EVT VT = Op.getValueType();
unsigned VTBits = VT.getSizeInBits();
@@ -4156,31 +4458,41 @@ SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
- SDValue ARMcc;
assert(Op.getOpcode() == ISD::SHL_PARTS);
SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
- SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+ SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+
+ // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which
+ // is "undef". We wanted 0, so CSEL it directly.
+ SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
+ ISD::SETEQ, dl, DAG);
+ SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
+ LoBitsForHi =
+ DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
+ LoBitsForHi, CCVal, Cmp);
+
SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
DAG.getConstant(VTBits, dl, MVT::i64));
- SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
- SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
+ SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
+ SDValue HiForNormalShift =
+ DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi);
- SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+ SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
- SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64),
- ISD::SETGE, dl, DAG);
- SDValue CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
- SDValue Hi =
- DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp);
+ Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
+ dl, DAG);
+ CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
+ SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
+ HiForNormalShift, CCVal, Cmp);
// AArch64 shifts larger than the register size are wrapped rather than
// clamped, so we can't just emit "lo << a" if a is too big.
- SDValue TrueValLo = DAG.getConstant(0, dl, VT);
- SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
- SDValue Lo =
- DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
+ SDValue LoForBigShift = DAG.getConstant(0, dl, VT);
+ SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+ SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
+ LoForNormalShift, CCVal, Cmp);
SDValue Ops[2] = { Lo, Hi };
return DAG.getMergeValues(Ops, dl);
@@ -4362,8 +4674,7 @@ void AArch64TargetLowering::LowerAsmOperandForConstraint(
// Validate and return a target constant for them if we can.
case 'z': {
// 'z' maps to xzr or wzr so it needs an input of 0.
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
- if (!C || C->getZExtValue() != 0)
+ if (!isNullConstant(Op))
return;
if (Op.getValueType() == MVT::i64)
@@ -5653,11 +5964,10 @@ static SDValue NormalizeBuildVector(SDValue Op,
return Op;
SmallVector<SDValue, 16> Ops;
- for (unsigned I = 0, E = VT.getVectorNumElements(); I != E; ++I) {
- SDValue Lane = Op.getOperand(I);
- if (Lane.getOpcode() == ISD::Constant) {
+ for (SDValue Lane : Op->ops()) {
+ if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
APInt LowBits(EltTy.getSizeInBits(),
- cast<ConstantSDNode>(Lane)->getZExtValue());
+ CstLane->getZExtValue());
Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
}
Ops.push_back(Lane);
@@ -5997,8 +6307,7 @@ FailedModImm:
// Empirical tests suggest this is rarely worth it for vectors of length <= 2.
if (NumElts >= 4) {
- SDValue shuffle = ReconstructShuffle(Op, DAG);
- if (shuffle != SDValue())
+ if (SDValue shuffle = ReconstructShuffle(Op, DAG))
return shuffle;
}
@@ -6017,7 +6326,10 @@ FailedModImm:
// a) Avoid a RMW dependency on the full vector register, and
// b) Allow the register coalescer to fold away the copy if the
// value is already in an S or D register.
- if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) {
+ // Do not do this for UNDEF/LOAD nodes because we have better patterns
+ // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR.
+ if (Op0.getOpcode() != ISD::UNDEF && Op0.getOpcode() != ISD::LOAD &&
+ (ElemSize == 32 || ElemSize == 64)) {
unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub;
MachineSDNode *N =
DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0,
@@ -6123,24 +6435,11 @@ SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
unsigned Val = Cst->getZExtValue();
unsigned Size = Op.getValueType().getSizeInBits();
- if (Val == 0) {
- switch (Size) {
- case 8:
- return DAG.getTargetExtractSubreg(AArch64::bsub, dl, Op.getValueType(),
- Op.getOperand(0));
- case 16:
- return DAG.getTargetExtractSubreg(AArch64::hsub, dl, Op.getValueType(),
- Op.getOperand(0));
- case 32:
- return DAG.getTargetExtractSubreg(AArch64::ssub, dl, Op.getValueType(),
- Op.getOperand(0));
- case 64:
- return DAG.getTargetExtractSubreg(AArch64::dsub, dl, Op.getValueType(),
- Op.getOperand(0));
- default:
- llvm_unreachable("Unexpected vector type in extract_subvector!");
- }
- }
+
+ // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
+ if (Val == 0)
+ return Op;
+
// If this is extracting the upper 64-bits of a 128-bit vector, we match
// that directly.
if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64)
@@ -6213,26 +6512,20 @@ static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
/// 0 <= Value <= ElementBits for a long left shift.
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
assert(VT.isVector() && "vector shift count is not a vector type");
- unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+ int64_t ElementBits = VT.getVectorElementType().getSizeInBits();
if (!getVShiftImm(Op, ElementBits, Cnt))
return false;
return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
}
/// isVShiftRImm - Check if this is a valid build_vector for the immediate
-/// operand of a vector shift right operation. For a shift opcode, the value
-/// is positive, but for an intrinsic the value count must be negative. The
-/// absolute value must be in the range:
-/// 1 <= |Value| <= ElementBits for a right shift; or
-/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
-static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
- int64_t &Cnt) {
+/// operand of a vector shift right operation. The value must be in the range:
+/// 1 <= Value <= ElementBits for a right shift.
+static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
assert(VT.isVector() && "vector shift count is not a vector type");
- unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+ int64_t ElementBits = VT.getVectorElementType().getSizeInBits();
if (!getVShiftImm(Op, ElementBits, Cnt))
return false;
- if (isIntrinsic)
- Cnt = -Cnt;
return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
}
@@ -6261,8 +6554,7 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
case ISD::SRA:
case ISD::SRL:
// Right shift immediate
- if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) &&
- Cnt < EltSize) {
+ if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
unsigned Opc =
(Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
@@ -6451,7 +6743,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::aarch64_neon_ld4r: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
// Conservatively set memVT to the entire set of vectors loaded.
- uint64_t NumElts = DL.getTypeAllocSize(I.getType()) / 8;
+ uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
Info.offset = 0;
@@ -6477,7 +6769,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Type *ArgTy = I.getArgOperand(ArgI)->getType();
if (!ArgTy->isVectorTy())
break;
- NumElts += DL.getTypeAllocSize(ArgTy) / 8;
+ NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
}
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
@@ -6720,10 +7012,10 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
const DataLayout &DL = LI->getModule()->getDataLayout();
VectorType *VecTy = Shuffles[0]->getType();
- unsigned VecSize = DL.getTypeAllocSizeInBits(VecTy);
+ unsigned VecSize = DL.getTypeSizeInBits(VecTy);
- // Skip illegal vector types.
- if (VecSize != 64 && VecSize != 128)
+ // Skip if we do not have NEON and skip illegal vector types.
+ if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128))
return false;
// A pointer vector can not be the return type of the ldN intrinsics. Need to
@@ -6806,10 +7098,10 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
const DataLayout &DL = SI->getModule()->getDataLayout();
- unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy);
+ unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
- // Skip illegal vector types.
- if (SubVecSize != 64 && SubVecSize != 128)
+ // Skip if we do not have NEON and skip illegal vector types.
+ if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128))
return false;
Value *Op0 = SVI->getOperand(0);
@@ -7228,8 +7520,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
// First try to optimize away the conversion when it's conditionally from
// a constant. Vectors only.
- SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
- if (Res != SDValue())
+ if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
return Res;
EVT VT = N->getValueType(0);
@@ -7242,7 +7533,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
// If the result of an integer load is only used by an integer-to-float
// conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
- // This eliminates an "integer-to-vector-move UOP and improve throughput.
+ // This eliminates an "integer-to-vector-move" UOP and improves throughput.
SDValue N0 = N->getOperand(0);
if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
// Do not change the width of a volatile load.
@@ -7265,6 +7556,134 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// Fold a floating-point multiply by power of two into floating-point to
+/// fixed-point conversion.
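+///
+/// For example (a rough sketch, not a literal DAG dump): a vector
+/// "fp_to_sint (fmul %x, 4.0)" can instead use the fixed-point conversion
+/// intrinsic aarch64_neon_vcvtfp2fxs with 2 fractional bits, folding away
+/// the separate multiply.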
+static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
+ if (!Subtarget->hasNEON())
+ return SDValue();
+
+ SDValue Op = N->getOperand(0);
+ if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL)
+ return SDValue();
+
+ SDValue ConstVec = Op->getOperand(1);
+ if (!isa<BuildVectorSDNode>(ConstVec))
+ return SDValue();
+
+ MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
+ uint32_t FloatBits = FloatTy.getSizeInBits();
+ if (FloatBits != 32 && FloatBits != 64)
+ return SDValue();
+
+ MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
+ uint32_t IntBits = IntTy.getSizeInBits();
+ if (IntBits != 16 && IntBits != 32 && IntBits != 64)
+ return SDValue();
+
+ // Avoid conversions where iN is larger than the float (e.g., float -> i64).
+ if (IntBits > FloatBits)
+ return SDValue();
+
+ BitVector UndefElements;
+ BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
+ int32_t Bits = IntBits == 64 ? 64 : 32;
+ int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
+ if (C == -1 || C == 0 || C > Bits)
+ return SDValue();
+
+ MVT ResTy;
+ unsigned NumLanes = Op.getValueType().getVectorNumElements();
+ switch (NumLanes) {
+ default:
+ return SDValue();
+ case 2:
+ ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
+ break;
+ case 4:
+ ResTy = MVT::v4i32;
+ break;
+ }
+
+ SDLoc DL(N);
+ bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
+ unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
+ : Intrinsic::aarch64_neon_vcvtfp2fxu;
+ SDValue FixConv =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
+ DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
+ Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
+ // We can handle smaller integers by generating an extra trunc.
+ if (IntBits < FloatBits)
+ FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
+
+ return FixConv;
+}
+
+/// Fold a floating-point divide by power of two into fixed-point to
+/// floating-point conversion.
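+///
+/// For example (a rough sketch): a vector "fdiv (sint_to_fp %x), 8.0" can be
+/// lowered via aarch64_neon_vcvtfxs2fp with 3 fractional bits, i.e. a signed
+/// fixed-point to floating-point convert that already divides by 2^3.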
+static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
+ if (!Subtarget->hasNEON())
+ return SDValue();
+
+ SDValue Op = N->getOperand(0);
+ unsigned Opc = Op->getOpcode();
+ if (!Op.getValueType().isVector() ||
+ (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
+ return SDValue();
+
+ SDValue ConstVec = N->getOperand(1);
+ if (!isa<BuildVectorSDNode>(ConstVec))
+ return SDValue();
+
+ MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
+ int32_t IntBits = IntTy.getSizeInBits();
+ if (IntBits != 16 && IntBits != 32 && IntBits != 64)
+ return SDValue();
+
+ MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
+ int32_t FloatBits = FloatTy.getSizeInBits();
+ if (FloatBits != 32 && FloatBits != 64)
+ return SDValue();
+
+ // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
+ if (IntBits > FloatBits)
+ return SDValue();
+
+ BitVector UndefElements;
+ BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
+ int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
+ if (C == -1 || C == 0 || C > FloatBits)
+ return SDValue();
+
+ MVT ResTy;
+ unsigned NumLanes = Op.getValueType().getVectorNumElements();
+ switch (NumLanes) {
+ default:
+ return SDValue();
+ case 2:
+ ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
+ break;
+ case 4:
+ ResTy = MVT::v4i32;
+ break;
+ }
+
+ SDLoc DL(N);
+ SDValue ConvInput = Op.getOperand(0);
+ bool IsSigned = Opc == ISD::SINT_TO_FP;
+ if (IntBits < FloatBits)
+ ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
+ ResTy, ConvInput);
+
+ unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
+ : Intrinsic::aarch64_neon_vcvtfxu2fp;
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
+ DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
+ DAG.getConstant(C, DL, MVT::i32));
+}
+
/// An EXTR instruction is made up of two shifts, ORed together. This helper
/// searches for and classifies those shifts.
static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
@@ -7964,7 +8383,6 @@ static SDValue performIntrinsicCombine(SDNode *N,
case Intrinsic::aarch64_neon_vcvtfxs2fp:
case Intrinsic::aarch64_neon_vcvtfxu2fp:
return tryCombineFixedPointConvert(N, DCI, DAG);
- break;
case Intrinsic::aarch64_neon_saddv:
return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
case Intrinsic::aarch64_neon_uaddv:
@@ -7978,10 +8396,16 @@ static SDValue performIntrinsicCombine(SDNode *N,
case Intrinsic::aarch64_neon_umaxv:
return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
case Intrinsic::aarch64_neon_fmax:
- return DAG.getNode(AArch64ISD::FMAX, SDLoc(N), N->getValueType(0),
+ return DAG.getNode(ISD::FMAXNAN, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_fmin:
- return DAG.getNode(AArch64ISD::FMIN, SDLoc(N), N->getValueType(0),
+ return DAG.getNode(ISD::FMINNAN, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2));
+ case Intrinsic::aarch64_neon_fmaxnm:
+ return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2));
+ case Intrinsic::aarch64_neon_fminnm:
+ return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_smull:
case Intrinsic::aarch64_neon_umull:
@@ -8141,7 +8565,7 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) {
unsigned Alignment = std::min(OrigAlignment, EltOffset);
// Create scalar stores. This is at least as good as the code sequence for a
- // split unaligned store wich is a dup.s, ext.b, and two stores.
+ // split unaligned store which is a dup.s, ext.b, and two stores.
// Most of the time the three stores should be replaced by store pair
// instructions (stp).
SDLoc DL(St);
@@ -8162,10 +8586,9 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) {
return NewST1;
}
-static SDValue performSTORECombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- SelectionDAG &DAG,
- const AArch64Subtarget *Subtarget) {
+static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
if (!DCI.isBeforeLegalize())
return SDValue();
@@ -8173,15 +8596,17 @@ static SDValue performSTORECombine(SDNode *N,
if (S->isVolatile())
return SDValue();
+ // FIXME: The logic for deciding if an unaligned store should be split should
+ // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
+ // a call to that function here.
+
// Cyclone has bad performance on unaligned 16B stores when crossing line and
// page boundaries. We want to split such stores.
if (!Subtarget->isCyclone())
return SDValue();
- // Don't split at Oz.
- MachineFunction &MF = DAG.getMachineFunction();
- bool IsMinSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize);
- if (IsMinSize)
+ // Don't split at -Oz.
+ if (DAG.getMachineFunction().getFunction()->optForMinSize())
return SDValue();
SDValue StVal = S->getValue();
@@ -8204,8 +8629,7 @@ static SDValue performSTORECombine(SDNode *N,
// If we get a splat of a scalar convert this vector store to a store of
// scalars. They will be merged into store pairs thereby removing two
// instructions.
- SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S);
- if (ReplacedSplat != SDValue())
+ if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S))
return ReplacedSplat;
SDLoc DL(S);
@@ -8326,6 +8750,299 @@ static SDValue performPostLD1Combine(SDNode *N,
return SDValue();
}
+/// Simplify \p Addr given that the top byte of it is ignored by HW during
+/// address translation.
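+/// For example (an illustrative sketch), a load or store whose address is
+/// "(and %ptr, 0x00FFFFFFFFFFFFFF)" only demands the low 56 address bits, so
+/// the explicit AND that strips a pointer tag can be dropped here.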
+static bool performTBISimplification(SDValue Addr,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ APInt DemandedMask = APInt::getLowBitsSet(64, 56);
+ APInt KnownZero, KnownOne;
+ TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
+ DCI.isBeforeLegalizeOps());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(Addr, DemandedMask, KnownZero, KnownOne, TLO)) {
+ DCI.CommitTargetLoweringOpt(TLO);
+ return true;
+ }
+ return false;
+}
+
+static SDValue performSTORECombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
+ SDValue Split = split16BStores(N, DCI, DAG, Subtarget);
+ if (Split.getNode())
+ return Split;
+
+ if (Subtarget->supportsAddressTopByteIgnored() &&
+ performTBISimplification(N->getOperand(2), DCI, DAG))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+/// This function handles the log2-shuffle pattern produced by the
+/// LoopVectorizer for the across vector reduction. It consists of
+/// log2(NumVectorElements) steps and, in each step, 2^(s) elements
+/// are reduced, where s is an induction variable from 0 to
+/// log2(NumVectorElements).
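+///
+/// For a <4 x i32> add reduction, a sketch of the shape this matches (the
+/// value names are illustrative only):
+/// %s0 = vector_shuffle %v, undef<2,3,u,u>
+/// %a0 = add %v, %s0
+/// %s1 = vector_shuffle %a0, undef<1,u,u,u>
+/// %a1 = add %a0, %s1
+/// with the scalar result taken from lane 0 of %a1.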
+static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV,
+ unsigned Op,
+ SelectionDAG &DAG) {
+ EVT VTy = OpV->getOperand(0).getValueType();
+ if (!VTy.isVector())
+ return SDValue();
+
+ int NumVecElts = VTy.getVectorNumElements();
+ if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) {
+ if (NumVecElts != 4)
+ return SDValue();
+ } else {
+ if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16)
+ return SDValue();
+ }
+
+ int NumExpectedSteps = APInt(8, NumVecElts).logBase2();
+ SDValue PreOp = OpV;
+ // Iterate over each step of the across vector reduction.
+ for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) {
+ SDValue CurOp = PreOp.getOperand(0);
+ SDValue Shuffle = PreOp.getOperand(1);
+ if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) {
+ // Try to swap the 1st and 2nd operand as add and min/max instructions
+ // are commutative.
+ CurOp = PreOp.getOperand(1);
+ Shuffle = PreOp.getOperand(0);
+ if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE)
+ return SDValue();
+ }
+
+ // Check if the input vector is fed by the operator we want to handle,
+ // except at the last step; the very first input vector is not necessarily
+ // fed by the same operator we are handling.
+ if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1)))
+ return SDValue();
+
+ // Check if it forms one step of the across vector reduction.
+ // E.g.,
+ // %cur = add %1, %0
+ // %shuffle = vector_shuffle %cur, <2, 3, u, u>
+ // %pre = add %cur, %shuffle
+ if (Shuffle.getOperand(0) != CurOp)
+ return SDValue();
+
+ int NumMaskElts = 1 << CurStep;
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Shuffle)->getMask();
+ // Check mask values in each step.
+ // We expect the shuffle mask in each step to follow a specific pattern
+ // denoted here by the <M, U> form, where M is a sequence of integers
+ // starting from NumMaskElts and increasing by 1, and the number of
+ // integers in M should be NumMaskElts. U is a sequence of UNDEFs and
+ // the number of UNDEFs in U should be NumVecElts - NumMaskElts.
+ // E.g., for <8 x i16>, mask values in each step should be :
+ // step 0 : <1,u,u,u,u,u,u,u>
+ // step 1 : <2,3,u,u,u,u,u,u>
+ // step 2 : <4,5,6,7,u,u,u,u>
+ for (int i = 0; i < NumVecElts; ++i)
+ if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) ||
+ (i >= NumMaskElts && !(Mask[i] < 0)))
+ return SDValue();
+
+ PreOp = CurOp;
+ }
+ unsigned Opcode;
+ bool IsIntrinsic = false;
+
+ switch (Op) {
+ default:
+ llvm_unreachable("Unexpected operator for across vector reduction");
+ case ISD::ADD:
+ Opcode = AArch64ISD::UADDV;
+ break;
+ case ISD::SMAX:
+ Opcode = AArch64ISD::SMAXV;
+ break;
+ case ISD::UMAX:
+ Opcode = AArch64ISD::UMAXV;
+ break;
+ case ISD::SMIN:
+ Opcode = AArch64ISD::SMINV;
+ break;
+ case ISD::UMIN:
+ Opcode = AArch64ISD::UMINV;
+ break;
+ case ISD::FMAXNUM:
+ Opcode = Intrinsic::aarch64_neon_fmaxnmv;
+ IsIntrinsic = true;
+ break;
+ case ISD::FMINNUM:
+ Opcode = Intrinsic::aarch64_neon_fminnmv;
+ IsIntrinsic = true;
+ break;
+ }
+ SDLoc DL(N);
+
+ return IsIntrinsic
+ ? DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0),
+ DAG.getConstant(Opcode, DL, MVT::i32), PreOp)
+ : DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
+ DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp),
+ DAG.getConstant(0, DL, MVT::i64));
+}
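To make the matched pattern concrete: each accepted step folds the upper half of the still-active lanes onto the lower half, so after log2(N) steps lane 0 holds the full reduction. A standalone sketch for an 8-lane add reduction (plain C++ mirroring the <NumMaskElts, NumMaskElts+1, ..., u, u> masks checked above, not DAG code):

    #include <cassert>
    #include <cstddef>
    #include <numeric>
    #include <vector>

    int main() {
      std::vector<int> v = {3, 1, 4, 1, 5, 9, 2, 6};
      const int expected = std::accumulate(v.begin(), v.end(), 0);

      // One iteration per step: the "shuffle" brings lanes [half, 2*half) to the
      // front (mask value half + i in lane i), and the "op" combines them in place.
      for (std::size_t half = v.size() / 2; half >= 1; half /= 2)
        for (std::size_t i = 0; i < half; ++i)
          v[i] += v[i + half];        // lanes >= half become don't-care (undef)

      assert(v[0] == expected);       // lane 0 now holds what UADDV would produce
      return 0;
    }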
+
+/// Target-specific DAG combine for the across vector min/max reductions.
+/// This function specifically handles the final clean-up step of the vector
+/// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle
+/// pattern, which narrows down and finds the final min/max value from all
+/// elements of the vector.
+/// For example, for a <16 x i8> vector :
+/// %svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u>
+/// %smax0 = smax %0, %svn0
+/// %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u>
+/// %smax1 = smax %smax0, %svn1
+/// %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+/// %smax2 = smax %smax1, %svn2
+/// %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+/// %sc = setcc %smax2, %svn3, gt
+/// %n0 = extract_vector_elt %sc, #0
+/// %n1 = extract_vector_elt %smax2, #0
+/// %n2 = extract_vector_elt %smax2, #1
+/// %result = select %n0, %n1, %n2
+/// becomes :
+/// %1 = smaxv %0
+/// %result = extract_vector_elt %1, 0
+static SDValue
+performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
+ if (!Subtarget->hasNEON())
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue IfTrue = N->getOperand(1);
+ SDValue IfFalse = N->getOperand(2);
+
+ // Check if the SELECT merges up the final result of the min/max
+ // from a vector.
+ if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ // Expect N0 is fed by SETCC.
+ SDValue SetCC = N0.getOperand(0);
+ EVT SetCCVT = SetCC.getValueType();
+ if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() ||
+ SetCCVT.getVectorElementType() != MVT::i1)
+ return SDValue();
+
+ SDValue VectorOp = SetCC.getOperand(0);
+ unsigned Op = VectorOp->getOpcode();
+ // Check if the input vector is fed by the operator we want to handle.
+ if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN &&
+ Op != ISD::UMIN && Op != ISD::FMAXNUM && Op != ISD::FMINNUM)
+ return SDValue();
+
+ EVT VTy = VectorOp.getValueType();
+ if (!VTy.isVector())
+ return SDValue();
+
+ if (VTy.getSizeInBits() < 64)
+ return SDValue();
+
+ EVT EltTy = VTy.getVectorElementType();
+ if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) {
+ if (EltTy != MVT::f32)
+ return SDValue();
+ } else {
+ if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
+ return SDValue();
+ }
+
+ // Check if extracting from the same vector.
+ // For example,
+ // %sc = setcc %vector, %svn1, gt
+ // %n0 = extract_vector_elt %sc, #0
+ // %n1 = extract_vector_elt %vector, #0
+ // %n2 = extract_vector_elt %vector, #1
+ if (!(VectorOp == IfTrue->getOperand(0) &&
+ VectorOp == IfFalse->getOperand(0)))
+ return SDValue();
+
+ // Check if the condition code is matched with the operator type.
+ ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
+ if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) ||
+ (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) ||
+ (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) ||
+ (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE) ||
+ (Op == ISD::FMAXNUM && CC != ISD::SETOGT && CC != ISD::SETOGE &&
+ CC != ISD::SETUGT && CC != ISD::SETUGE && CC != ISD::SETGT &&
+ CC != ISD::SETGE) ||
+ (Op == ISD::FMINNUM && CC != ISD::SETOLT && CC != ISD::SETOLE &&
+ CC != ISD::SETULT && CC != ISD::SETULE && CC != ISD::SETLT &&
+ CC != ISD::SETLE))
+ return SDValue();
+
+ // Expect to check only lane 0 from the vector SETCC.
+ if (!isNullConstant(N0.getOperand(1)))
+ return SDValue();
+
+ // Expect to extract the true value from lane 0.
+ if (!isNullConstant(IfTrue.getOperand(1)))
+ return SDValue();
+
+ // Expect to extract the false value from lane 1.
+ if (!isOneConstant(IfFalse.getOperand(1)))
+ return SDValue();
+
+ return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG);
+}
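The setcc/select tail recognized here is simply an open-coded max (or min) of the two remaining candidate lanes, which is why the whole chain can collapse into one across-vector instruction. A small behavioural sketch for a signed-max reduction over four lanes (plain C++, illustrative only):

    #include <algorithm>
    #include <cassert>

    int main() {
      int v[4] = {7, -2, 11, 3};

      // The log2-shuffle steps leave the two surviving candidates in lanes 0 and 1.
      int cand[2] = {std::max(v[0], v[2]), std::max(v[1], v[3])};

      // The final clean-up matched above:
      //   %sc = setcc ..., gt ; %result = select %sc[0], lane0, lane1
      int result = (cand[0] > cand[1]) ? cand[0] : cand[1];

      assert(result == *std::max_element(v, v + 4));  // the value a single smaxv yields
      return 0;
    }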
+
+/// Target-specific DAG combine for the across vector add reduction.
+/// This function specifically handles the final clean-up step of the vector
+/// add reduction produced by the LoopVectorizer. It is the log2-shuffle
+/// pattern, which adds all elements of a vector together.
+/// For example, for a <4 x i32> vector :
+/// %1 = vector_shuffle %0, <2,3,u,u>
+/// %2 = add %0, %1
+/// %3 = vector_shuffle %2, <1,u,u,u>
+/// %4 = add %2, %3
+/// %result = extract_vector_elt %4, 0
+/// becomes :
+/// %1 = uaddv %0
+/// %result = extract_vector_elt %1, 0
+static SDValue
+performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
+ if (!Subtarget->hasNEON())
+ return SDValue();
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Check if the input vector is fed by the ADD.
+ if (N0->getOpcode() != ISD::ADD)
+ return SDValue();
+
+ // The vector extract index must be constant zero because we only expect
+ // the final result of the reduction to be placed in lane 0.
+ if (!isNullConstant(N1))
+ return SDValue();
+
+ EVT VTy = N0.getValueType();
+ if (!VTy.isVector())
+ return SDValue();
+
+ EVT EltTy = VTy.getVectorElementType();
+ if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
+ return SDValue();
+
+ if (VTy.getSizeInBits() < 64)
+ return SDValue();
+
+ return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG);
+}
+
/// Target-specific DAG combine function for NEON load/store intrinsics
/// to merge base address updates.
static SDValue performNEONPostLDSTCombine(SDNode *N,
@@ -8751,10 +9468,10 @@ static SDValue performBRCONDCombine(SDNode *N,
if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
return SDValue();
- if (isa<ConstantSDNode>(LHS) && cast<ConstantSDNode>(LHS)->isNullValue())
+ if (isNullConstant(LHS))
std::swap(LHS, RHS);
- if (!isa<ConstantSDNode>(RHS) || !cast<ConstantSDNode>(RHS)->isNullValue())
+ if (!isNullConstant(RHS))
return SDValue();
if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
@@ -8868,75 +9585,6 @@ static SDValue performSelectCombine(SDNode *N,
return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
}
-/// performSelectCCCombine - Target-specific DAG combining for ISD::SELECT_CC
-/// to match FMIN/FMAX patterns.
-static SDValue performSelectCCCombine(SDNode *N, SelectionDAG &DAG) {
- // Try to use FMIN/FMAX instructions for FP selects like "x < y ? x : y".
- // Unless the NoNaNsFPMath option is set, be careful about NaNs:
- // vmax/vmin return NaN if either operand is a NaN;
- // only do the transformation when it matches that behavior.
-
- SDValue CondLHS = N->getOperand(0);
- SDValue CondRHS = N->getOperand(1);
- SDValue LHS = N->getOperand(2);
- SDValue RHS = N->getOperand(3);
- ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
-
- unsigned Opcode;
- bool IsReversed;
- if (selectCCOpsAreFMaxCompatible(CondLHS, LHS) &&
- selectCCOpsAreFMaxCompatible(CondRHS, RHS)) {
- IsReversed = false; // x CC y ? x : y
- } else if (selectCCOpsAreFMaxCompatible(CondRHS, LHS) &&
- selectCCOpsAreFMaxCompatible(CondLHS, RHS)) {
- IsReversed = true ; // x CC y ? y : x
- } else {
- return SDValue();
- }
-
- bool IsUnordered = false, IsOrEqual;
- switch (CC) {
- default:
- return SDValue();
- case ISD::SETULT:
- case ISD::SETULE:
- IsUnordered = true;
- case ISD::SETOLT:
- case ISD::SETOLE:
- case ISD::SETLT:
- case ISD::SETLE:
- IsOrEqual = (CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE);
- Opcode = IsReversed ? AArch64ISD::FMAX : AArch64ISD::FMIN;
- break;
-
- case ISD::SETUGT:
- case ISD::SETUGE:
- IsUnordered = true;
- case ISD::SETOGT:
- case ISD::SETOGE:
- case ISD::SETGT:
- case ISD::SETGE:
- IsOrEqual = (CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE);
- Opcode = IsReversed ? AArch64ISD::FMIN : AArch64ISD::FMAX;
- break;
- }
-
- // If LHS is NaN, an ordered comparison will be false and the result will be
- // the RHS, but FMIN(NaN, RHS) = FMAX(NaN, RHS) = NaN. Avoid this by checking
- // that LHS != NaN. Likewise, for unordered comparisons, check for RHS != NaN.
- if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS))
- return SDValue();
-
- // For xxx-or-equal comparisons, "+0 <= -0" and "-0 >= +0" will both be true,
- // but FMIN will return -0, and FMAX will return +0. So FMIN/FMAX can only be
- // used for unsafe math or if one of the operands is known to be nonzero.
- if (IsOrEqual && !DAG.getTarget().Options.UnsafeFPMath &&
- !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
- return SDValue();
-
- return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), LHS, RHS);
-}
-
/// Get rid of unnecessary NVCASTs (that don't change the type).
static SDValue performNVCASTCombine(SDNode *N) {
if (N->getValueType(0) == N->getOperand(0).getValueType())
@@ -8961,6 +9609,11 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
return performIntToFpCombine(N, DAG, Subtarget);
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ return performFpToIntCombine(N, DAG, Subtarget);
+ case ISD::FDIV:
+ return performFDivCombine(N, DAG, Subtarget);
case ISD::OR:
return performORCombine(N, DCI, Subtarget);
case ISD::INTRINSIC_WO_CHAIN:
@@ -8973,12 +9626,18 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performBitcastCombine(N, DCI, DAG);
case ISD::CONCAT_VECTORS:
return performConcatVectorsCombine(N, DCI, DAG);
- case ISD::SELECT:
- return performSelectCombine(N, DCI);
+ case ISD::SELECT: {
+ SDValue RV = performSelectCombine(N, DCI);
+ if (!RV.getNode())
+ RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget);
+ return RV;
+ }
case ISD::VSELECT:
return performVSelectCombine(N, DCI.DAG);
- case ISD::SELECT_CC:
- return performSelectCCCombine(N, DCI.DAG);
+ case ISD::LOAD:
+ if (performTBISimplification(N->getOperand(1), DCI, DAG))
+ return SDValue(N, 0);
+ break;
case ISD::STORE:
return performSTORECombine(N, DCI, DAG, Subtarget);
case AArch64ISD::BRCOND:
@@ -8991,6 +9650,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performNVCASTCombine(N);
case ISD::INSERT_VECTOR_ELT:
return performPostLD1Combine(N, DCI, true);
+ case ISD::EXTRACT_VECTOR_ELT:
+ return performAcrossLaneAddReductionCombine(N, DAG, Subtarget);
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
@@ -9157,6 +9818,20 @@ static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
}
+static void ReplaceReductionResults(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG, unsigned InterOp,
+ unsigned AcrossOp) {
+ EVT LoVT, HiVT;
+ SDValue Lo, Hi;
+ SDLoc dl(N);
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
+ SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
+ SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
+ Results.push_back(SplitVal);
+}
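ReplaceReductionResults legalizes an over-wide reduction by splitting the operand, combining the halves lane-wise with InterOp, and then reducing the legal-width intermediate with AcrossOp. A standalone sketch of that decomposition for a signed-min reduction (plain C++; the split and min calls stand in for SplitVectorOperand and the SMIN/SMINV nodes):

    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main() {
      // A "too wide" 16-lane operand is split into two 8-lane halves ...
      std::vector<int> wide = {9, 4, 7, 1, 8, 6, 2, 5, 0, 3, 9, 7, 6, 2, 8, 4};
      std::vector<int> lo(wide.begin(), wide.begin() + 8);
      std::vector<int> hi(wide.begin() + 8, wide.end());

      // ... InterOp (ISD::SMIN here) combines the halves lane by lane ...
      std::vector<int> inter(8);
      for (int i = 0; i < 8; ++i)
        inter[i] = std::min(lo[i], hi[i]);

      // ... and AcrossOp (SMINV here) reduces the narrower vector.
      int split_result = *std::min_element(inter.begin(), inter.end());
      assert(split_result == *std::min_element(wide.begin(), wide.end()));
      return 0;
    }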
+
void AArch64TargetLowering::ReplaceNodeResults(
SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
switch (N->getOpcode()) {
@@ -9165,6 +9840,24 @@ void AArch64TargetLowering::ReplaceNodeResults(
case ISD::BITCAST:
ReplaceBITCASTResults(N, Results, DAG);
return;
+ case AArch64ISD::SADDV:
+ ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
+ return;
+ case AArch64ISD::UADDV:
+ ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
+ return;
+ case AArch64ISD::SMINV:
+ ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
+ return;
+ case AArch64ISD::UMINV:
+ ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
+ return;
+ case AArch64ISD::SMAXV:
+ ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
+ return;
+ case AArch64ISD::UMAXV:
+ ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
+ return;
case ISD::FP_TO_UINT:
case ISD::FP_TO_SINT:
assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
@@ -9177,10 +9870,10 @@ bool AArch64TargetLowering::useLoadStackGuardNode() const {
return true;
}
-bool AArch64TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const {
+unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
// Combine multiple FDIVs with the same divisor into multiple FMULs by the
// reciprocal if there are three or more FDIVs.
- return NumUsers > 2;
+ return 3;
}
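The threshold of three reflects the point where one division (to form the reciprocal) plus one multiply per use becomes cheaper than repeating the division. A worked sketch of the rewrite, assuming fast-math-style reassociation is acceptable (without it the rounded results can differ in the last bit):

    #include <cstdio>

    int main() {
      double x = 1.0, y = 2.0, z = 3.0, d = 7.0;

      // Original form: three divisions by the same divisor.
      double a0 = x / d, b0 = y / d, c0 = z / d;

      // Combined form: one division for the reciprocal, then three multiplies.
      double r = 1.0 / d;
      double a1 = x * r, b1 = y * r, c1 = z * r;

      std::printf("%g %g %g vs %g %g %g\n", a0, b0, c0, a1, b1, c1);
      return 0;
    }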
TargetLoweringBase::LegalizeTypeAction
@@ -9206,20 +9899,21 @@ bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
// Loads and stores less than 128-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong.
-bool AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+TargetLowering::AtomicExpansionKind
+AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
unsigned Size = LI->getType()->getPrimitiveSizeInBits();
- return Size == 128;
+ return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
}
// For the real atomic operations, we have ldxr/stxr up to 128 bits.
-TargetLoweringBase::AtomicRMWExpansionKind
+TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
unsigned Size = AI->getType()->getPrimitiveSizeInBits();
- return Size <= 128 ? AtomicRMWExpansionKind::LLSC
- : AtomicRMWExpansionKind::None;
+ return Size <= 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
}
-bool AArch64TargetLowering::hasLoadLinkedStoreConditional() const {
+bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
+ AtomicCmpXchgInst *AI) const {
return true;
}
@@ -9258,6 +9952,13 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
cast<PointerType>(Addr->getType())->getElementType());
}
+void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
+ IRBuilder<> &Builder) const {
+ Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+ Builder.CreateCall(
+ llvm::Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
+}
+
Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
Value *Val, Value *Addr,
AtomicOrdering Ord) const {
@@ -9294,3 +9995,70 @@ bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
return Ty->isArrayTy();
}
+
+bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
+ EVT) const {
+ return false;
+}
+
+Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
+ if (!Subtarget->isTargetAndroid())
+ return TargetLowering::getSafeStackPointerLocation(IRB);
+
+ // Android provides a fixed TLS slot for the SafeStack pointer. See the
+ // definition of TLS_SLOT_SAFESTACK in
+ // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+ const unsigned TlsOffset = 0x48;
+ Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+ Function *ThreadPointerFunc =
+ Intrinsic::getDeclaration(M, Intrinsic::aarch64_thread_pointer);
+ return IRB.CreatePointerCast(
+ IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
+ Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
+}
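In other words, on Android the unsafe-stack pointer lives in a fixed TLS slot 0x48 bytes past the thread pointer instead of in a global. A minimal sketch of the address computation built above, modelled with ordinary pointers (the thread block here is a stand-in, not bionic's real layout):

    #include <cassert>

    int main() {
      // Stand-in for the thread control block that TPIDR_EL0 would point at.
      unsigned char thread_block[256] = {};
      void *thread_pointer = thread_block;

      // GEP of 0x48 bytes past the thread pointer, then treat the slot as i8**.
      const unsigned kTlsSlotSafeStack = 0x48;
      void **safestack_slot = reinterpret_cast<void **>(
          static_cast<unsigned char *>(thread_pointer) + kTlsSlotSafeStack);

      *safestack_slot = &thread_block[128];  // the runtime would store the stack top here
      assert(*safestack_slot == &thread_block[128]);
      return 0;
    }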
+
+void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
+ // Update IsSplitCSR in AArch64FunctionInfo.
+ AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
+ AFI->setIsSplitCSR(true);
+}
+
+void AArch64TargetLowering::insertCopiesSplitCSR(
+ MachineBasicBlock *Entry,
+ const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
+ const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
+ if (!IStart)
+ return;
+
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
+ for (const MCPhysReg *I = IStart; *I; ++I) {
+ const TargetRegisterClass *RC = nullptr;
+ if (AArch64::GPR64RegClass.contains(*I))
+ RC = &AArch64::GPR64RegClass;
+ else if (AArch64::FPR64RegClass.contains(*I))
+ RC = &AArch64::FPR64RegClass;
+ else
+ llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+
+ unsigned NewVR = MRI->createVirtualRegister(RC);
+ // Create copy from CSR to a virtual register.
+ // FIXME: this currently does not emit CFI pseudo-instructions, it works
+ // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
+ // nounwind. If we want to generalize this later, we may need to emit
+ // CFI pseudo-instructions.
+ assert(Entry->getParent()->getFunction()->hasFnAttribute(
+ Attribute::NoUnwind) &&
+ "Function should be nounwind in insertCopiesSplitCSR!");
+ Entry->addLiveIn(*I);
+ BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY),
+ NewVR)
+ .addReg(*I);
+
+ for (auto *Exit : Exits)
+ BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY),
+ *I)
+ .addReg(NewVR);
+ }
+}
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index c73ce1e54b3e..e99616c94068 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -15,6 +15,7 @@
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H
+#include "AArch64.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/CallingConv.h"
@@ -58,13 +59,14 @@ enum NodeType : unsigned {
SBCS,
ANDS,
+ // Conditional compares. Operands: left, right, falsecc, cc, flags
+ CCMP,
+ CCMN,
+ FCCMP,
+
// Floating point comparison
FCMP,
- // Floating point max and min instructions.
- FMAX,
- FMIN,
-
// Scalar extract
EXTR,
@@ -217,8 +219,6 @@ class AArch64Subtarget;
class AArch64TargetMachine;
class AArch64TargetLowering : public TargetLowering {
- bool RequireStrictAlign;
-
public:
explicit AArch64TargetLowering(const TargetMachine &TM,
const AArch64Subtarget &STI);
@@ -226,46 +226,35 @@ public:
/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
- /// computeKnownBitsForTargetNode - Determine which of the bits specified in
- /// Mask are known to be either zero or one and return them in the
- /// KnownZero/KnownOne bitsets.
+ /// Determine which of the bits specified in Mask are known to be either zero
+ /// or one and return them in the KnownZero/KnownOne bitsets.
void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero,
APInt &KnownOne, const SelectionDAG &DAG,
unsigned Depth = 0) const override;
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override;
- /// allowsMisalignedMemoryAccesses - Returns true if the target allows
- /// unaligned memory accesses of the specified type.
+ /// Returns true if the target allows unaligned memory accesses of the
+ /// specified type.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0,
unsigned Align = 1,
- bool *Fast = nullptr) const override {
- if (RequireStrictAlign)
- return false;
- // FIXME: True for Cyclone, but not necessary others.
- if (Fast)
- *Fast = true;
- return true;
- }
+ bool *Fast = nullptr) const override;
- /// LowerOperation - Provide custom lowering hooks for some operations.
+ /// Provide custom lowering hooks for some operations.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
const char *getTargetNodeName(unsigned Opcode) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
- /// getFunctionAlignment - Return the Log2 alignment of this function.
- unsigned getFunctionAlignment(const Function *F) const;
-
/// Returns true if a cast between SrcAS and DestAS is a noop.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
// Addrspacecasts are always noops.
return true;
}
- /// createFastISel - This method returns a target specific FastISel object,
- /// or null if the target does not support "fast" ISel.
+ /// This method returns a target specific FastISel object, or null if the
+ /// target does not support "fast" ISel.
FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) const override;
@@ -273,11 +262,11 @@ public:
bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
- /// isShuffleMaskLegal - Return true if the given shuffle mask can be
- /// codegen'd directly, or if it should be stack expanded.
+ /// Return true if the given shuffle mask can be codegen'd directly, or if it
+ /// should be stack expanded.
bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const override;
- /// getSetCCResultType - Return the ISD::SETCC ValueType
+ /// Return the ISD::SETCC ValueType.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
@@ -322,8 +311,8 @@ public:
bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
MachineFunction &MF) const override;
- /// isLegalAddressingMode - Return true if the addressing mode represented
- /// by AM is legal for this target, for a load/store of the specified type.
+ /// Return true if the addressing mode represented by AM is legal for this
+ /// target, for a load/store of the specified type.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS) const override;
@@ -335,10 +324,9 @@ public:
int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS) const override;
- /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
- /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
- /// expanded to FMAs when this method returns true, otherwise fmuladd is
- /// expanded to fmul + fadd.
+ /// Return true if an FMA operation is faster than a pair of fmul and fadd
+ /// instructions. fmuladd intrinsics will be expanded to FMAs when this method
+ /// returns true, otherwise fmuladd is expanded to fmul + fadd.
bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
@@ -351,25 +339,65 @@ public:
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
- bool hasLoadLinkedStoreConditional() const override;
Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const override;
Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
Value *Addr, AtomicOrdering Ord) const override;
- bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
+ void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override;
+
+ TargetLoweringBase::AtomicExpansionKind
+ shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
- TargetLoweringBase::AtomicRMWExpansionKind
+ TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+ bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
+
bool useLoadStackGuardNode() const override;
TargetLoweringBase::LegalizeTypeAction
getPreferredVectorAction(EVT VT) const override;
+ /// If the target has a standard location for the unsafe stack pointer,
+ /// returns the address of that location. Otherwise, returns nullptr.
+ Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
+
+ /// If a physical register, this returns the register that receives the
+ /// exception address on entry to an EH pad.
+ unsigned
+ getExceptionPointerRegister(const Constant *PersonalityFn) const override {
+ // FIXME: This is a guess. Has this been defined yet?
+ return AArch64::X0;
+ }
+
+ /// If a physical register, this returns the register that receives the
+ /// exception typeid on entry to a landing pad.
+ unsigned
+ getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
+ // FIXME: This is a guess. Has this been defined yet?
+ return AArch64::X1;
+ }
+
+ bool isCheapToSpeculateCttz() const override {
+ return true;
+ }
+
+ bool isCheapToSpeculateCtlz() const override {
+ return true;
+ }
+ bool supportSplitCSR(MachineFunction *MF) const override {
+ return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
+ MF->getFunction()->hasFnAttribute(Attribute::NoUnwind);
+ }
+ void initializeSplitCSR(MachineBasicBlock *Entry) const override;
+ void insertCopiesSplitCSR(
+ MachineBasicBlock *Entry,
+ const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+
private:
bool isExtFreeImpl(const Instruction *Ext) const override;
- /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
+ /// Keep a pointer to the AArch64Subtarget around so that we can
/// make the right decision when generating code for different targets.
const AArch64Subtarget *Subtarget;
@@ -392,6 +420,8 @@ private:
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
bool isThisReturn, SDValue ThisVal) const;
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+
bool isEligibleForTailCallOptimization(
SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
bool isCalleeStructRet, bool isCallerStructRet,
@@ -470,7 +500,7 @@ private:
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
std::vector<SDNode *> *Created) const override;
- bool combineRepeatedFPDivisors(unsigned NumUsers) const override;
+ unsigned combineRepeatedFPDivisors() const override;
ConstraintType getConstraintType(StringRef Constraint) const override;
unsigned getRegisterByName(const char* RegName, EVT VT,
@@ -516,6 +546,8 @@ private:
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty,
CallingConv::ID CallConv,
bool isVarArg) const override;
+
+ bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override;
};
namespace AArch64 {
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index 3f2e772a90c4..6ac2175e5035 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -248,6 +248,12 @@ def simm7s16 : Operand<i32> {
let PrintMethod = "printImmScale<16>";
}
+def am_indexed7s8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S8", []>;
+def am_indexed7s16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S16", []>;
+def am_indexed7s32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S32", []>;
+def am_indexed7s64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S64", []>;
+def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;
+
class AsmImmRange<int Low, int High> : AsmOperandClass {
let Name = "Imm" # Low # "_" # High;
let DiagnosticType = "InvalidImm" # Low # "_" # High;
@@ -346,9 +352,11 @@ class fixedpoint_i64<ValueType FloatVT>
let ParserMatchClass = Imm1_64Operand;
}
+def fixedpoint_f16_i32 : fixedpoint_i32<f16>;
def fixedpoint_f32_i32 : fixedpoint_i32<f32>;
def fixedpoint_f64_i32 : fixedpoint_i32<f64>;
+def fixedpoint_f16_i64 : fixedpoint_i64<f16>;
def fixedpoint_f32_i64 : fixedpoint_i64<f32>;
def fixedpoint_f64_i64 : fixedpoint_i64<f64>;
@@ -402,6 +410,7 @@ def vecshiftR64Narrow : Operand<i32>, ImmLeaf<i32, [{
let ParserMatchClass = Imm1_32Operand;
}
+def Imm0_1Operand : AsmImmRange<0, 1>;
def Imm0_7Operand : AsmImmRange<0, 7>;
def Imm0_15Operand : AsmImmRange<0, 15>;
def Imm0_31Operand : AsmImmRange<0, 31>;
@@ -525,6 +534,20 @@ def imm0_31 : Operand<i64>, ImmLeaf<i64, [{
let ParserMatchClass = Imm0_31Operand;
}
+// True if the 32-bit immediate is in the range [0,31]
+def imm32_0_31 : Operand<i32>, ImmLeaf<i32, [{
+ return ((uint64_t)Imm) < 32;
+}]> {
+ let ParserMatchClass = Imm0_31Operand;
+}
+
+// imm0_1 predicate - True if the immediate is in the range [0,1]
+def imm0_1 : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 2;
+}]> {
+ let ParserMatchClass = Imm0_1Operand;
+}
+
// imm0_15 predicate - True if the immediate is in the range [0,15]
def imm0_15 : Operand<i64>, ImmLeaf<i64, [{
return ((uint64_t)Imm) < 16;
@@ -542,7 +565,9 @@ def imm0_7 : Operand<i64>, ImmLeaf<i64, [{
// imm32_0_15 predicate - True if the 32-bit immediate is in the range [0,15]
def imm32_0_15 : Operand<i32>, ImmLeaf<i32, [{
return ((uint32_t)Imm) < 16;
-}]>;
+}]> {
+ let ParserMatchClass = Imm0_15Operand;
+}
// An arithmetic shifter operand:
// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr
@@ -690,6 +715,17 @@ class arith_extended_reg32to64<ValueType Ty> : Operand<Ty>,
}
// Floating-point immediate.
+def fpimm16 : Operand<f16>,
+ PatLeaf<(f16 fpimm), [{
+ return AArch64_AM::getFP16Imm(N->getValueAPF()) != -1;
+ }], SDNodeXForm<fpimm, [{
+ APFloat InVal = N->getValueAPF();
+ uint32_t enc = AArch64_AM::getFP16Imm(InVal);
+ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
+ }]>> {
+ let ParserMatchClass = FPImmOperand;
+ let PrintMethod = "printFPImmOperand";
+}
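fpimm16 only matches constants representable as the 8-bit AArch64 FP immediate, i.e. values for which the encoder does not return -1 (the authoritative encoder is AArch64_AM::getFP16Imm). As a rough guide to which half-precision values qualify, here is a sketch of the expansion direction as I understand the architectural rule for 16-bit floats (sign = a, exponent = NOT(b):b:b:c:d, fraction = efgh followed by six zero bits); treat the exact rule as an assumption, and the helper name as illustrative:

    #include <cstdint>
    #include <cstdio>

    // Expand an 8-bit immediate abcdefgh into an IEEE half-precision bit pattern.
    uint16_t expandFP16Imm(uint8_t imm8) {
      uint16_t a    = (imm8 >> 7) & 1;
      uint16_t b    = (imm8 >> 6) & 1;
      uint16_t cd   = (imm8 >> 4) & 3;
      uint16_t efgh = imm8 & 0xF;
      uint16_t exp  = static_cast<uint16_t>(((b ^ 1) << 4) | (b << 3) | (b << 2) | cd);
      uint16_t frac = static_cast<uint16_t>(efgh << 6);
      return static_cast<uint16_t>((a << 15) | (exp << 10) | frac);
    }

    int main() {
      // imm8 = 0x70 should expand to 0x3c00, i.e. 1.0 in half precision.
      std::printf("0x%04x\n", expandFP16Imm(0x70));
      return 0;
    }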
def fpimm32 : Operand<f32>,
PatLeaf<(f32 fpimm), [{
return AArch64_AM::getFP32Imm(N->getValueAPF()) != -1;
@@ -822,7 +858,7 @@ class RtSystemI<bit L, dag oops, dag iops, string asm, string operands>
// model patterns with sufficiently fine granularity
let mayStore = 1, mayLoad = 1, hasSideEffects = 1 in
class HintI<string mnemonic>
- : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#" $imm", "",
+ : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#"\t$imm", "",
[(int_aarch64_hint imm0_127:$imm)]>,
Sched<[WriteHint]> {
bits <7> imm;
@@ -875,6 +911,25 @@ def msr_sysreg_op : Operand<i32> {
let PrintMethod = "printMSRSystemRegister";
}
+def PSBHintOperand : AsmOperandClass {
+ let Name = "PSBHint";
+ let ParserMethod = "tryParsePSBHint";
+}
+def psbhint_op : Operand<i32> {
+ let ParserMatchClass = PSBHintOperand;
+ let PrintMethod = "printPSBHintOp";
+ let MCOperandPredicate = [{
+ // Check if the operand is valid, to fix exhaustive aliasing in disassembly.
+ // "psb" is an alias to "hint" only for certain values of CRm:Op2 fields.
+ if (!MCOp.isImm())
+ return false;
+ bool ValidNamed;
+ (void)AArch64PSBHint::PSBHintMapper().toString(MCOp.getImm(),
+ STI.getFeatureBits(), ValidNamed);
+ return ValidNamed;
+ }];
+}
+
class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg),
"mrs", "\t$Rt, $systemreg"> {
bits<16> systemreg;
@@ -890,19 +945,19 @@ class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt),
let Inst{20-5} = systemreg;
}
-def SystemPStateFieldOperand : AsmOperandClass {
- let Name = "SystemPStateField";
+def SystemPStateFieldWithImm0_15Operand : AsmOperandClass {
+ let Name = "SystemPStateFieldWithImm0_15";
let ParserMethod = "tryParseSysReg";
}
-def pstatefield_op : Operand<i32> {
- let ParserMatchClass = SystemPStateFieldOperand;
+def pstatefield4_op : Operand<i32> {
+ let ParserMatchClass = SystemPStateFieldWithImm0_15Operand;
let PrintMethod = "printSystemPStateField";
}
let Defs = [NZCV] in
-class MSRpstateI
- : SimpleSystemI<0, (ins pstatefield_op:$pstate_field, imm0_15:$imm),
- "msr", "\t$pstate_field, $imm">,
+class MSRpstateImm0_15
+ : SimpleSystemI<0, (ins pstatefield4_op:$pstatefield, imm0_15:$imm),
+ "msr", "\t$pstatefield, $imm">,
Sched<[WriteSys]> {
bits<6> pstatefield;
bits<4> imm;
@@ -913,6 +968,37 @@ class MSRpstateI
let Inst{7-5} = pstatefield{2-0};
let DecoderMethod = "DecodeSystemPStateInstruction";
+ // MSRpstateImm0_15 aliases with MSRI. When its decoder method returns Fail,
+ // the decoder should attempt to decode the instruction as MSRI.
+ let hasCompleteDecoder = 0;
+}
+
+def SystemPStateFieldWithImm0_1Operand : AsmOperandClass {
+ let Name = "SystemPStateFieldWithImm0_1";
+ let ParserMethod = "tryParseSysReg";
+}
+def pstatefield1_op : Operand<i32> {
+ let ParserMatchClass = SystemPStateFieldWithImm0_1Operand;
+ let PrintMethod = "printSystemPStateField";
+}
+
+let Defs = [NZCV] in
+class MSRpstateImm0_1
+ : SimpleSystemI<0, (ins pstatefield1_op:$pstatefield, imm0_1:$imm),
+ "msr", "\t$pstatefield, $imm">,
+ Sched<[WriteSys]> {
+ bits<6> pstatefield;
+ bit imm;
+ let Inst{20-19} = 0b00;
+ let Inst{18-16} = pstatefield{5-3};
+ let Inst{15-9} = 0b0100000;
+ let Inst{8} = imm;
+ let Inst{7-5} = pstatefield{2-0};
+
+ let DecoderMethod = "DecodeSystemPStateInstruction";
+ // MSRpstateImm0_1 aliases with MSRI. When its decoder method returns Fail,
+ // the decoder should attempt to decode the instruction as MSRI.
+ let hasCompleteDecoder = 0;
}
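Only the fields spelled out in the record above are packed in the following sketch; the remaining bits of the word come from the SimpleSystemI base class, which this hunk does not show, so they are left at zero here and the helper name is purely illustrative:

    #include <cassert>
    #include <cstdint>

    // Pack the MSRpstateImm0_1 fields listed above into an otherwise-zero word.
    uint32_t packMSRpstateImm0_1Fields(uint8_t pstatefield /*6 bits*/, uint8_t imm /*1 bit*/) {
      uint32_t inst = 0;                                              // Inst{20-19} = 0b00
      inst |= static_cast<uint32_t>((pstatefield >> 3) & 0x7) << 16;  // Inst{18-16}
      inst |= 0x20u << 9;                                             // Inst{15-9} = 0b0100000
      inst |= static_cast<uint32_t>(imm & 1) << 8;                    // Inst{8}
      inst |= static_cast<uint32_t>(pstatefield & 0x7) << 5;          // Inst{7-5}
      return inst;
    }

    int main() {
      // pstatefield = 0b011011 splits into 011 -> Inst{18-16} and 011 -> Inst{7-5}.
      uint32_t w = packMSRpstateImm0_1Fields(0x1B, 1);
      assert(((w >> 16) & 0x7) == 0x3 && ((w >> 5) & 0x7) == 0x3 && ((w >> 8) & 1) == 1);
      return 0;
    }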
// SYS and SYSL generic system instructions.
@@ -1341,7 +1427,7 @@ multiclass Shift<bits<2> shift_type, string asm, SDNode OpNode> {
}
class ShiftAlias<string asm, Instruction inst, RegisterClass regtype>
- : InstAlias<asm#" $dst, $src1, $src2",
+ : InstAlias<asm#"\t$dst, $src1, $src2",
(inst regtype:$dst, regtype:$src1, regtype:$src2), 0>;
class BaseMulAccum<bit isSub, bits<3> opc, RegisterClass multype,
@@ -1407,13 +1493,13 @@ class MulHi<bits<3> opc, string asm, SDNode OpNode>
}
class MulAccumWAlias<string asm, Instruction inst>
- : InstAlias<asm#" $dst, $src1, $src2",
+ : InstAlias<asm#"\t$dst, $src1, $src2",
(inst GPR32:$dst, GPR32:$src1, GPR32:$src2, WZR)>;
class MulAccumXAlias<string asm, Instruction inst>
- : InstAlias<asm#" $dst, $src1, $src2",
+ : InstAlias<asm#"\t$dst, $src1, $src2",
(inst GPR64:$dst, GPR64:$src1, GPR64:$src2, XZR)>;
class WideMulAccumAlias<string asm, Instruction inst>
- : InstAlias<asm#" $dst, $src1, $src2",
+ : InstAlias<asm#"\t$dst, $src1, $src2",
(inst GPR64:$dst, GPR32:$src1, GPR32:$src2, XZR)>;
class BaseCRC32<bit sf, bits<2> sz, bit C, RegisterClass StreamReg,
@@ -1643,7 +1729,7 @@ class BaseAddSubEReg64<bit isSub, bit setFlags, RegisterClass dstRegtype,
class AddSubRegAlias<string asm, Instruction inst, RegisterClass dstRegtype,
RegisterClass src1Regtype, RegisterClass src2Regtype,
int shiftExt>
- : InstAlias<asm#" $dst, $src1, $src2",
+ : InstAlias<asm#"\t$dst, $src1, $src2",
(inst dstRegtype:$dst, src1Regtype:$src1, src2Regtype:$src2,
shiftExt)>;
@@ -1701,10 +1787,10 @@ multiclass AddSub<bit isSub, string mnemonic, string alias,
}
// add Rd, Rb, -imm -> sub Rd, Rn, imm
- def : InstAlias<alias#" $Rd, $Rn, $imm",
+ def : InstAlias<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32sp:$Rn,
addsub_shifted_imm32_neg:$imm), 0>;
- def : InstAlias<alias#" $Rd, $Rn, $imm",
+ def : InstAlias<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64sp:$Rn,
addsub_shifted_imm64_neg:$imm), 0>;
@@ -1776,43 +1862,43 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp,
} // Defs = [NZCV]
// Support negative immediates, e.g. adds Rd, Rn, -imm -> subs Rd, Rn, imm
- def : InstAlias<alias#" $Rd, $Rn, $imm",
+ def : InstAlias<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32sp:$Rn,
addsub_shifted_imm32_neg:$imm), 0>;
- def : InstAlias<alias#" $Rd, $Rn, $imm",
+ def : InstAlias<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64sp:$Rn,
addsub_shifted_imm64_neg:$imm), 0>;
// Compare aliases
- def : InstAlias<cmp#" $src, $imm", (!cast<Instruction>(NAME#"Wri")
+ def : InstAlias<cmp#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri")
WZR, GPR32sp:$src, addsub_shifted_imm32:$imm), 5>;
- def : InstAlias<cmp#" $src, $imm", (!cast<Instruction>(NAME#"Xri")
+ def : InstAlias<cmp#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri")
XZR, GPR64sp:$src, addsub_shifted_imm64:$imm), 5>;
- def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Wrx")
+ def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrx")
WZR, GPR32sp:$src1, GPR32:$src2, arith_extend:$sh), 4>;
- def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx")
+ def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx")
XZR, GPR64sp:$src1, GPR32:$src2, arith_extend:$sh), 4>;
- def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx64")
+ def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx64")
XZR, GPR64sp:$src1, GPR64:$src2, arith_extendlsl64:$sh), 4>;
- def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Wrs")
+ def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrs")
WZR, GPR32:$src1, GPR32:$src2, arith_shift32:$sh), 4>;
- def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrs")
+ def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrs")
XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>;
// Support negative immediates, e.g. cmp Rn, -imm -> cmn Rn, imm
- def : InstAlias<cmpAlias#" $src, $imm", (!cast<Instruction>(NAME#"Wri")
+ def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri")
WZR, GPR32sp:$src, addsub_shifted_imm32_neg:$imm), 0>;
- def : InstAlias<cmpAlias#" $src, $imm", (!cast<Instruction>(NAME#"Xri")
+ def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri")
XZR, GPR64sp:$src, addsub_shifted_imm64_neg:$imm), 0>;
// Compare shorthands
- def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Wrs")
+ def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrs")
WZR, GPR32:$src1, GPR32:$src2, 0), 5>;
- def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Xrs")
+ def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrs")
XZR, GPR64:$src1, GPR64:$src2, 0), 5>;
- def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Wrx")
+ def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrx")
WZR, GPR32sponly:$src1, GPR32:$src2, 16), 5>;
- def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Xrx64")
+ def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrx64")
XZR, GPR64sponly:$src1, GPR64:$src2, 24), 5>;
// Register/register aliases with no shift when SP is not used.
@@ -1998,7 +2084,7 @@ class BaseLogicalSReg<bits<2> opc, bit N, RegisterClass regtype,
// Aliases for register+register logical instructions.
class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype>
- : InstAlias<asm#" $dst, $src1, $src2",
+ : InstAlias<asm#"\t$dst, $src1, $src2",
(inst regtype:$dst, regtype:$src1, regtype:$src2, 0)>;
multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode,
@@ -2017,10 +2103,10 @@ multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode,
let Inst{31} = 1;
}
- def : InstAlias<Alias # " $Rd, $Rn, $imm",
+ def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32:$Rn,
logical_imm32_not:$imm), 0>;
- def : InstAlias<Alias # " $Rd, $Rn, $imm",
+ def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64:$Rn,
logical_imm64_not:$imm), 0>;
}
@@ -2039,10 +2125,10 @@ multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode,
}
} // end Defs = [NZCV]
- def : InstAlias<Alias # " $Rd, $Rn, $imm",
+ def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32:$Rn,
logical_imm32_not:$imm), 0>;
- def : InstAlias<Alias # " $Rd, $Rn, $imm",
+ def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64:$Rn,
logical_imm64_not:$imm), 0>;
}
@@ -2105,9 +2191,12 @@ multiclass LogicalRegS<bits<2> opc, bit N, string mnemonic,
//---
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseCondSetFlagsImm<bit op, RegisterClass regtype, string asm>
- : I<(outs), (ins regtype:$Rn, imm0_31:$imm, imm0_15:$nzcv, ccode:$cond),
- asm, "\t$Rn, $imm, $nzcv, $cond", "", []>,
+class BaseCondComparisonImm<bit op, RegisterClass regtype, ImmLeaf immtype,
+ string mnemonic, SDNode OpNode>
+ : I<(outs), (ins regtype:$Rn, immtype:$imm, imm32_0_15:$nzcv, ccode:$cond),
+ mnemonic, "\t$Rn, $imm, $nzcv, $cond", "",
+ [(set NZCV, (OpNode regtype:$Rn, immtype:$imm, (i32 imm:$nzcv),
+ (i32 imm:$cond), NZCV))]>,
Sched<[WriteI, ReadI]> {
let Uses = [NZCV];
let Defs = [NZCV];
@@ -2127,19 +2216,13 @@ class BaseCondSetFlagsImm<bit op, RegisterClass regtype, string asm>
let Inst{3-0} = nzcv;
}
-multiclass CondSetFlagsImm<bit op, string asm> {
- def Wi : BaseCondSetFlagsImm<op, GPR32, asm> {
- let Inst{31} = 0;
- }
- def Xi : BaseCondSetFlagsImm<op, GPR64, asm> {
- let Inst{31} = 1;
- }
-}
-
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseCondSetFlagsReg<bit op, RegisterClass regtype, string asm>
- : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond),
- asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>,
+class BaseCondComparisonReg<bit op, RegisterClass regtype, string mnemonic,
+ SDNode OpNode>
+ : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond),
+ mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "",
+ [(set NZCV, (OpNode regtype:$Rn, regtype:$Rm, (i32 imm:$nzcv),
+ (i32 imm:$cond), NZCV))]>,
Sched<[WriteI, ReadI, ReadI]> {
let Uses = [NZCV];
let Defs = [NZCV];
@@ -2159,11 +2242,19 @@ class BaseCondSetFlagsReg<bit op, RegisterClass regtype, string asm>
let Inst{3-0} = nzcv;
}
-multiclass CondSetFlagsReg<bit op, string asm> {
- def Wr : BaseCondSetFlagsReg<op, GPR32, asm> {
+multiclass CondComparison<bit op, string mnemonic, SDNode OpNode> {
+ // immediate operand variants
+ def Wi : BaseCondComparisonImm<op, GPR32, imm32_0_31, mnemonic, OpNode> {
let Inst{31} = 0;
}
- def Xr : BaseCondSetFlagsReg<op, GPR64, asm> {
+ def Xi : BaseCondComparisonImm<op, GPR64, imm0_31, mnemonic, OpNode> {
+ let Inst{31} = 1;
+ }
+ // register operand variants
+ def Wr : BaseCondComparisonReg<op, GPR32, mnemonic, OpNode> {
+ let Inst{31} = 0;
+ }
+ def Xr : BaseCondComparisonReg<op, GPR64, mnemonic, OpNode> {
let Inst{31} = 1;
}
}
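The $nzcv operand in these patterns is the "falsecc" value from the CCMP node description: the flags the conditional compare yields when the incoming condition does not hold; when it does hold, NZCV comes from the actual comparison. A behavioural sketch of that selection for a signed compare (plain C++; only the N and Z bits are modelled, which is enough to show the shape):

    #include <cassert>
    #include <cstdint>

    // NZCV packed as N:Z:C:V in bits 3..0; only N and Z are modelled here.
    uint8_t flagsOfSignedCompare(int64_t lhs, int64_t rhs) {
      int64_t d = lhs - rhs;
      return static_cast<uint8_t>(((d < 0) << 3) | ((d == 0) << 2));
    }

    // ccmp lhs, rhs, #falseNZCV, cond : NZCV = cond ? compare(lhs, rhs) : falseNZCV
    uint8_t ccmp(int64_t lhs, int64_t rhs, uint8_t falseNZCV, bool condHolds) {
      return condHolds ? flagsOfSignedCompare(lhs, rhs) : falseNZCV;
    }

    int main() {
      assert(ccmp(5, 5, /*falseNZCV=*/0x0, /*condHolds=*/true) == 0x4);   // Z set by the compare
      assert(ccmp(5, 5, /*falseNZCV=*/0x0, /*condHolds=*/false) == 0x0);  // immediate flags used
      return 0;
    }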
@@ -2328,7 +2419,7 @@ multiclass LoadUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
asm, pattern>,
Sched<[WriteLD]>;
- def : InstAlias<asm # " $Rt, [$Rn]",
+ def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
@@ -2340,7 +2431,7 @@ multiclass StoreUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
asm, pattern>,
Sched<[WriteST]>;
- def : InstAlias<asm # " $Rt, [$Rn]",
+ def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
@@ -2508,7 +2599,7 @@ class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
}
class ROInstAlias<string asm, RegisterClass regtype, Instruction INST>
- : InstAlias<asm # " $Rt, [$Rn, $Rm]",
+ : InstAlias<asm # "\t$Rt, [$Rn, $Rm]",
(INST regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, 0, 0)>;
multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
@@ -2934,7 +3025,7 @@ multiclass LoadUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
(ins GPR64sp:$Rn, simm9:$offset), asm, pattern>,
Sched<[WriteLD]>;
- def : InstAlias<asm # " $Rt, [$Rn]",
+ def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
@@ -2946,7 +3037,7 @@ multiclass StoreUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
asm, pattern>,
Sched<[WriteST]>;
- def : InstAlias<asm # " $Rt, [$Rn]",
+ def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
@@ -2958,7 +3049,7 @@ multiclass PrefetchUnscaled<bits<2> sz, bit V, bits<2> opc, string asm,
asm, pat>,
Sched<[WriteLD]>;
- def : InstAlias<asm # " $Rt, [$Rn]",
+ def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "i") prfop:$Rt, GPR64sp:$Rn, 0)>;
}
@@ -2993,7 +3084,7 @@ multiclass LoadUnprivileged<bits<2> sz, bit V, bits<2> opc,
(ins GPR64sp:$Rn, simm9:$offset), asm>,
Sched<[WriteLD]>;
- def : InstAlias<asm # " $Rt, [$Rn]",
+ def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
@@ -3005,7 +3096,7 @@ multiclass StoreUnprivileged<bits<2> sz, bit V, bits<2> opc,
asm>,
Sched<[WriteST]>;
- def : InstAlias<asm # " $Rt, [$Rn]",
+ def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
@@ -3136,7 +3227,7 @@ multiclass LoadPairOffset<bits<2> opc, bit V, RegisterClass regtype,
(ins GPR64sp:$Rn, indextype:$offset), asm>,
Sched<[WriteLD, WriteLDHi]>;
- def : InstAlias<asm # " $Rt, $Rt2, [$Rn]",
+ def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
GPR64sp:$Rn, 0)>;
}
@@ -3151,7 +3242,7 @@ multiclass StorePairOffset<bits<2> opc, bit V, RegisterClass regtype,
asm>,
Sched<[WriteSTP]>;
- def : InstAlias<asm # " $Rt, $Rt2, [$Rn]",
+ def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
GPR64sp:$Rn, 0)>;
}
@@ -3230,8 +3321,8 @@ class LoadPairPostIdx<bits<2> opc, bit V, RegisterClass regtype,
let mayStore = 1, mayLoad = 0 in
class StorePairPostIdx<bits<2> opc, bit V, RegisterClass regtype,
Operand idxtype, string asm>
- : BaseLoadStorePairPostIdx<opc, V, 0, (outs),
- (ins GPR64sp:$wback, regtype:$Rt, regtype:$Rt2,
+ : BaseLoadStorePairPostIdx<opc, V, 0, (outs GPR64sp:$wback),
+ (ins regtype:$Rt, regtype:$Rt2,
GPR64sp:$Rn, idxtype:$offset),
asm>,
Sched<[WriteAdr, WriteSTP]>;
@@ -3477,6 +3568,20 @@ class BaseFPToInteger<bits<2> type, bits<2> rmode, bits<3> opcode,
multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm,
SDPatternOperator OpN> {
+ // Unscaled half-precision to 32-bit
+ def UWHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR32, asm,
+ [(set GPR32:$Rd, (OpN FPR16:$Rn))]> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Predicates = [HasFullFP16];
+ }
+
+ // Unscaled half-precision to 64-bit
+ def UXHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR64, asm,
+ [(set GPR64:$Rd, (OpN FPR16:$Rn))]> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Predicates = [HasFullFP16];
+ }
+
// Unscaled single-precision to 32-bit
def UWSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR32, asm,
[(set GPR32:$Rd, (OpN FPR32:$Rn))]> {
@@ -3504,6 +3609,25 @@ multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm,
multiclass FPToIntegerScaled<bits<2> rmode, bits<3> opcode, string asm,
SDPatternOperator OpN> {
+ // Scaled half-precision to 32-bit
+ def SWHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR32,
+ fixedpoint_f16_i32, asm,
+ [(set GPR32:$Rd, (OpN (fmul FPR16:$Rn,
+ fixedpoint_f16_i32:$scale)))]> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let scale{5} = 1;
+ let Predicates = [HasFullFP16];
+ }
+
+ // Scaled half-precision to 64-bit
+ def SXHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR64,
+ fixedpoint_f16_i64, asm,
+ [(set GPR64:$Rd, (OpN (fmul FPR16:$Rn,
+ fixedpoint_f16_i64:$scale)))]> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Predicates = [HasFullFP16];
+ }
+
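The $scale operand here is a fixed-point fractional-bit count: these patterns multiply by 2^fbits before the float-to-int convert, and the int-to-float direction below divides by 2^fbits after the convert. A tiny round-trip sketch of that scaling (plain C++, truncating toward zero the way fcvtz* does; the helper names are made up):

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    // float -> fixed point with 'fbits' fractional bits (the fmul-by-scale side).
    int64_t toFixed(double x, unsigned fbits) {
      return static_cast<int64_t>(std::trunc(x * std::ldexp(1.0, fbits)));
    }

    // fixed point -> float (the convert followed by fdiv-by-scale side).
    double fromFixed(int64_t v, unsigned fbits) {
      return static_cast<double>(v) / std::ldexp(1.0, fbits);
    }

    int main() {
      int64_t q = toFixed(2.75, /*fbits=*/4);  // 2.75 * 16 = 44
      assert(q == 44 && fromFixed(q, 4) == 2.75);
      return 0;
    }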
// Scaled single-precision to 32-bit
def SWSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR32,
fixedpoint_f32_i32, asm,
@@ -3553,7 +3677,7 @@ class BaseIntegerToFP<bit isUnsigned,
bits<5> Rd;
bits<5> Rn;
bits<6> scale;
- let Inst{30-23} = 0b00111100;
+ let Inst{30-24} = 0b0011110;
let Inst{21-17} = 0b00001;
let Inst{16} = isUnsigned;
let Inst{15-10} = scale;
@@ -3570,7 +3694,7 @@ class BaseIntegerToFPUnscaled<bit isUnsigned,
bits<5> Rd;
bits<5> Rn;
bits<6> scale;
- let Inst{30-23} = 0b00111100;
+ let Inst{30-24} = 0b0011110;
let Inst{21-17} = 0b10001;
let Inst{16} = isUnsigned;
let Inst{15-10} = 0b000000;
@@ -3580,33 +3704,55 @@ class BaseIntegerToFPUnscaled<bit isUnsigned,
multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> {
// Unscaled
+ def UWHri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR16, f16, asm, node> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{23-22} = 0b11; // 16-bit FPR flag
+ let Predicates = [HasFullFP16];
+ }
+
def UWSri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR32, f32, asm, node> {
let Inst{31} = 0; // 32-bit GPR flag
- let Inst{22} = 0; // 32-bit FPR flag
+ let Inst{23-22} = 0b00; // 32-bit FPR flag
}
def UWDri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR64, f64, asm, node> {
let Inst{31} = 0; // 32-bit GPR flag
- let Inst{22} = 1; // 64-bit FPR flag
+ let Inst{23-22} = 0b01; // 64-bit FPR flag
+ }
+
+ def UXHri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR16, f16, asm, node> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{23-22} = 0b11; // 16-bit FPR flag
+ let Predicates = [HasFullFP16];
}
def UXSri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR32, f32, asm, node> {
let Inst{31} = 1; // 64-bit GPR flag
- let Inst{22} = 0; // 32-bit FPR flag
+ let Inst{23-22} = 0b00; // 32-bit FPR flag
}
def UXDri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR64, f64, asm, node> {
let Inst{31} = 1; // 64-bit GPR flag
- let Inst{22} = 1; // 64-bit FPR flag
+ let Inst{23-22} = 0b01; // 64-bit FPR flag
}
// Scaled
+ def SWHri: BaseIntegerToFP<isUnsigned, GPR32, FPR16, fixedpoint_f16_i32, asm,
+ [(set FPR16:$Rd,
+ (fdiv (node GPR32:$Rn),
+ fixedpoint_f16_i32:$scale))]> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{23-22} = 0b11; // 16-bit FPR flag
+ let scale{5} = 1;
+ let Predicates = [HasFullFP16];
+ }
+
def SWSri: BaseIntegerToFP<isUnsigned, GPR32, FPR32, fixedpoint_f32_i32, asm,
[(set FPR32:$Rd,
(fdiv (node GPR32:$Rn),
fixedpoint_f32_i32:$scale))]> {
let Inst{31} = 0; // 32-bit GPR flag
- let Inst{22} = 0; // 32-bit FPR flag
+ let Inst{23-22} = 0b00; // 32-bit FPR flag
let scale{5} = 1;
}
@@ -3615,16 +3761,25 @@ multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> {
(fdiv (node GPR32:$Rn),
fixedpoint_f64_i32:$scale))]> {
let Inst{31} = 0; // 32-bit GPR flag
- let Inst{22} = 1; // 64-bit FPR flag
+ let Inst{23-22} = 0b01; // 64-bit FPR flag
let scale{5} = 1;
}
+ def SXHri: BaseIntegerToFP<isUnsigned, GPR64, FPR16, fixedpoint_f16_i64, asm,
+ [(set FPR16:$Rd,
+ (fdiv (node GPR64:$Rn),
+ fixedpoint_f16_i64:$scale))]> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{23-22} = 0b11; // 16-bit FPR flag
+ let Predicates = [HasFullFP16];
+ }
+
def SXSri: BaseIntegerToFP<isUnsigned, GPR64, FPR32, fixedpoint_f32_i64, asm,
[(set FPR32:$Rd,
(fdiv (node GPR64:$Rn),
fixedpoint_f32_i64:$scale))]> {
let Inst{31} = 1; // 64-bit GPR flag
- let Inst{22} = 0; // 32-bit FPR flag
+ let Inst{23-22} = 0b00; // 32-bit FPR flag
}
def SXDri: BaseIntegerToFP<isUnsigned, GPR64, FPR64, fixedpoint_f64_i64, asm,
@@ -3632,7 +3787,7 @@ multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> {
(fdiv (node GPR64:$Rn),
fixedpoint_f64_i64:$scale))]> {
let Inst{31} = 1; // 64-bit GPR flag
- let Inst{22} = 1; // 64-bit FPR flag
+ let Inst{23-22} = 0b01; // 64-bit FPR flag
}
}
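With half precision added, the scalar FP size indicator grows from the single bit 22 into the two-bit field Inst{23-22}: 0b00 single, 0b01 double, 0b11 half, with 0b10 unused in these formats. A trivial mapping sketch just to make the recoding explicit (the enum is illustrative, not an LLVM type):

    #include <cassert>

    enum class FPSize { Single, Double, Half };

    // Inst{23-22} for the scalar FP formats updated above.
    unsigned sizeField(FPSize s) {
      switch (s) {
      case FPSize::Single: return 0x0;  // was bit 22 == 0
      case FPSize::Double: return 0x1;  // was bit 22 == 1
      case FPSize::Half:   return 0x3;  // new 16-bit encoding
      }
      return 0;
    }

    int main() {
      assert(sizeField(FPSize::Half) == 3);
      assert((sizeField(FPSize::Double) & 1) == 1);  // the old bit-22 meaning is preserved
      return 0;
    }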
@@ -3654,7 +3809,7 @@ class BaseUnscaledConversion<bits<2> rmode, bits<3> opcode,
Sched<[WriteFCopy]> {
bits<5> Rd;
bits<5> Rn;
- let Inst{30-23} = 0b00111100;
+ let Inst{30-24} = 0b0011110;
let Inst{21} = 1;
let Inst{20-19} = rmode;
let Inst{18-16} = opcode;
@@ -3704,26 +3859,49 @@ class BaseUnscaledConversionFromHigh<bits<2> rmode, bits<3> opcode,
}
-
multiclass UnscaledConversion<string asm> {
+ def WHr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR16, asm> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{23-22} = 0b11; // 16-bit FPR flag
+ let Predicates = [HasFullFP16];
+ }
+
+ def XHr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR16, asm> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{23-22} = 0b11; // 16-bit FPR flag
+ let Predicates = [HasFullFP16];
+ }
+
def WSr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR32, asm> {
let Inst{31} = 0; // 32-bit GPR flag
- let Inst{22} = 0; // 32-bit FPR flag
+ let Inst{23-22} = 0b00; // 32-bit FPR flag
}
def XDr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR64, asm> {
let Inst{31} = 1; // 64-bit GPR flag
- let Inst{22} = 1; // 64-bit FPR flag
+ let Inst{23-22} = 0b01; // 64-bit FPR flag
+ }
+
+ def HWr : BaseUnscaledConversion<0b00, 0b110, FPR16, GPR32, asm> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{23-22} = 0b11; // 16-bit FPR flag
+ let Predicates = [HasFullFP16];
+ }
+
+ def HXr : BaseUnscaledConversion<0b00, 0b110, FPR16, GPR64, asm> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{23-22} = 0b11; // 16-bit FPR flag
+ let Predicates = [HasFullFP16];
}
def SWr : BaseUnscaledConversion<0b00, 0b110, FPR32, GPR32, asm> {
let Inst{31} = 0; // 32-bit GPR flag
- let Inst{22} = 0; // 32-bit FPR flag
+ let Inst{23-22} = 0b00; // 32-bit FPR flag
}
def DXr : BaseUnscaledConversion<0b00, 0b110, FPR64, GPR64, asm> {
let Inst{31} = 1; // 64-bit GPR flag
- let Inst{22} = 1; // 64-bit FPR flag
+ let Inst{23-22} = 0b01; // 64-bit FPR flag
}
def XDHighr : BaseUnscaledConversionToHigh<0b01, 0b111, GPR64, V128,
@@ -3796,7 +3974,7 @@ class BaseSingleOperandFPData<bits<4> opcode, RegisterClass regtype,
Sched<[WriteF]> {
bits<5> Rd;
bits<5> Rn;
- let Inst{31-23} = 0b000111100;
+ let Inst{31-24} = 0b00011110;
let Inst{21-19} = 0b100;
let Inst{18-15} = opcode;
let Inst{14-10} = 0b10000;
@@ -3806,12 +3984,17 @@ class BaseSingleOperandFPData<bits<4> opcode, RegisterClass regtype,
multiclass SingleOperandFPData<bits<4> opcode, string asm,
SDPatternOperator node = null_frag> {
+ def Hr : BaseSingleOperandFPData<opcode, FPR16, f16, asm, node> {
+ let Inst{23-22} = 0b11; // 16-bit size flag
+ let Predicates = [HasFullFP16];
+ }
+
def Sr : BaseSingleOperandFPData<opcode, FPR32, f32, asm, node> {
- let Inst{22} = 0; // 32-bit size flag
+ let Inst{23-22} = 0b00; // 32-bit size flag
}
def Dr : BaseSingleOperandFPData<opcode, FPR64, f64, asm, node> {
- let Inst{22} = 1; // 64-bit size flag
+ let Inst{23-22} = 0b01; // 64-bit size flag
}
}
@@ -3828,7 +4011,7 @@ class BaseTwoOperandFPData<bits<4> opcode, RegisterClass regtype,
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
- let Inst{31-23} = 0b000111100;
+ let Inst{31-24} = 0b00011110;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15-12} = opcode;
@@ -3839,28 +4022,41 @@ class BaseTwoOperandFPData<bits<4> opcode, RegisterClass regtype,
multiclass TwoOperandFPData<bits<4> opcode, string asm,
SDPatternOperator node = null_frag> {
+ def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm,
+ [(set (f16 FPR16:$Rd),
+ (node (f16 FPR16:$Rn), (f16 FPR16:$Rm)))]> {
+ let Inst{23-22} = 0b11; // 16-bit size flag
+ let Predicates = [HasFullFP16];
+ }
+
def Srr : BaseTwoOperandFPData<opcode, FPR32, asm,
[(set (f32 FPR32:$Rd),
(node (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]> {
- let Inst{22} = 0; // 32-bit size flag
+ let Inst{23-22} = 0b00; // 32-bit size flag
}
def Drr : BaseTwoOperandFPData<opcode, FPR64, asm,
[(set (f64 FPR64:$Rd),
(node (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]> {
- let Inst{22} = 1; // 64-bit size flag
+ let Inst{23-22} = 0b01; // 64-bit size flag
}
}
multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, SDNode node> {
+ def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm,
+ [(set FPR16:$Rd, (fneg (node FPR16:$Rn, (f16 FPR16:$Rm))))]> {
+ let Inst{23-22} = 0b11; // 16-bit size flag
+ let Predicates = [HasFullFP16];
+ }
+
def Srr : BaseTwoOperandFPData<opcode, FPR32, asm,
[(set FPR32:$Rd, (fneg (node FPR32:$Rn, (f32 FPR32:$Rm))))]> {
- let Inst{22} = 0; // 32-bit size flag
+ let Inst{23-22} = 0b00; // 32-bit size flag
}
def Drr : BaseTwoOperandFPData<opcode, FPR64, asm,
[(set FPR64:$Rd, (fneg (node FPR64:$Rn, (f64 FPR64:$Rm))))]> {
- let Inst{22} = 1; // 64-bit size flag
+ let Inst{23-22} = 0b01; // 64-bit size flag
}
}
@@ -3878,7 +4074,7 @@ class BaseThreeOperandFPData<bit isNegated, bit isSub,
bits<5> Rn;
bits<5> Rm;
bits<5> Ra;
- let Inst{31-23} = 0b000111110;
+ let Inst{31-24} = 0b00011111;
let Inst{21} = isNegated;
let Inst{20-16} = Rm;
let Inst{15} = isSub;
@@ -3889,16 +4085,23 @@ class BaseThreeOperandFPData<bit isNegated, bit isSub,
multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
SDPatternOperator node> {
+ def Hrrr : BaseThreeOperandFPData<isNegated, isSub, FPR16, asm,
+ [(set FPR16:$Rd,
+ (node (f16 FPR16:$Rn), (f16 FPR16:$Rm), (f16 FPR16:$Ra)))]> {
+ let Inst{23-22} = 0b11; // 16-bit size flag
+ let Predicates = [HasFullFP16];
+ }
+
def Srrr : BaseThreeOperandFPData<isNegated, isSub, FPR32, asm,
[(set FPR32:$Rd,
(node (f32 FPR32:$Rn), (f32 FPR32:$Rm), (f32 FPR32:$Ra)))]> {
- let Inst{22} = 0; // 32-bit size flag
+ let Inst{23-22} = 0b00; // 32-bit size flag
}
def Drrr : BaseThreeOperandFPData<isNegated, isSub, FPR64, asm,
[(set FPR64:$Rd,
(node (f64 FPR64:$Rn), (f64 FPR64:$Rm), (f64 FPR64:$Ra)))]> {
- let Inst{22} = 1; // 64-bit size flag
+ let Inst{23-22} = 0b01; // 64-bit size flag
}
}
@@ -3913,7 +4116,7 @@ class BaseOneOperandFPComparison<bit signalAllNans,
: I<(outs), (ins regtype:$Rn), asm, "\t$Rn, #0.0", "", pat>,
Sched<[WriteFCmp]> {
bits<5> Rn;
- let Inst{31-23} = 0b000111100;
+ let Inst{31-24} = 0b00011110;
let Inst{21} = 1;
let Inst{15-10} = 0b001000;
@@ -3932,7 +4135,7 @@ class BaseTwoOperandFPComparison<bit signalAllNans, RegisterClass regtype,
Sched<[WriteFCmp]> {
bits<5> Rm;
bits<5> Rn;
- let Inst{31-23} = 0b000111100;
+ let Inst{31-24} = 0b00011110;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15-10} = 0b001000;
@@ -3944,24 +4147,36 @@ class BaseTwoOperandFPComparison<bit signalAllNans, RegisterClass regtype,
multiclass FPComparison<bit signalAllNans, string asm,
SDPatternOperator OpNode = null_frag> {
let Defs = [NZCV] in {
+ def Hrr : BaseTwoOperandFPComparison<signalAllNans, FPR16, asm,
+ [(OpNode FPR16:$Rn, (f16 FPR16:$Rm)), (implicit NZCV)]> {
+ let Inst{23-22} = 0b11;
+ let Predicates = [HasFullFP16];
+ }
+
+ def Hri : BaseOneOperandFPComparison<signalAllNans, FPR16, asm,
+ [(OpNode (f16 FPR16:$Rn), fpimm0), (implicit NZCV)]> {
+ let Inst{23-22} = 0b11;
+ let Predicates = [HasFullFP16];
+ }
+
def Srr : BaseTwoOperandFPComparison<signalAllNans, FPR32, asm,
[(OpNode FPR32:$Rn, (f32 FPR32:$Rm)), (implicit NZCV)]> {
- let Inst{22} = 0;
+ let Inst{23-22} = 0b00;
}
def Sri : BaseOneOperandFPComparison<signalAllNans, FPR32, asm,
[(OpNode (f32 FPR32:$Rn), fpimm0), (implicit NZCV)]> {
- let Inst{22} = 0;
+ let Inst{23-22} = 0b00;
}
def Drr : BaseTwoOperandFPComparison<signalAllNans, FPR64, asm,
[(OpNode FPR64:$Rn, (f64 FPR64:$Rm)), (implicit NZCV)]> {
- let Inst{22} = 1;
+ let Inst{23-22} = 0b01;
}
def Dri : BaseOneOperandFPComparison<signalAllNans, FPR64, asm,
[(OpNode (f64 FPR64:$Rn), fpimm0), (implicit NZCV)]> {
- let Inst{22} = 1;
+ let Inst{23-22} = 0b01;
}
} // Defs = [NZCV]
}
@@ -3971,17 +4186,20 @@ multiclass FPComparison<bit signalAllNans, string asm,
//---
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseFPCondComparison<bit signalAllNans,
- RegisterClass regtype, string asm>
- : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond),
- asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>,
+class BaseFPCondComparison<bit signalAllNans, RegisterClass regtype,
+ string mnemonic, list<dag> pat>
+ : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond),
+ mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "", pat>,
Sched<[WriteFCmp]> {
+ let Uses = [NZCV];
+ let Defs = [NZCV];
+
bits<5> Rn;
bits<5> Rm;
bits<4> nzcv;
bits<4> cond;
- let Inst{31-23} = 0b000111100;
+ let Inst{31-24} = 0b00011110;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15-12} = cond;
@@ -3991,16 +4209,24 @@ class BaseFPCondComparison<bit signalAllNans,
let Inst{3-0} = nzcv;
}
-multiclass FPCondComparison<bit signalAllNans, string asm> {
- let Defs = [NZCV], Uses = [NZCV] in {
- def Srr : BaseFPCondComparison<signalAllNans, FPR32, asm> {
- let Inst{22} = 0;
+multiclass FPCondComparison<bit signalAllNans, string mnemonic,
+ SDPatternOperator OpNode = null_frag> {
+ def Hrr : BaseFPCondComparison<signalAllNans, FPR16, mnemonic, []> {
+ let Inst{23-22} = 0b11;
+ let Predicates = [HasFullFP16];
}
- def Drr : BaseFPCondComparison<signalAllNans, FPR64, asm> {
- let Inst{22} = 1;
+ def Srr : BaseFPCondComparison<signalAllNans, FPR32, mnemonic,
+ [(set NZCV, (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm), (i32 imm:$nzcv),
+ (i32 imm:$cond), NZCV))]> {
+ let Inst{23-22} = 0b00;
+ }
+
+ def Drr : BaseFPCondComparison<signalAllNans, FPR64, mnemonic,
+ [(set NZCV, (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm), (i32 imm:$nzcv),
+ (i32 imm:$cond), NZCV))]> {
+ let Inst{23-22} = 0b01;
}
- } // Defs = [NZCV], Uses = [NZCV]
}
//---
@@ -4019,7 +4245,7 @@ class BaseFPCondSelect<RegisterClass regtype, ValueType vt, string asm>
bits<5> Rm;
bits<4> cond;
- let Inst{31-23} = 0b000111100;
+ let Inst{31-24} = 0b00011110;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15-12} = cond;
@@ -4030,12 +4256,17 @@ class BaseFPCondSelect<RegisterClass regtype, ValueType vt, string asm>
multiclass FPCondSelect<string asm> {
let Uses = [NZCV] in {
+ def Hrrr : BaseFPCondSelect<FPR16, f16, asm> {
+ let Inst{23-22} = 0b11;
+ let Predicates = [HasFullFP16];
+ }
+
def Srrr : BaseFPCondSelect<FPR32, f32, asm> {
- let Inst{22} = 0;
+ let Inst{23-22} = 0b00;
}
def Drrr : BaseFPCondSelect<FPR64, f64, asm> {
- let Inst{22} = 1;
+ let Inst{23-22} = 0b01;
}
} // Uses = [NZCV]
}
@@ -4050,7 +4281,7 @@ class BaseFPMoveImmediate<RegisterClass regtype, Operand fpimmtype, string asm>
Sched<[WriteFImm]> {
bits<5> Rd;
bits<8> imm;
- let Inst{31-23} = 0b000111100;
+ let Inst{31-24} = 0b00011110;
let Inst{21} = 1;
let Inst{20-13} = imm;
let Inst{12-5} = 0b10000000;
@@ -4058,12 +4289,17 @@ class BaseFPMoveImmediate<RegisterClass regtype, Operand fpimmtype, string asm>
}
multiclass FPMoveImmediate<string asm> {
+ def Hi : BaseFPMoveImmediate<FPR16, fpimm16, asm> {
+ let Inst{23-22} = 0b11;
+ let Predicates = [HasFullFP16];
+ }
+
def Si : BaseFPMoveImmediate<FPR32, fpimm32, asm> {
- let Inst{22} = 0;
+ let Inst{23-22} = 0b00;
}
def Di : BaseFPMoveImmediate<FPR64, fpimm64, asm> {
- let Inst{22} = 1;
+ let Inst{23-22} = 0b01;
}
}
} // end of 'let Predicates = [HasFPARMv8]'
@@ -4079,7 +4315,7 @@ let Predicates = [HasNEON] in {
//----------------------------------------------------------------------------
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDThreeSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+class BaseSIMDThreeSameVector<bit Q, bit U, bits<3> size, bits<5> opcode,
RegisterOperand regtype, string asm, string kind,
list<dag> pattern>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
@@ -4093,8 +4329,7 @@ class BaseSIMDThreeSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-24} = 0b01110;
- let Inst{23-22} = size;
- let Inst{21} = 1;
+ let Inst{23-21} = size;
let Inst{20-16} = Rm;
let Inst{15-11} = opcode;
let Inst{10} = 1;
@@ -4103,7 +4338,7 @@ class BaseSIMDThreeSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
+class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<3> size, bits<5> opcode,
RegisterOperand regtype, string asm, string kind,
list<dag> pattern>
: I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm,
@@ -4117,8 +4352,7 @@ class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-24} = 0b01110;
- let Inst{23-22} = size;
- let Inst{21} = 1;
+ let Inst{23-21} = size;
let Inst{20-16} = Rm;
let Inst{15-11} = opcode;
let Inst{10} = 1;
@@ -4129,25 +4363,25 @@ class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
// All operand sizes distinguished in the encoding.
multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64,
+ def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64,
asm, ".8b",
[(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
- def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128,
+ def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128,
asm, ".16b",
[(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
- def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64,
+ def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64,
asm, ".4h",
[(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
- def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128,
+ def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128,
asm, ".8h",
[(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
- def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64,
+ def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64,
asm, ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
- def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128,
+ def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128,
asm, ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
- def v2i64 : BaseSIMDThreeSameVector<1, U, 0b11, opc, V128,
+ def v2i64 : BaseSIMDThreeSameVector<1, U, 0b111, opc, V128,
asm, ".2d",
[(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>;
}
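
In the three-same vector classes the size parameter grows from bits<2> to bits<3> and is written straight into Inst{23-21}, absorbing the bit 21 that was previously hard-wired to 1. The integer instantiations therefore keep their old encodings (0b001/0b011/0b101/0b111 for B/H/S/D), while the new half-precision FP variants pass a value with bit 21 clear, e.g. {S,0b10}. A small sketch of that relationship, illustrative only:

# Illustrative: the new 3-bit "size" value for the integer three-same
# vector forms is the old 2-bit size with the formerly fixed
# Inst{21} = 1 appended as the low bit.
def widen_size(old_size2):
    return (old_size2 << 1) | 1

assert [widen_size(s) for s in range(4)] == [0b001, 0b011, 0b101, 0b111]
print([bin(widen_size(s)) for s in range(4)])
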
@@ -4155,49 +4389,49 @@ multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
// As above, but D sized elements unsupported.
multiclass SIMDThreeSameVectorBHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64,
+ def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64,
asm, ".8b",
[(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>;
- def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128,
+ def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128,
asm, ".16b",
[(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>;
- def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64,
+ def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64,
asm, ".4h",
[(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>;
- def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128,
+ def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128,
asm, ".8h",
[(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>;
- def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64,
+ def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64,
asm, ".2s",
[(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>;
- def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128,
+ def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128,
asm, ".4s",
[(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>;
}
multiclass SIMDThreeSameVectorBHSTied<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b00, opc, V64,
+ def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b001, opc, V64,
asm, ".8b",
[(set (v8i8 V64:$dst),
(OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
- def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b00, opc, V128,
+ def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b001, opc, V128,
asm, ".16b",
[(set (v16i8 V128:$dst),
(OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
- def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b01, opc, V64,
+ def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b011, opc, V64,
asm, ".4h",
[(set (v4i16 V64:$dst),
(OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
- def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b01, opc, V128,
+ def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b011, opc, V128,
asm, ".8h",
[(set (v8i16 V128:$dst),
(OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
- def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b10, opc, V64,
+ def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b101, opc, V64,
asm, ".2s",
[(set (v2i32 V64:$dst),
(OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
- def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b10, opc, V128,
+ def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b101, opc, V128,
asm, ".4s",
[(set (v4i32 V128:$dst),
(OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
@@ -4206,54 +4440,80 @@ multiclass SIMDThreeSameVectorBHSTied<bit U, bits<5> opc, string asm,
// As above, but only B sized elements supported.
multiclass SIMDThreeSameVectorB<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64,
+ def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64,
asm, ".8b",
[(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
- def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128,
+ def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128,
asm, ".16b",
[(set (v16i8 V128:$Rd),
(OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
}
-// As above, but only S and D sized floating point elements supported.
-multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<5> opc,
+// As above, but only floating point elements supported.
+multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc,
string asm, SDPatternOperator OpNode> {
- def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64,
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64,
+ asm, ".4h",
+ [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>;
+ def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128,
+ asm, ".8h",
+ [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>;
+ } // Predicates = [HasNEON, HasFullFP16]
+ def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64,
asm, ".2s",
[(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
- def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128,
+ def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128,
asm, ".4s",
[(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
- def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128,
+ def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128,
asm, ".2d",
[(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
}
-multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<5> opc,
+multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<3> opc,
string asm,
SDPatternOperator OpNode> {
- def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64,
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64,
+ asm, ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>;
+ def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128,
+ asm, ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>;
+ } // Predicates = [HasNEON, HasFullFP16]
+ def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64,
asm, ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
- def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128,
+ def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128,
asm, ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
- def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128,
+ def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128,
asm, ".2d",
[(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
}
-multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<5> opc,
+multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<3> opc,
string asm, SDPatternOperator OpNode> {
- def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0}, opc, V64,
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4f16 : BaseSIMDThreeSameVectorTied<0, U, {S,0b10}, {0b00,opc}, V64,
+ asm, ".4h",
+ [(set (v4f16 V64:$dst),
+ (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>;
+ def v8f16 : BaseSIMDThreeSameVectorTied<1, U, {S,0b10}, {0b00,opc}, V128,
+ asm, ".8h",
+ [(set (v8f16 V128:$dst),
+ (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>;
+ } // Predicates = [HasNEON, HasFullFP16]
+ def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0b01}, {0b11,opc}, V64,
asm, ".2s",
[(set (v2f32 V64:$dst),
(OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
- def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0}, opc, V128,
+ def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0b01}, {0b11,opc}, V128,
asm, ".4s",
[(set (v4f32 V128:$dst),
(OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
- def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,1}, opc, V128,
+ def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,0b11}, {0b11,opc}, V128,
asm, ".2d",
[(set (v2f64 V128:$dst),
(OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
@@ -4262,16 +4522,16 @@ multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<5> opc,
// As above, but D and B sized elements unsupported.
multiclass SIMDThreeSameVectorHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64,
+ def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64,
asm, ".4h",
[(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
- def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128,
+ def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128,
asm, ".8h",
[(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
- def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64,
+ def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64,
asm, ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
- def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128,
+ def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128,
asm, ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
}
@@ -4279,10 +4539,10 @@ multiclass SIMDThreeSameVectorHS<bit U, bits<5> opc, string asm,
// Logical three vector ops share opcode bits, and only use B sized elements.
multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm,
SDPatternOperator OpNode = null_frag> {
- def v8i8 : BaseSIMDThreeSameVector<0, U, size, 0b00011, V64,
+ def v8i8 : BaseSIMDThreeSameVector<0, U, {size,1}, 0b00011, V64,
asm, ".8b",
[(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>;
- def v16i8 : BaseSIMDThreeSameVector<1, U, size, 0b00011, V128,
+ def v16i8 : BaseSIMDThreeSameVector<1, U, {size,1}, 0b00011, V128,
asm, ".16b",
[(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>;
@@ -4303,11 +4563,11 @@ multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm,
multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,
string asm, SDPatternOperator OpNode> {
- def v8i8 : BaseSIMDThreeSameVectorTied<0, U, size, 0b00011, V64,
+ def v8i8 : BaseSIMDThreeSameVectorTied<0, U, {size,1}, 0b00011, V64,
asm, ".8b",
[(set (v8i8 V64:$dst),
(OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
- def v16i8 : BaseSIMDThreeSameVectorTied<1, U, size, 0b00011, V128,
+ def v16i8 : BaseSIMDThreeSameVectorTied<1, U, {size,1}, 0b00011, V128,
asm, ".16b",
[(set (v16i8 V128:$dst),
(OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
@@ -4347,8 +4607,8 @@ multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
- RegisterOperand regtype, string asm, string dstkind,
- string srckind, list<dag> pattern>
+ bits<2> size2, RegisterOperand regtype, string asm,
+ string dstkind, string srckind, list<dag> pattern>
: I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
"{\t$Rd" # dstkind # ", $Rn" # srckind #
"|" # dstkind # "\t$Rd, $Rn}", "", pattern>,
@@ -4360,7 +4620,9 @@ class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
- let Inst{21-17} = 0b10000;
+ let Inst{21} = 0b1;
+ let Inst{20-19} = size2;
+ let Inst{18-17} = 0b00;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
@@ -4369,8 +4631,9 @@ class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
- RegisterOperand regtype, string asm, string dstkind,
- string srckind, list<dag> pattern>
+ bits<2> size2, RegisterOperand regtype,
+ string asm, string dstkind, string srckind,
+ list<dag> pattern>
: I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm,
"{\t$Rd" # dstkind # ", $Rn" # srckind #
"|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>,
@@ -4382,7 +4645,9 @@ class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
- let Inst{21-17} = 0b10000;
+ let Inst{21} = 0b1;
+ let Inst{20-19} = size2;
+ let Inst{18-17} = 0b00;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
@@ -4392,22 +4657,22 @@ class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
// Supports B, H, and S element sizes.
multiclass SIMDTwoVectorBHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
+ def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
asm, ".8b", ".8b",
[(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
- def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
+ def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
asm, ".16b", ".16b",
[(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
- def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
+ def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
asm, ".4h", ".4h",
[(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
- def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
+ def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
asm, ".8h", ".8h",
[(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
- def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64,
+ def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64,
asm, ".2s", ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
- def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128,
+ def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128,
asm, ".4s", ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
}
@@ -4450,49 +4715,49 @@ multiclass SIMDVectorLShiftLongBySizeBHS {
// Supports all element sizes.
multiclass SIMDLongTwoVector<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
+ def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
asm, ".4h", ".8b",
[(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
- def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
+ def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
asm, ".8h", ".16b",
[(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
- def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
+ def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
asm, ".2s", ".4h",
[(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
- def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
+ def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
asm, ".4s", ".8h",
[(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
- def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64,
+ def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64,
asm, ".1d", ".2s",
[(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
- def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128,
+ def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128,
asm, ".2d", ".4s",
[(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
}
multiclass SIMDLongTwoVectorTied<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64,
+ def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64,
asm, ".4h", ".8b",
[(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd),
(v8i8 V64:$Rn)))]>;
- def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128,
+ def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128,
asm, ".8h", ".16b",
[(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd),
(v16i8 V128:$Rn)))]>;
- def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64,
+ def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64,
asm, ".2s", ".4h",
[(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd),
(v4i16 V64:$Rn)))]>;
- def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128,
+ def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128,
asm, ".4s", ".8h",
[(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd),
(v8i16 V128:$Rn)))]>;
- def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64,
+ def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64,
asm, ".1d", ".2s",
[(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd),
(v2i32 V64:$Rn)))]>;
- def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128,
+ def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128,
asm, ".2d", ".4s",
[(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd),
(v4i32 V128:$Rn)))]>;
@@ -4501,50 +4766,50 @@ multiclass SIMDLongTwoVectorTied<bit U, bits<5> opc, string asm,
// Supports all element sizes, except 1xD.
multiclass SIMDTwoVectorBHSDTied<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64,
+ def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64,
asm, ".8b", ".8b",
[(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>;
- def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128,
+ def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128,
asm, ".16b", ".16b",
[(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>;
- def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64,
+ def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64,
asm, ".4h", ".4h",
[(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>;
- def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128,
+ def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128,
asm, ".8h", ".8h",
[(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>;
- def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64,
+ def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64,
asm, ".2s", ".2s",
[(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>;
- def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128,
+ def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128,
asm, ".4s", ".4s",
[(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>;
- def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, V128,
+ def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, 0b00, V128,
asm, ".2d", ".2d",
[(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>;
}
multiclass SIMDTwoVectorBHSD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
- def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
+ def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
asm, ".8b", ".8b",
[(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
- def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
+ def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
asm, ".16b", ".16b",
[(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
- def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
+ def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
asm, ".4h", ".4h",
[(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
- def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
+ def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
asm, ".8h", ".8h",
[(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
- def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64,
+ def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64,
asm, ".2s", ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
- def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128,
+ def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128,
asm, ".4s", ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
- def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, V128,
+ def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, 0b00, V128,
asm, ".2d", ".2d",
[(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
}
@@ -4553,10 +4818,10 @@ multiclass SIMDTwoVectorBHSD<bit U, bits<5> opc, string asm,
// Supports only B element sizes.
multiclass SIMDTwoVectorB<bit U, bits<2> size, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, V64,
+ def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, 0b00, V64,
asm, ".8b", ".8b",
[(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
- def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, V128,
+ def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, 0b00, V128,
asm, ".16b", ".16b",
[(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
@@ -4565,16 +4830,16 @@ multiclass SIMDTwoVectorB<bit U, bits<2> size, bits<5> opc, string asm,
// Supports only B and H element sizes.
multiclass SIMDTwoVectorBH<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
+ def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
asm, ".8b", ".8b",
[(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>;
- def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
+ def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
asm, ".16b", ".16b",
[(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>;
- def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
+ def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
asm, ".4h", ".4h",
[(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>;
- def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
+ def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
asm, ".8h", ".8h",
[(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>;
}
@@ -4583,13 +4848,21 @@ multiclass SIMDTwoVectorBH<bit U, bits<5> opc, string asm,
// as an extra opcode bit.
multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64,
+ asm, ".4h", ".4h",
+ [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>;
+ def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128,
+ asm, ".8h", ".8h",
+ [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>;
+ } // Predicates = [HasNEON, HasFullFP16]
+ def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
asm, ".2s", ".2s",
[(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
- def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
+ def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128,
asm, ".4s", ".4s",
[(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>;
- def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128,
+ def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128,
asm, ".2d", ".2d",
[(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
}
@@ -4597,10 +4870,10 @@ multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm,
// Supports only S element size.
multiclass SIMDTwoVectorS<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
+ def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
asm, ".2s", ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
- def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
+ def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128,
asm, ".4s", ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
}
@@ -4608,26 +4881,42 @@ multiclass SIMDTwoVectorS<bit U, bit S, bits<5> opc, string asm,
multiclass SIMDTwoVectorFPToInt<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>;
+ def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>;
+ } // Predicates = [HasNEON, HasFullFP16]
+ def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
asm, ".2s", ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
- def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
+ def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128,
asm, ".4s", ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>;
- def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128,
+ def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128,
asm, ".2d", ".2d",
[(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
}
multiclass SIMDTwoVectorIntToFP<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64,
+ asm, ".4h", ".4h",
+ [(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
+ def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128,
+ asm, ".8h", ".8h",
+ [(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+ } // Predicates = [HasNEON, HasFullFP16]
+ def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
asm, ".2s", ".2s",
[(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
- def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
+ def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128,
asm, ".4s", ".4s",
[(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
- def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128,
+ def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128,
asm, ".2d", ".2d",
[(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
}
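
The two-register-miscellaneous base classes split the formerly fixed Inst{21-17} = 0b10000 field into Inst{21} = 1, a new two-bit size2 field at Inst{20-19}, and Inst{18-17} = 0b00. Existing integer instantiations pass size2 = 0b00, so their encodings are unchanged, while the FP16 vector variants pass size2 = 0b11 together with size = {S,1}. A small sketch, illustrative only:

# Illustrative: Inst{21-17} of a two-register-misc instruction before and
# after this change.
def bits_21_17(size2=0b00):
    return (1 << 4) | (size2 << 2) | 0b00

assert bits_21_17(0b00) == 0b10000     # old fixed value, integer forms
assert bits_21_17(0b11) == 0b11100     # new FP16 vector forms
print(format(bits_21_17(0b11), "05b"))
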
@@ -4706,10 +4995,10 @@ multiclass SIMDMixedTwoVector<bit U, bits<5> opc, string asm,
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
}
-class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
- RegisterOperand regtype,
- string asm, string kind, string zero,
- ValueType dty, ValueType sty, SDNode OpNode>
+class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<2> size2,
+ bits<5> opcode, RegisterOperand regtype, string asm,
+ string kind, string zero, ValueType dty,
+ ValueType sty, SDNode OpNode>
: I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
"{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero #
"|" # kind # "\t$Rd, $Rn, #" # zero # "}", "",
@@ -4722,7 +5011,9 @@ class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
- let Inst{21-17} = 0b10000;
+ let Inst{21} = 0b1;
+ let Inst{20-19} = size2;
+ let Inst{18-17} = 0b00;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
@@ -4732,54 +5023,74 @@ class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
// Comparisons support all element sizes, except 1xD.
multiclass SIMDCmpTwoVector<bit U, bits<5> opc, string asm,
SDNode OpNode> {
- def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, opc, V64,
+ def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, 0b00, opc, V64,
asm, ".8b", "0",
v8i8, v8i8, OpNode>;
- def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, opc, V128,
+ def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, 0b00, opc, V128,
asm, ".16b", "0",
v16i8, v16i8, OpNode>;
- def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, opc, V64,
+ def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, 0b00, opc, V64,
asm, ".4h", "0",
v4i16, v4i16, OpNode>;
- def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, opc, V128,
+ def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, 0b00, opc, V128,
asm, ".8h", "0",
v8i16, v8i16, OpNode>;
- def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, opc, V64,
+ def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, 0b00, opc, V64,
asm, ".2s", "0",
v2i32, v2i32, OpNode>;
- def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, opc, V128,
+ def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, 0b00, opc, V128,
asm, ".4s", "0",
v4i32, v4i32, OpNode>;
- def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, opc, V128,
+ def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, 0b00, opc, V128,
asm, ".2d", "0",
v2i64, v2i64, OpNode>;
}
-// FP Comparisons support only S and D element sizes.
+// FP Comparisons support only S and D element sizes (and H for v8.2a).
multiclass SIMDFPCmpTwoVector<bit U, bit S, bits<5> opc,
string asm, SDNode OpNode> {
- def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, opc, V64,
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4i16rz : BaseSIMDCmpTwoVector<0, U, {S,1}, 0b11, opc, V64,
+ asm, ".4h", "0.0",
+ v4i16, v4f16, OpNode>;
+ def v8i16rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b11, opc, V128,
+ asm, ".8h", "0.0",
+ v8i16, v8f16, OpNode>;
+ } // Predicates = [HasNEON, HasFullFP16]
+ def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, 0b00, opc, V64,
asm, ".2s", "0.0",
v2i32, v2f32, OpNode>;
- def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, opc, V128,
+ def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, 0b00, opc, V128,
asm, ".4s", "0.0",
v4i32, v4f32, OpNode>;
- def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, opc, V128,
+ def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b00, opc, V128,
asm, ".2d", "0.0",
v2i64, v2f64, OpNode>;
- def : InstAlias<asm # " $Vd.2s, $Vn.2s, #0",
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def : InstAlias<asm # "\t$Vd.4h, $Vn.4h, #0",
+ (!cast<Instruction>(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>;
+ def : InstAlias<asm # "\t$Vd.8h, $Vn.8h, #0",
+ (!cast<Instruction>(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>;
+ }
+ def : InstAlias<asm # "\t$Vd.2s, $Vn.2s, #0",
(!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>;
- def : InstAlias<asm # " $Vd.4s, $Vn.4s, #0",
+ def : InstAlias<asm # "\t$Vd.4s, $Vn.4s, #0",
(!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>;
- def : InstAlias<asm # " $Vd.2d, $Vn.2d, #0",
+ def : InstAlias<asm # "\t$Vd.2d, $Vn.2d, #0",
(!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>;
- def : InstAlias<asm # ".2s $Vd, $Vn, #0",
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def : InstAlias<asm # ".4h\t$Vd, $Vn, #0",
+ (!cast<Instruction>(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>;
+ def : InstAlias<asm # ".8h\t$Vd, $Vn, #0",
+ (!cast<Instruction>(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>;
+ }
+ def : InstAlias<asm # ".2s\t$Vd, $Vn, #0",
(!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>;
- def : InstAlias<asm # ".4s $Vd, $Vn, #0",
+ def : InstAlias<asm # ".4s\t$Vd, $Vn, #0",
(!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>;
- def : InstAlias<asm # ".2d $Vd, $Vn, #0",
+ def : InstAlias<asm # ".2d\t$Vd, $Vn, #0",
(!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>;
}
@@ -5325,7 +5636,7 @@ multiclass SIMDZipVector<bits<3>opc, string asm,
//----------------------------------------------------------------------------
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
-class BaseSIMDThreeScalar<bit U, bits<2> size, bits<5> opcode,
+class BaseSIMDThreeScalar<bit U, bits<3> size, bits<5> opcode,
RegisterClass regtype, string asm,
list<dag> pattern>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
@@ -5337,8 +5648,7 @@ class BaseSIMDThreeScalar<bit U, bits<2> size, bits<5> opcode,
let Inst{31-30} = 0b01;
let Inst{29} = U;
let Inst{28-24} = 0b11110;
- let Inst{23-22} = size;
- let Inst{21} = 1;
+ let Inst{23-21} = size;
let Inst{20-16} = Rm;
let Inst{15-11} = opcode;
let Inst{10} = 1;
@@ -5369,17 +5679,17 @@ class BaseSIMDThreeScalarTied<bit U, bits<2> size, bit R, bits<5> opcode,
multiclass SIMDThreeScalarD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v1i64 : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm,
+ def v1i64 : BaseSIMDThreeScalar<U, 0b111, opc, FPR64, asm,
[(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>;
}
multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v1i64 : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm,
+ def v1i64 : BaseSIMDThreeScalar<U, 0b111, opc, FPR64, asm,
[(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>;
- def v1i32 : BaseSIMDThreeScalar<U, 0b10, opc, FPR32, asm, []>;
- def v1i16 : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>;
- def v1i8 : BaseSIMDThreeScalar<U, 0b00, opc, FPR8 , asm, []>;
+ def v1i32 : BaseSIMDThreeScalar<U, 0b101, opc, FPR32, asm, []>;
+ def v1i16 : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>;
+ def v1i8 : BaseSIMDThreeScalar<U, 0b001, opc, FPR8 , asm, []>;
def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
(!cast<Instruction>(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>;
@@ -5389,9 +5699,9 @@ multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm,
multiclass SIMDThreeScalarHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v1i32 : BaseSIMDThreeScalar<U, 0b10, opc, FPR32, asm,
+ def v1i32 : BaseSIMDThreeScalar<U, 0b101, opc, FPR32, asm,
[(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
- def v1i16 : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>;
+ def v1i16 : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>;
}
multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm,
@@ -5404,26 +5714,34 @@ multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm,
asm, []>;
}
-multiclass SIMDThreeScalarSD<bit U, bit S, bits<5> opc, string asm,
+multiclass SIMDFPThreeScalar<bit U, bit S, bits<3> opc, string asm,
SDPatternOperator OpNode = null_frag> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
- def #NAME#64 : BaseSIMDThreeScalar<U, {S,1}, opc, FPR64, asm,
+ def #NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm,
[(set (f64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
- def #NAME#32 : BaseSIMDThreeScalar<U, {S,0}, opc, FPR32, asm,
+ def #NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm,
[(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def #NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm,
+ [(set FPR16:$Rd, (OpNode FPR16:$Rn, FPR16:$Rm))]>;
+ } // Predicates = [HasNEON, HasFullFP16]
}
def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>;
}
-multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<5> opc, string asm,
+multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<3> opc, string asm,
SDPatternOperator OpNode = null_frag> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
- def #NAME#64 : BaseSIMDThreeScalar<U, {S,1}, opc, FPR64, asm,
+ def #NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm,
[(set (i64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
- def #NAME#32 : BaseSIMDThreeScalar<U, {S,0}, opc, FPR32, asm,
+ def #NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm,
[(set (i32 FPR32:$Rd), (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]>;
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def #NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm,
+ []>;
+ } // Predicates = [HasNEON, HasFullFP16]
}
def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
@@ -5482,7 +5800,7 @@ multiclass SIMDThreeScalarMixedTiedHS<bit U, bits<5> opc, string asm,
//----------------------------------------------------------------------------
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDTwoScalar<bit U, bits<2> size, bits<5> opcode,
+class BaseSIMDTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode,
RegisterClass regtype, RegisterClass regtype2,
string asm, list<dag> pat>
: I<(outs regtype:$Rd), (ins regtype2:$Rn), asm,
@@ -5494,7 +5812,9 @@ class BaseSIMDTwoScalar<bit U, bits<2> size, bits<5> opcode,
let Inst{29} = U;
let Inst{28-24} = 0b11110;
let Inst{23-22} = size;
- let Inst{21-17} = 0b10000;
+ let Inst{21} = 0b1;
+ let Inst{20-19} = size2;
+ let Inst{18-17} = 0b00;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
@@ -5523,7 +5843,7 @@ class BaseSIMDTwoScalarTied<bit U, bits<2> size, bits<5> opcode,
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<5> opcode,
+class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode,
RegisterClass regtype, string asm, string zero>
: I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
"\t$Rd, $Rn, #" # zero, "", []>,
@@ -5534,7 +5854,9 @@ class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<5> opcode,
let Inst{29} = U;
let Inst{28-24} = 0b11110;
let Inst{23-22} = size;
- let Inst{21-17} = 0b10000;
+ let Inst{21} = 0b1;
+ let Inst{20-19} = size2;
+ let Inst{18-17} = 0b00;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
@@ -5556,21 +5878,28 @@ class SIMDInexactCvtTwoScalar<bits<5> opcode, string asm>
multiclass SIMDCmpTwoScalarD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v1i64rz : BaseSIMDCmpTwoScalar<U, 0b11, opc, FPR64, asm, "0">;
+ def v1i64rz : BaseSIMDCmpTwoScalar<U, 0b11, 0b00, opc, FPR64, asm, "0">;
def : Pat<(v1i64 (OpNode FPR64:$Rn)),
(!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>;
}
-multiclass SIMDCmpTwoScalarSD<bit U, bit S, bits<5> opc, string asm,
+multiclass SIMDFPCmpTwoScalar<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v1i64rz : BaseSIMDCmpTwoScalar<U, {S,1}, opc, FPR64, asm, "0.0">;
- def v1i32rz : BaseSIMDCmpTwoScalar<U, {S,0}, opc, FPR32, asm, "0.0">;
+ def v1i64rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b00, opc, FPR64, asm, "0.0">;
+ def v1i32rz : BaseSIMDCmpTwoScalar<U, {S,0}, 0b00, opc, FPR32, asm, "0.0">;
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v1i16rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b11, opc, FPR16, asm, "0.0">;
+ }
- def : InstAlias<asm # " $Rd, $Rn, #0",
+ def : InstAlias<asm # "\t$Rd, $Rn, #0",
(!cast<Instruction>(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>;
- def : InstAlias<asm # " $Rd, $Rn, #0",
+ def : InstAlias<asm # "\t$Rd, $Rn, #0",
(!cast<Instruction>(NAME # v1i32rz) FPR32:$Rd, FPR32:$Rn), 0>;
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def : InstAlias<asm # "\t$Rd, $Rn, #0",
+ (!cast<Instruction>(NAME # v1i16rz) FPR16:$Rd, FPR16:$Rn), 0>;
+ }
def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))),
(!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>;
@@ -5578,35 +5907,42 @@ multiclass SIMDCmpTwoScalarSD<bit U, bit S, bits<5> opc, string asm,
multiclass SIMDTwoScalarD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
- def v1i64 : BaseSIMDTwoScalar<U, 0b11, opc, FPR64, FPR64, asm,
+ def v1i64 : BaseSIMDTwoScalar<U, 0b11, 0b00, opc, FPR64, FPR64, asm,
[(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn)))]>;
def : Pat<(i64 (OpNode (i64 FPR64:$Rn))),
(!cast<Instruction>(NAME # "v1i64") FPR64:$Rn)>;
}
-multiclass SIMDTwoScalarSD<bit U, bit S, bits<5> opc, string asm> {
- def v1i64 : BaseSIMDTwoScalar<U, {S,1}, opc, FPR64, FPR64, asm,[]>;
- def v1i32 : BaseSIMDTwoScalar<U, {S,0}, opc, FPR32, FPR32, asm,[]>;
+multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm> {
+ def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,[]>;
+ def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,[]>;
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v1f16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,[]>;
+ }
}
-multiclass SIMDTwoScalarCVTSD<bit U, bit S, bits<5> opc, string asm,
+multiclass SIMDFPTwoScalarCVT<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v1i64 : BaseSIMDTwoScalar<U, {S,1}, opc, FPR64, FPR64, asm,
+ def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,
[(set FPR64:$Rd, (OpNode (f64 FPR64:$Rn)))]>;
- def v1i32 : BaseSIMDTwoScalar<U, {S,0}, opc, FPR32, FPR32, asm,
+ def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,
[(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>;
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v1i16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,
+ [(set FPR16:$Rd, (OpNode (f16 FPR16:$Rn)))]>;
+ }
}
multiclass SIMDTwoScalarBHSD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
- def v1i64 : BaseSIMDTwoScalar<U, 0b11, opc, FPR64, FPR64, asm,
+ def v1i64 : BaseSIMDTwoScalar<U, 0b11, 0b00, opc, FPR64, FPR64, asm,
[(set (i64 FPR64:$Rd), (OpNode (i64 FPR64:$Rn)))]>;
- def v1i32 : BaseSIMDTwoScalar<U, 0b10, opc, FPR32, FPR32, asm,
+ def v1i32 : BaseSIMDTwoScalar<U, 0b10, 0b00, opc, FPR32, FPR32, asm,
[(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>;
- def v1i16 : BaseSIMDTwoScalar<U, 0b01, opc, FPR16, FPR16, asm, []>;
- def v1i8 : BaseSIMDTwoScalar<U, 0b00, opc, FPR8 , FPR8 , asm, []>;
+ def v1i16 : BaseSIMDTwoScalar<U, 0b01, 0b00, opc, FPR16, FPR16, asm, []>;
+ def v1i8 : BaseSIMDTwoScalar<U, 0b00, 0b00, opc, FPR8 , FPR8 , asm, []>;
}
def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))),
@@ -5633,10 +5969,10 @@ multiclass SIMDTwoScalarBHSDTied<bit U, bits<5> opc, string asm,
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
multiclass SIMDTwoScalarMixedBHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
- def v1i32 : BaseSIMDTwoScalar<U, 0b10, opc, FPR32, FPR64, asm,
+ def v1i32 : BaseSIMDTwoScalar<U, 0b10, 0b00, opc, FPR32, FPR64, asm,
[(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn)))]>;
- def v1i16 : BaseSIMDTwoScalar<U, 0b01, opc, FPR16, FPR32, asm, []>;
- def v1i8 : BaseSIMDTwoScalar<U, 0b00, opc, FPR8 , FPR16, asm, []>;
+ def v1i16 : BaseSIMDTwoScalar<U, 0b01, 0b00, opc, FPR16, FPR32, asm, []>;
+ def v1i8 : BaseSIMDTwoScalar<U, 0b00, 0b00, opc, FPR8 , FPR16, asm, []>;
}
//----------------------------------------------------------------------------
@@ -5668,10 +6004,14 @@ multiclass SIMDPairwiseScalarD<bit U, bits<5> opc, string asm> {
asm, ".2d">;
}
-multiclass SIMDPairwiseScalarSD<bit U, bit S, bits<5> opc, string asm> {
- def v2i32p : BaseSIMDPairwiseScalar<U, {S,0}, opc, FPR32Op, V64,
+multiclass SIMDFPPairwiseScalar<bit S, bits<5> opc, string asm> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v2i16p : BaseSIMDPairwiseScalar<0, {S,0}, opc, FPR16Op, V64,
+ asm, ".2h">;
+ }
+ def v2i32p : BaseSIMDPairwiseScalar<1, {S,0}, opc, FPR32Op, V64,
asm, ".2s">;
- def v2i64p : BaseSIMDPairwiseScalar<U, {S,1}, opc, FPR64Op, V128,
+ def v2i64p : BaseSIMDPairwiseScalar<1, {S,1}, opc, FPR64Op, V128,
asm, ".2d">;
}
@@ -5727,8 +6067,16 @@ multiclass SIMDAcrossLanesHSD<bit U, bits<5> opcode, string asm> {
asm, ".4s", []>;
}
-multiclass SIMDAcrossLanesS<bits<5> opcode, bit sz1, string asm,
+multiclass SIMDFPAcrossLanes<bits<5> opcode, bit sz1, string asm,
Intrinsic intOp> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4i16v : BaseSIMDAcrossLanes<0, 0, {sz1, 0}, opcode, FPR16, V64,
+ asm, ".4h",
+ [(set FPR16:$Rd, (intOp (v4f16 V64:$Rn)))]>;
+ def v8i16v : BaseSIMDAcrossLanes<1, 0, {sz1, 0}, opcode, FPR16, V128,
+ asm, ".8h",
+ [(set FPR16:$Rd, (intOp (v8f16 V128:$Rn)))]>;
+ } // Predicates = [HasNEON, HasFullFP16]
def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128,
asm, ".4s",
[(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>;
@@ -5925,7 +6273,7 @@ class SIMDInsMainMovAlias<string size, Instruction inst,
class SIMDInsElementMovAlias<string size, Instruction inst,
Operand idxtype>
: InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" #
- # "|" # size #" $dst$idx, $src$idx2}",
+ # "|" # size #"\t$dst$idx, $src$idx2}",
(inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>;
@@ -6215,7 +6563,7 @@ multiclass SIMDScalarCPY<string asm> {
// AdvSIMD modified immediate instructions
//----------------------------------------------------------------------------
-class BaseSIMDModifiedImm<bit Q, bit op, dag oops, dag iops,
+class BaseSIMDModifiedImm<bit Q, bit op, bit op2, dag oops, dag iops,
string asm, string op_string,
string cstr, list<dag> pattern>
: I<oops, iops, asm, op_string, cstr, pattern>,
@@ -6227,16 +6575,17 @@ class BaseSIMDModifiedImm<bit Q, bit op, dag oops, dag iops,
let Inst{29} = op;
let Inst{28-19} = 0b0111100000;
let Inst{18-16} = imm8{7-5};
- let Inst{11-10} = 0b01;
+ let Inst{11} = op2;
+ let Inst{10} = 1;
let Inst{9-5} = imm8{4-0};
let Inst{4-0} = Rd;
}
-class BaseSIMDModifiedImmVector<bit Q, bit op, RegisterOperand vectype,
+class BaseSIMDModifiedImmVector<bit Q, bit op, bit op2, RegisterOperand vectype,
Operand immtype, dag opt_shift_iop,
string opt_shift, string asm, string kind,
list<dag> pattern>
- : BaseSIMDModifiedImm<Q, op, (outs vectype:$Rd),
+ : BaseSIMDModifiedImm<Q, op, op2, (outs vectype:$Rd),
!con((ins immtype:$imm8), opt_shift_iop), asm,
"{\t$Rd" # kind # ", $imm8" # opt_shift #
"|" # kind # "\t$Rd, $imm8" # opt_shift # "}",
@@ -6248,7 +6597,7 @@ class BaseSIMDModifiedImmVectorTied<bit Q, bit op, RegisterOperand vectype,
Operand immtype, dag opt_shift_iop,
string opt_shift, string asm, string kind,
list<dag> pattern>
- : BaseSIMDModifiedImm<Q, op, (outs vectype:$dst),
+ : BaseSIMDModifiedImm<Q, op, 0, (outs vectype:$dst),
!con((ins vectype:$Rd, immtype:$imm8), opt_shift_iop),
asm, "{\t$Rd" # kind # ", $imm8" # opt_shift #
"|" # kind # "\t$Rd, $imm8" # opt_shift # "}",
@@ -6259,7 +6608,7 @@ class BaseSIMDModifiedImmVectorTied<bit Q, bit op, RegisterOperand vectype,
class BaseSIMDModifiedImmVectorShift<bit Q, bit op, bits<2> b15_b12,
RegisterOperand vectype, string asm,
string kind, list<dag> pattern>
- : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255,
+ : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255,
(ins logical_vec_shift:$shift),
"$shift", asm, kind, pattern> {
bits<2> shift;
@@ -6284,7 +6633,7 @@ class BaseSIMDModifiedImmVectorShiftTied<bit Q, bit op, bits<2> b15_b12,
class BaseSIMDModifiedImmVectorShiftHalf<bit Q, bit op, bits<2> b15_b12,
RegisterOperand vectype, string asm,
string kind, list<dag> pattern>
- : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255,
+ : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255,
(ins logical_vec_hw_shift:$shift),
"$shift", asm, kind, pattern> {
bits<2> shift;
@@ -6349,7 +6698,7 @@ multiclass SIMDModifiedImmVectorShiftTied<bit op, bits<2> hw_cmode,
class SIMDModifiedImmMoveMSL<bit Q, bit op, bits<4> cmode,
RegisterOperand vectype, string asm,
string kind, list<dag> pattern>
- : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255,
+ : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255,
(ins move_vec_shift:$shift),
"$shift", asm, kind, pattern> {
bits<1> shift;
@@ -6357,18 +6706,18 @@ class SIMDModifiedImmMoveMSL<bit Q, bit op, bits<4> cmode,
let Inst{12} = shift;
}
-class SIMDModifiedImmVectorNoShift<bit Q, bit op, bits<4> cmode,
+class SIMDModifiedImmVectorNoShift<bit Q, bit op, bit op2, bits<4> cmode,
RegisterOperand vectype,
Operand imm_type, string asm,
string kind, list<dag> pattern>
- : BaseSIMDModifiedImmVector<Q, op, vectype, imm_type, (ins), "",
+ : BaseSIMDModifiedImmVector<Q, op, op2, vectype, imm_type, (ins), "",
asm, kind, pattern> {
let Inst{15-12} = cmode;
}
class SIMDModifiedImmScalarNoShift<bit Q, bit op, bits<4> cmode, string asm,
list<dag> pattern>
- : BaseSIMDModifiedImm<Q, op, (outs FPR64:$Rd), (ins simdimmtype10:$imm8), asm,
+ : BaseSIMDModifiedImm<Q, op, 0, (outs FPR64:$Rd), (ins simdimmtype10:$imm8), asm,
"\t$Rd, $imm8", "", pattern> {
let Inst{15-12} = cmode;
let DecoderMethod = "DecodeModImmInstruction";
@@ -6438,8 +6787,36 @@ class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
let Inst{4-0} = Rd;
}
-multiclass SIMDFPIndexedSD<bit U, bits<4> opc, string asm,
- SDPatternOperator OpNode> {
+multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b00, opc,
+ V64, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4h", ".4h", ".4h", ".h",
+ [(set (v4f16 V64:$Rd),
+ (OpNode (v4f16 V64:$Rn),
+ (v4f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b00, opc,
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm, ".8h", ".8h", ".8h", ".h",
+ [(set (v8f16 V128:$Rd),
+ (OpNode (v8f16 V128:$Rn),
+ (v8f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+ } // Predicates = [HasNEON, HasFullFP16]
+
def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
V64, V64,
V128, VectorIndexS,
@@ -6476,6 +6853,21 @@ multiclass SIMDFPIndexedSD<bit U, bits<4> opc, string asm,
let Inst{21} = 0;
}
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b00, opc,
+ FPR16Op, FPR16Op, V128_lo, VectorIndexH,
+ asm, ".h", "", "", ".h",
+ [(set (f16 FPR16Op:$Rd),
+ (OpNode (f16 FPR16Op:$Rn),
+ (f16 (vector_extract (v8f16 V128_lo:$Rm),
+ VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+ } // Predicates = [HasNEON, HasFullFP16]
+
def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
FPR32Op, FPR32Op, V128, VectorIndexS,
asm, ".s", "", "", ".s",
@@ -6501,7 +6893,7 @@ multiclass SIMDFPIndexedSD<bit U, bits<4> opc, string asm,
}
}
-multiclass SIMDFPIndexedSDTiedPatterns<string INST, SDPatternOperator OpNode> {
+multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
// 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar.
def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
(AArch64duplane32 (v4f32 V128:$Rm),
@@ -6553,7 +6945,28 @@ multiclass SIMDFPIndexedSDTiedPatterns<string INST, SDPatternOperator OpNode> {
V128:$Rm, VectorIndexD:$idx)>;
}
-multiclass SIMDFPIndexedSDTied<bit U, bits<4> opc, string asm> {
+multiclass SIMDFPIndexedTied<bit U, bits<4> opc, string asm> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b00, opc, V64, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4h", ".4h", ".4h", ".h", []> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b00, opc,
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm, ".8h", ".8h", ".8h", ".h", []> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+ } // Predicates = [HasNEON, HasFullFP16]
+
def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64,
V128, VectorIndexS,
asm, ".2s", ".2s", ".2s", ".s", []> {
@@ -6580,6 +6993,16 @@ multiclass SIMDFPIndexedSDTied<bit U, bits<4> opc, string asm> {
let Inst{21} = 0;
}
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v1i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b00, opc,
+ FPR16Op, FPR16Op, V128_lo, VectorIndexH,
+ asm, ".h", "", "", ".h", []> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+ } // Predicates = [HasNEON, HasFullFP16]
def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
FPR32Op, FPR32Op, V128, VectorIndexS,
@@ -7117,7 +7540,13 @@ class BaseSIMDScalarShiftTied<bit U, bits<5> opc, bits<7> fixed_imm,
}
-multiclass SIMDScalarRShiftSD<bit U, bits<5> opc, string asm> {
+multiclass SIMDFPScalarRShift<bit U, bits<5> opc, string asm> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
+ FPR16, FPR16, vecshiftR16, asm, []> {
+ let Inst{19-16} = imm{3-0};
+ }
+ } // Predicates = [HasNEON, HasFullFP16]
def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
FPR32, FPR32, vecshiftR32, asm, []> {
let Inst{20-16} = imm{4-0};
@@ -7297,6 +7726,23 @@ class BaseSIMDVectorShiftTied<bit Q, bit U, bits<5> opc, bits<7> fixed_imm,
multiclass SIMDVectorRShiftSD<bit U, bits<5> opc, string asm,
Intrinsic OpNode> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+ V64, V64, vecshiftR16,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (i32 imm:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
+ V128, V128, vecshiftR16,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (i32 imm:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+ } // Predicates = [HasNEON, HasFullFP16]
def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
V64, V64, vecshiftR32,
asm, ".2s", ".2s",
@@ -7322,8 +7768,26 @@ multiclass SIMDVectorRShiftSD<bit U, bits<5> opc, string asm,
}
}
-multiclass SIMDVectorRShiftSDToFP<bit U, bits<5> opc, string asm,
+multiclass SIMDVectorRShiftToFP<bit U, bits<5> opc, string asm,
Intrinsic OpNode> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+ V64, V64, vecshiftR16,
+ asm, ".4h", ".4h",
+ [(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (i32 imm:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
+ V128, V128, vecshiftR16,
+ asm, ".8h", ".8h",
+ [(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (i32 imm:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+ } // Predicates = [HasNEON, HasFullFP16]
+
def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
V64, V64, vecshiftR32,
asm, ".2s", ".2s",
@@ -8604,9 +9068,8 @@ let Predicates = [HasNEON, HasV8_1a] in {
class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode,
RegisterOperand regtype, string asm,
string kind, list<dag> pattern>
- : BaseSIMDThreeSameVectorTied<Q, U, size, opcode, regtype, asm, kind,
+ : BaseSIMDThreeSameVectorTied<Q, U, {size,0}, opcode, regtype, asm, kind,
pattern> {
- let Inst{21}=0;
}
multiclass SIMDThreeSameVectorSQRDMLxHTiedHS<bit U, bits<5> opc, string asm,
SDPatternOperator Accum> {
@@ -9041,6 +9504,7 @@ def : TokenAlias<".8H", ".8h">;
def : TokenAlias<".4S", ".4s">;
def : TokenAlias<".2D", ".2d">;
def : TokenAlias<".1Q", ".1q">;
+def : TokenAlias<".2H", ".2h">;
def : TokenAlias<".B", ".b">;
def : TokenAlias<".H", ".h">;
def : TokenAlias<".S", ".s">;
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index c0b3f2c60916..3ef3c8b840cb 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
-#include "AArch64MachineCombinerPattern.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -533,6 +532,14 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
CC);
}
+/// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
+static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) {
+ uint64_t Imm = MI->getOperand(1).getImm();
+ uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
+ uint64_t Encoding;
+ return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
+}
+
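[Editor's note, not part of the imported patch: a minimal standalone C++ sketch of the zero-extension trick used by canBeExpandedToORR above, where shifting an unsigned 64-bit value up and back down keeps only the low BitSize bits before they are handed to the logical-immediate encoder. Names here are illustrative only.]

    #include <cassert>
    #include <cstdint>

    // Keep only the low BitSize bits of Imm, as in the shift expression in
    // canBeExpandedToORR (logical shifts, since the value is unsigned).
    static uint64_t truncateToWidth(uint64_t Imm, unsigned BitSize) {
      assert(BitSize >= 1 && BitSize <= 64);
      return (Imm << (64 - BitSize)) >> (64 - BitSize);
    }

    int main() {
      // A sign-extended 32-bit -1 becomes the 32-bit pattern 0xFFFFFFFF.
      assert(truncateToWidth(0xFFFFFFFFFFFFFFFFull, 32) == 0xFFFFFFFFull);
      assert(truncateToWidth(0x12345678ull, 32) == 0x12345678ull);
      assert(truncateToWidth(0x12345678ull, 64) == 0x12345678ull);
      return 0;
    }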
// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
@@ -573,6 +580,12 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
case AArch64::ORRWrr:
case AArch64::ORRXrr:
return true;
+ // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
+ // ORRXri, it is as cheap as MOV
+ case AArch64::MOVi32imm:
+ return canBeExpandedToORR(MI, 32);
+ case AArch64::MOVi64imm:
+ return canBeExpandedToORR(MI, 64);
}
llvm_unreachable("Unknown opcode to check as cheap as a move!");
@@ -1379,42 +1392,34 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
Width = 1;
Scale = 1;
break;
+ case AArch64::LDRQui:
+ case AArch64::STRQui:
+ Scale = Width = 16;
+ break;
case AArch64::LDRXui:
+ case AArch64::LDRDui:
case AArch64::STRXui:
+ case AArch64::STRDui:
Scale = Width = 8;
break;
case AArch64::LDRWui:
+ case AArch64::LDRSui:
case AArch64::STRWui:
+ case AArch64::STRSui:
Scale = Width = 4;
break;
- case AArch64::LDRBui:
- case AArch64::STRBui:
- Scale = Width = 1;
- break;
case AArch64::LDRHui:
+ case AArch64::LDRHHui:
case AArch64::STRHui:
+ case AArch64::STRHHui:
Scale = Width = 2;
break;
- case AArch64::LDRSui:
- case AArch64::STRSui:
- Scale = Width = 4;
- break;
- case AArch64::LDRDui:
- case AArch64::STRDui:
- Scale = Width = 8;
- break;
- case AArch64::LDRQui:
- case AArch64::STRQui:
- Scale = Width = 16;
- break;
+ case AArch64::LDRBui:
case AArch64::LDRBBui:
+ case AArch64::STRBui:
case AArch64::STRBBui:
Scale = Width = 1;
break;
- case AArch64::LDRHHui:
- case AArch64::STRHHui:
- Scale = Width = 2;
- break;
};
BaseReg = LdSt->getOperand(1).getReg();
@@ -1445,23 +1450,43 @@ bool AArch64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
MachineInstr *Second) const {
- // Cyclone can fuse CMN, CMP followed by Bcc.
-
- // FIXME: B0 can also fuse:
- // AND, BIC, ORN, ORR, or EOR (optional S) followed by Bcc or CBZ or CBNZ.
- if (Second->getOpcode() != AArch64::Bcc)
- return false;
- switch (First->getOpcode()) {
- default:
- return false;
- case AArch64::SUBSWri:
- case AArch64::ADDSWri:
- case AArch64::ANDSWri:
- case AArch64::SUBSXri:
- case AArch64::ADDSXri:
- case AArch64::ANDSXri:
- return true;
+ if (Subtarget.isCyclone()) {
+ // Cyclone can fuse CMN, CMP, TST followed by Bcc.
+ unsigned SecondOpcode = Second->getOpcode();
+ if (SecondOpcode == AArch64::Bcc) {
+ switch (First->getOpcode()) {
+ default:
+ return false;
+ case AArch64::SUBSWri:
+ case AArch64::ADDSWri:
+ case AArch64::ANDSWri:
+ case AArch64::SUBSXri:
+ case AArch64::ADDSXri:
+ case AArch64::ANDSXri:
+ return true;
+ }
+ }
+ // Cyclone B0 also supports ALU operations followed by CBZ/CBNZ.
+ if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
+ SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
+ switch (First->getOpcode()) {
+ default:
+ return false;
+ case AArch64::ADDWri:
+ case AArch64::ADDXri:
+ case AArch64::ANDWri:
+ case AArch64::ANDXri:
+ case AArch64::EORWri:
+ case AArch64::EORXri:
+ case AArch64::ORRWri:
+ case AArch64::ORRXri:
+ case AArch64::SUBWri:
+ case AArch64::SUBXri:
+ return true;
+ }
+ }
}
+ return false;
}
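[Editor's note, not part of the imported patch: the reworked hook only reports a fusable pair on Cyclone, pairing a flag-setting compare-like instruction with Bcc and a plain ALU-immediate instruction with CBZ/CBNZ. A tiny standalone sketch of that pair-table shape, using made-up opcode names rather than LLVM's enums.]

    #include <cassert>

    // Hypothetical opcode names, for illustration only -- not LLVM's AArch64 enum.
    enum Opcode { SUBSWri, ADDSWri, ANDSWri, ADDWri, Bcc, CBZW, OTHER };

    // Mirrors the shape of shouldScheduleAdjacent: fuse a compare-like op with a
    // conditional branch, and a plain ALU-immediate op with CBZ/CBNZ.
    static bool shouldFuse(Opcode First, Opcode Second) {
      if (Second == Bcc)
        return First == SUBSWri || First == ADDSWri || First == ANDSWri;
      if (Second == CBZW)
        return First == ADDWri;
      return false;
    }

    int main() {
      assert(shouldFuse(SUBSWri, Bcc));  // cmp + b.cond pair
      assert(shouldFuse(ADDWri, CBZW));  // add + cbz pair
      assert(!shouldFuse(ADDWri, Bcc));  // plain add does not set flags
      return 0;
    }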
MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(
@@ -1814,7 +1839,7 @@ void AArch64InstrInfo::storeRegToStackSlot(
MachineFrameInfo &MFI = *MF.getFrameInfo();
unsigned Align = MFI.getObjectAlignment(FI);
- MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI));
+ MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
MachineMemOperand *MMO = MF.getMachineMemOperand(
PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align);
unsigned Opc = 0;
@@ -1911,7 +1936,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = *MF.getFrameInfo();
unsigned Align = MFI.getObjectAlignment(FI);
- MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI));
+ MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
MachineMemOperand *MMO = MF.getMachineMemOperand(
PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align);
@@ -2226,11 +2251,19 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
case AArch64::LDPDi:
case AArch64::STPXi:
case AArch64::STPDi:
+ case AArch64::LDNPXi:
+ case AArch64::LDNPDi:
+ case AArch64::STNPXi:
+ case AArch64::STNPDi:
+ ImmIdx = 3;
IsSigned = true;
Scale = 8;
break;
case AArch64::LDPQi:
case AArch64::STPQi:
+ case AArch64::LDNPQi:
+ case AArch64::STNPQi:
+ ImmIdx = 3;
IsSigned = true;
Scale = 16;
break;
@@ -2238,6 +2271,11 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
case AArch64::LDPSi:
case AArch64::STPWi:
case AArch64::STPSi:
+ case AArch64::LDNPWi:
+ case AArch64::LDNPSi:
+ case AArch64::STNPWi:
+ case AArch64::STNPSi:
+ ImmIdx = 3;
IsSigned = true;
Scale = 4;
break;
@@ -2457,7 +2495,7 @@ static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
bool AArch64InstrInfo::getMachineCombinerPatterns(
MachineInstr &Root,
- SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns) const {
+ SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
unsigned Opc = Root.getOpcode();
MachineBasicBlock &MBB = *Root.getParent();
bool Found = false;
@@ -2485,76 +2523,76 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
"ADDWrr does not have register operands");
if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
AArch64::WZR)) {
- Patterns.push_back(MachineCombinerPattern::MC_MULADDW_OP1);
+ Patterns.push_back(MachineCombinerPattern::MULADDW_OP1);
Found = true;
}
if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
AArch64::WZR)) {
- Patterns.push_back(MachineCombinerPattern::MC_MULADDW_OP2);
+ Patterns.push_back(MachineCombinerPattern::MULADDW_OP2);
Found = true;
}
break;
case AArch64::ADDXrr:
if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
AArch64::XZR)) {
- Patterns.push_back(MachineCombinerPattern::MC_MULADDX_OP1);
+ Patterns.push_back(MachineCombinerPattern::MULADDX_OP1);
Found = true;
}
if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
AArch64::XZR)) {
- Patterns.push_back(MachineCombinerPattern::MC_MULADDX_OP2);
+ Patterns.push_back(MachineCombinerPattern::MULADDX_OP2);
Found = true;
}
break;
case AArch64::SUBWrr:
if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
AArch64::WZR)) {
- Patterns.push_back(MachineCombinerPattern::MC_MULSUBW_OP1);
+ Patterns.push_back(MachineCombinerPattern::MULSUBW_OP1);
Found = true;
}
if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
AArch64::WZR)) {
- Patterns.push_back(MachineCombinerPattern::MC_MULSUBW_OP2);
+ Patterns.push_back(MachineCombinerPattern::MULSUBW_OP2);
Found = true;
}
break;
case AArch64::SUBXrr:
if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
AArch64::XZR)) {
- Patterns.push_back(MachineCombinerPattern::MC_MULSUBX_OP1);
+ Patterns.push_back(MachineCombinerPattern::MULSUBX_OP1);
Found = true;
}
if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
AArch64::XZR)) {
- Patterns.push_back(MachineCombinerPattern::MC_MULSUBX_OP2);
+ Patterns.push_back(MachineCombinerPattern::MULSUBX_OP2);
Found = true;
}
break;
case AArch64::ADDWri:
if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
AArch64::WZR)) {
- Patterns.push_back(MachineCombinerPattern::MC_MULADDWI_OP1);
+ Patterns.push_back(MachineCombinerPattern::MULADDWI_OP1);
Found = true;
}
break;
case AArch64::ADDXri:
if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
AArch64::XZR)) {
- Patterns.push_back(MachineCombinerPattern::MC_MULADDXI_OP1);
+ Patterns.push_back(MachineCombinerPattern::MULADDXI_OP1);
Found = true;
}
break;
case AArch64::SUBWri:
if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
AArch64::WZR)) {
- Patterns.push_back(MachineCombinerPattern::MC_MULSUBWI_OP1);
+ Patterns.push_back(MachineCombinerPattern::MULSUBWI_OP1);
Found = true;
}
break;
case AArch64::SUBXri:
if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
AArch64::XZR)) {
- Patterns.push_back(MachineCombinerPattern::MC_MULSUBXI_OP1);
+ Patterns.push_back(MachineCombinerPattern::MULSUBXI_OP1);
Found = true;
}
break;
@@ -2661,7 +2699,7 @@ static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
/// this function generates the instructions that could replace the
/// original code sequence
void AArch64InstrInfo::genAlternativeCodeSequence(
- MachineInstr &Root, MachineCombinerPattern::MC_PATTERN Pattern,
+ MachineInstr &Root, MachineCombinerPattern Pattern,
SmallVectorImpl<MachineInstr *> &InsInstrs,
SmallVectorImpl<MachineInstr *> &DelInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
@@ -2677,13 +2715,13 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
default:
// signal error.
break;
- case MachineCombinerPattern::MC_MULADDW_OP1:
- case MachineCombinerPattern::MC_MULADDX_OP1:
+ case MachineCombinerPattern::MULADDW_OP1:
+ case MachineCombinerPattern::MULADDX_OP1:
// MUL I=A,B,0
// ADD R,I,C
// ==> MADD R,A,B,C
// --- Create(MADD);
- if (Pattern == MachineCombinerPattern::MC_MULADDW_OP1) {
+ if (Pattern == MachineCombinerPattern::MULADDW_OP1) {
Opc = AArch64::MADDWrrr;
RC = &AArch64::GPR32RegClass;
} else {
@@ -2692,13 +2730,13 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
- case MachineCombinerPattern::MC_MULADDW_OP2:
- case MachineCombinerPattern::MC_MULADDX_OP2:
+ case MachineCombinerPattern::MULADDW_OP2:
+ case MachineCombinerPattern::MULADDX_OP2:
// MUL I=A,B,0
// ADD R,C,I
// ==> MADD R,A,B,C
// --- Create(MADD);
- if (Pattern == MachineCombinerPattern::MC_MULADDW_OP2) {
+ if (Pattern == MachineCombinerPattern::MULADDW_OP2) {
Opc = AArch64::MADDWrrr;
RC = &AArch64::GPR32RegClass;
} else {
@@ -2707,8 +2745,8 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
- case MachineCombinerPattern::MC_MULADDWI_OP1:
- case MachineCombinerPattern::MC_MULADDXI_OP1: {
+ case MachineCombinerPattern::MULADDWI_OP1:
+ case MachineCombinerPattern::MULADDXI_OP1: {
// MUL I=A,B,0
// ADD R,I,Imm
// ==> ORR V, ZR, Imm
@@ -2716,7 +2754,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
// --- Create(MADD);
const TargetRegisterClass *OrrRC;
unsigned BitSize, OrrOpc, ZeroReg;
- if (Pattern == MachineCombinerPattern::MC_MULADDWI_OP1) {
+ if (Pattern == MachineCombinerPattern::MULADDWI_OP1) {
OrrOpc = AArch64::ORRWri;
OrrRC = &AArch64::GPR32spRegClass;
BitSize = 32;
@@ -2751,8 +2789,8 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
break;
}
- case MachineCombinerPattern::MC_MULSUBW_OP1:
- case MachineCombinerPattern::MC_MULSUBX_OP1: {
+ case MachineCombinerPattern::MULSUBW_OP1:
+ case MachineCombinerPattern::MULSUBX_OP1: {
// MUL I=A,B,0
// SUB R,I, C
// ==> SUB V, 0, C
@@ -2760,7 +2798,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
// --- Create(MADD);
const TargetRegisterClass *SubRC;
unsigned SubOpc, ZeroReg;
- if (Pattern == MachineCombinerPattern::MC_MULSUBW_OP1) {
+ if (Pattern == MachineCombinerPattern::MULSUBW_OP1) {
SubOpc = AArch64::SUBWrr;
SubRC = &AArch64::GPR32spRegClass;
ZeroReg = AArch64::WZR;
@@ -2784,13 +2822,13 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
break;
}
- case MachineCombinerPattern::MC_MULSUBW_OP2:
- case MachineCombinerPattern::MC_MULSUBX_OP2:
+ case MachineCombinerPattern::MULSUBW_OP2:
+ case MachineCombinerPattern::MULSUBX_OP2:
// MUL I=A,B,0
// SUB R,C,I
// ==> MSUB R,A,B,C (computes C - A*B)
// --- Create(MSUB);
- if (Pattern == MachineCombinerPattern::MC_MULSUBW_OP2) {
+ if (Pattern == MachineCombinerPattern::MULSUBW_OP2) {
Opc = AArch64::MSUBWrrr;
RC = &AArch64::GPR32RegClass;
} else {
@@ -2799,8 +2837,8 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
- case MachineCombinerPattern::MC_MULSUBWI_OP1:
- case MachineCombinerPattern::MC_MULSUBXI_OP1: {
+ case MachineCombinerPattern::MULSUBWI_OP1:
+ case MachineCombinerPattern::MULSUBXI_OP1: {
// MUL I=A,B,0
// SUB R,I, Imm
// ==> ORR V, ZR, -Imm
@@ -2808,7 +2846,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
// --- Create(MADD);
const TargetRegisterClass *OrrRC;
unsigned BitSize, OrrOpc, ZeroReg;
- if (Pattern == MachineCombinerPattern::MC_MULSUBWI_OP1) {
+ if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) {
OrrOpc = AArch64::ORRWri;
OrrRC = &AArch64::GPR32spRegClass;
BitSize = 32;
@@ -2944,3 +2982,34 @@ bool AArch64InstrInfo::optimizeCondBranch(MachineInstr *MI) const {
MI->eraseFromParent();
return true;
}
+
+std::pair<unsigned, unsigned>
+AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+ const unsigned Mask = AArch64II::MO_FRAGMENT;
+ return std::make_pair(TF & Mask, TF & ~Mask);
+}
+
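[Editor's note, not part of the imported patch: decomposeMachineOperandsTargetFlags splits one flags word into a single "fragment" selector plus orthogonal bitmask flags. A standalone sketch of that mask split with invented flag values; the real layout lives in AArch64II and is not reproduced here.]

    #include <cassert>
    #include <utility>

    // Illustrative flag layout only.
    enum : unsigned { FRAGMENT_MASK = 0x7, FLAG_GOT = 0x8, FLAG_NC = 0x10 };

    // Low bits select exactly one "fragment" kind; the remaining bits are
    // independent bitmask flags, matching the shape of the hook above.
    static std::pair<unsigned, unsigned> decompose(unsigned TF) {
      return {TF & FRAGMENT_MASK, TF & ~FRAGMENT_MASK};
    }

    int main() {
      auto P = decompose(0x2 | FLAG_GOT | FLAG_NC);
      assert(P.first == 0x2);                    // the fragment part
      assert(P.second == (FLAG_GOT | FLAG_NC));  // the bitmask part
      return 0;
    }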
+ArrayRef<std::pair<unsigned, const char *>>
+AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+ using namespace AArch64II;
+ static const std::pair<unsigned, const char *> TargetFlags[] = {
+ {MO_PAGE, "aarch64-page"},
+ {MO_PAGEOFF, "aarch64-pageoff"},
+ {MO_G3, "aarch64-g3"},
+ {MO_G2, "aarch64-g2"},
+ {MO_G1, "aarch64-g1"},
+ {MO_G0, "aarch64-g0"},
+ {MO_HI12, "aarch64-hi12"}};
+ return makeArrayRef(TargetFlags);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
+ using namespace AArch64II;
+ static const std::pair<unsigned, const char *> TargetFlags[] = {
+ {MO_GOT, "aarch64-got"},
+ {MO_NC, "aarch64-nc"},
+ {MO_TLS, "aarch64-tls"},
+ {MO_CONSTPOOL, "aarch64-constant-pool"}};
+ return makeArrayRef(TargetFlags);
+}
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index 68c2a2882580..ae02822a32e6 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -167,13 +167,13 @@ public:
/// for an instruction chain ending in <Root>. All potential patterns are
/// listed in the <Patterns> array.
bool getMachineCombinerPatterns(MachineInstr &Root,
- SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns)
+ SmallVectorImpl<MachineCombinerPattern> &Patterns)
const override;
/// When getMachineCombinerPatterns() finds patterns, this function generates
/// the instructions that could replace the original code sequence
void genAlternativeCodeSequence(
- MachineInstr &Root, MachineCombinerPattern::MC_PATTERN Pattern,
+ MachineInstr &Root, MachineCombinerPattern Pattern,
SmallVectorImpl<MachineInstr *> &InsInstrs,
SmallVectorImpl<MachineInstr *> &DelInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
@@ -181,6 +181,14 @@ public:
bool useMachineCombiner() const override;
bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+
+ std::pair<unsigned, unsigned>
+ decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableDirectMachineOperandTargetFlags() const override;
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableBitmaskMachineOperandTargetFlags() const override;
+
private:
void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL,
MachineBasicBlock *TBB,
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index fa1a46acba84..d02bc9ff394d 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -16,6 +16,8 @@
//
def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">,
AssemblerPredicate<"HasV8_1aOps", "armv8.1a">;
+def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">,
+ AssemblerPredicate<"HasV8_2aOps", "armv8.2a">;
def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">,
AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">;
def HasNEON : Predicate<"Subtarget->hasNEON()">,
@@ -24,6 +26,12 @@ def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
AssemblerPredicate<"FeatureCrypto", "crypto">;
def HasCRC : Predicate<"Subtarget->hasCRC()">,
AssemblerPredicate<"FeatureCRC", "crc">;
+def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">;
+def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
+ AssemblerPredicate<"FeatureFullFP16", "fullfp16">;
+def HasSPE : Predicate<"Subtarget->hasSPE()">,
+ AssemblerPredicate<"FeatureSPE", "spe">;
+
def IsLE : Predicate<"Subtarget->isLittleEndian()">;
def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
def IsCyclone : Predicate<"Subtarget->isCyclone()">;
@@ -66,6 +74,20 @@ def SDT_AArch64CSel : SDTypeProfile<1, 4,
SDTCisSameAs<0, 2>,
SDTCisInt<3>,
SDTCisVT<4, i32>]>;
+def SDT_AArch64CCMP : SDTypeProfile<1, 5,
+ [SDTCisVT<0, i32>,
+ SDTCisInt<1>,
+ SDTCisSameAs<1, 2>,
+ SDTCisInt<3>,
+ SDTCisInt<4>,
+ SDTCisVT<5, i32>]>;
+def SDT_AArch64FCCMP : SDTypeProfile<1, 5,
+ [SDTCisVT<0, i32>,
+ SDTCisFP<1>,
+ SDTCisSameAs<1, 2>,
+ SDTCisInt<3>,
+ SDTCisInt<4>,
+ SDTCisVT<5, i32>]>;
def SDT_AArch64FCmp : SDTypeProfile<0, 2,
[SDTCisFP<0>,
SDTCisSameAs<0, 1>]>;
@@ -160,13 +182,14 @@ def AArch64and_flag : SDNode<"AArch64ISD::ANDS", SDTBinaryArithWithFlagsOut,
def AArch64adc_flag : SDNode<"AArch64ISD::ADCS", SDTBinaryArithWithFlagsInOut>;
def AArch64sbc_flag : SDNode<"AArch64ISD::SBCS", SDTBinaryArithWithFlagsInOut>;
+def AArch64ccmp : SDNode<"AArch64ISD::CCMP", SDT_AArch64CCMP>;
+def AArch64ccmn : SDNode<"AArch64ISD::CCMN", SDT_AArch64CCMP>;
+def AArch64fccmp : SDNode<"AArch64ISD::FCCMP", SDT_AArch64FCCMP>;
+
def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>;
def AArch64fcmp : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>;
-def AArch64fmax : SDNode<"AArch64ISD::FMAX", SDTFPBinOp>;
-def AArch64fmin : SDNode<"AArch64ISD::FMIN", SDTFPBinOp>;
-
def AArch64dup : SDNode<"AArch64ISD::DUP", SDT_AArch64Dup>;
def AArch64duplane8 : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>;
def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>;
@@ -361,6 +384,9 @@ def : InstAlias<"wfi", (HINT 0b011)>;
def : InstAlias<"sev", (HINT 0b100)>;
def : InstAlias<"sevl", (HINT 0b101)>;
+// v8.2a Statistical Profiling extension
+def : InstAlias<"psb $op", (HINT psbhint_op:$op)>, Requires<[HasSPE]>;
+
// As far as LLVM is concerned this writes to the system's exclusive monitors.
let mayLoad = 1, mayStore = 1 in
def CLREX : CRmSystemI<imm0_15, 0b010, "clrex">;
@@ -383,12 +409,17 @@ def : InstAlias<"isb", (ISB 0xf)>;
def MRS : MRSI;
def MSR : MSRI;
-def MSRpstate: MSRpstateI;
+def MSRpstateImm1 : MSRpstateImm0_1;
+def MSRpstateImm4 : MSRpstateImm0_15;
// The thread pointer (on Linux, at least, where this has been implemented) is
// TPIDR_EL0.
def : Pat<(AArch64threadpointer), (MRS 0xde82)>;
+// The cycle counter PMC register is PMCCNTR_EL0.
+let Predicates = [HasPerfMon] in
+def : Pat<(readcyclecounter), (MRS 0xdce8)>;
+
// Generic system instructions
def SYSxt : SystemXtI<0, "sys">;
def SYSLxt : SystemLXtI<1, "sysl">;
@@ -595,10 +626,12 @@ def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm),
(SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>;
def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm),
(SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>;
+let AddedComplexity = 1 in {
def : Pat<(sub GPR32sp:$R2, arith_extended_reg32<i32>:$R3),
(SUBSWrx GPR32sp:$R2, arith_extended_reg32<i32>:$R3)>;
def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3),
(SUBSXrx GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3)>;
+}
// Because of the immediate format for add/sub-imm instructions, the
// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1).
@@ -823,7 +856,7 @@ defm AND : LogicalReg<0b00, 0, "and", and>;
defm BIC : LogicalReg<0b00, 1, "bic",
BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
defm EON : LogicalReg<0b10, 1, "eon",
- BinOpFrag<(xor node:$LHS, (not node:$RHS))>>;
+ BinOpFrag<(not (xor node:$LHS, node:$RHS))>>;
defm EOR : LogicalReg<0b10, 0, "eor", xor>;
defm ORN : LogicalReg<0b01, 1, "orn",
BinOpFrag<(or node:$LHS, (not node:$RHS))>>;
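[Editor's note, not part of the imported patch: the new EON fragment is the same boolean function as the old one, since a ^ ~b == ~(a ^ b). A throwaway C++ check of the identity:]

    #include <cassert>
    #include <cstdint>

    int main() {
      // Exhaustive check over 8-bit values that a ^ ~b equals ~(a ^ b).
      for (unsigned A = 0; A < 256; ++A)
        for (unsigned B = 0; B < 256; ++B)
          assert((uint8_t)(A ^ ~B) == (uint8_t)~(A ^ B));
      return 0;
    }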
@@ -1020,13 +1053,10 @@ def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;
//===----------------------------------------------------------------------===//
-// Conditionally set flags instructions.
+// Conditional comparison instructions.
//===----------------------------------------------------------------------===//
-defm CCMN : CondSetFlagsImm<0, "ccmn">;
-defm CCMP : CondSetFlagsImm<1, "ccmp">;
-
-defm CCMN : CondSetFlagsReg<0, "ccmn">;
-defm CCMP : CondSetFlagsReg<1, "ccmp">;
+defm CCMN : CondComparison<0, "ccmn", AArch64ccmn>;
+defm CCMP : CondComparison<1, "ccmp", AArch64ccmp>;
//===----------------------------------------------------------------------===//
// Conditional select instructions.
@@ -2421,6 +2451,26 @@ defm FCVTZS_Int : FPToIntegerScaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvt
defm FCVTZU_Int : FPToIntegerScaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>;
}
+multiclass FPToIntegerPats<SDNode to_int, SDNode round, string INST> {
+ def : Pat<(i32 (to_int (round f32:$Rn))),
+ (!cast<Instruction>(INST # UWSr) f32:$Rn)>;
+ def : Pat<(i64 (to_int (round f32:$Rn))),
+ (!cast<Instruction>(INST # UXSr) f32:$Rn)>;
+ def : Pat<(i32 (to_int (round f64:$Rn))),
+ (!cast<Instruction>(INST # UWDr) f64:$Rn)>;
+ def : Pat<(i64 (to_int (round f64:$Rn))),
+ (!cast<Instruction>(INST # UXDr) f64:$Rn)>;
+}
+
+defm : FPToIntegerPats<fp_to_sint, fceil, "FCVTPS">;
+defm : FPToIntegerPats<fp_to_uint, fceil, "FCVTPU">;
+defm : FPToIntegerPats<fp_to_sint, ffloor, "FCVTMS">;
+defm : FPToIntegerPats<fp_to_uint, ffloor, "FCVTMU">;
+defm : FPToIntegerPats<fp_to_sint, ftrunc, "FCVTZS">;
+defm : FPToIntegerPats<fp_to_uint, ftrunc, "FCVTZU">;
+defm : FPToIntegerPats<fp_to_sint, frnd, "FCVTAS">;
+defm : FPToIntegerPats<fp_to_uint, frnd, "FCVTAU">;
+
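[Editor's note, not part of the imported patch: these patterns target the source-level idiom of a libm rounding call followed by an integer conversion, which then selects to a single round-and-convert instruction (floor to FCVTMS/FCVTMU, ceil to FCVTPS/FCVTPU, trunc to FCVTZS/FCVTZU, round to FCVTAS/FCVTAU, per the defm lines above). A scalar C++ view of the idiom:]

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    // Each helper is a rounding call followed by a conversion -- the shape the
    // FPToIntegerPats patterns fold into one instruction.
    static int64_t floorToInt(double X) { return (int64_t)std::floor(X); }
    static int64_t ceilToInt(double X)  { return (int64_t)std::ceil(X); }
    static int64_t truncToInt(double X) { return (int64_t)std::trunc(X); }

    int main() {
      assert(floorToInt(2.7) == 2);
      assert(ceilToInt(2.2) == 3);
      assert(truncToInt(-2.7) == -2);
      return 0;
    }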
//===----------------------------------------------------------------------===//
// Scaled integer to floating point conversion instructions.
//===----------------------------------------------------------------------===//
@@ -2466,14 +2516,7 @@ defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>;
def : Pat<(v1f64 (int_aarch64_neon_frintn (v1f64 FPR64:$Rn))),
(FRINTNDr FPR64:$Rn)>;
-// FRINTX is inserted to set the flags as required by FENV_ACCESS ON behavior
-// in the C spec. Setting hasSideEffects ensures it is not DCE'd.
-// <rdar://problem/13715968>
-// TODO: We should really model the FPSR flags correctly. This is really ugly.
-let hasSideEffects = 1 in {
defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>;
-}
-
defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>;
let SchedRW = [WriteFDiv] in {
@@ -2488,23 +2531,23 @@ defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>;
let SchedRW = [WriteFDiv] in {
defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>;
}
-defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", int_aarch64_neon_fmaxnm>;
-defm FMAX : TwoOperandFPData<0b0100, "fmax", AArch64fmax>;
-defm FMINNM : TwoOperandFPData<0b0111, "fminnm", int_aarch64_neon_fminnm>;
-defm FMIN : TwoOperandFPData<0b0101, "fmin", AArch64fmin>;
+defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", fmaxnum>;
+defm FMAX : TwoOperandFPData<0b0100, "fmax", fmaxnan>;
+defm FMINNM : TwoOperandFPData<0b0111, "fminnm", fminnum>;
+defm FMIN : TwoOperandFPData<0b0101, "fmin", fminnan>;
let SchedRW = [WriteFMul] in {
defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>;
defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>;
}
defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>;
-def : Pat<(v1f64 (AArch64fmax (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+def : Pat<(v1f64 (fmaxnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(FMAXDrr FPR64:$Rn, FPR64:$Rm)>;
-def : Pat<(v1f64 (AArch64fmin (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+def : Pat<(v1f64 (fminnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(FMINDrr FPR64:$Rn, FPR64:$Rm)>;
-def : Pat<(v1f64 (int_aarch64_neon_fmaxnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+def : Pat<(v1f64 (fmaxnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>;
-def : Pat<(v1f64 (int_aarch64_neon_fminnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+def : Pat<(v1f64 (fminnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(FMINNMDrr FPR64:$Rn, FPR64:$Rm)>;
//===----------------------------------------------------------------------===//
@@ -2556,7 +2599,7 @@ defm FCMP : FPComparison<0, "fcmp", AArch64fcmp>;
//===----------------------------------------------------------------------===//
defm FCCMPE : FPCondComparison<1, "fccmpe">;
-defm FCCMP : FPCondComparison<0, "fccmp">;
+defm FCCMP : FPCondComparison<0, "fccmp", AArch64fccmp>;
//===----------------------------------------------------------------------===//
// Floating point conditional select instruction.
@@ -2589,6 +2632,40 @@ defm FMOV : FPMoveImmediate<"fmov">;
// Advanced SIMD two vector instructions.
//===----------------------------------------------------------------------===//
+defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
+ int_aarch64_neon_uabd>;
+// Match UABDL in log2-shuffle patterns.
+def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))),
+ (v8i16 (add (sub (zext (v8i8 V64:$opA)),
+ (zext (v8i8 V64:$opB))),
+ (AArch64vashr v8i16:$src, (i32 15))))),
+ (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>;
+def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))),
+ (v8i16 (add (sub (zext (extract_high_v16i8 V128:$opA)),
+ (zext (extract_high_v16i8 V128:$opB))),
+ (AArch64vashr v8i16:$src, (i32 15))))),
+ (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>;
+def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))),
+ (v4i32 (add (sub (zext (v4i16 V64:$opA)),
+ (zext (v4i16 V64:$opB))),
+ (AArch64vashr v4i32:$src, (i32 31))))),
+ (UABDLv4i16_v4i32 V64:$opA, V64:$opB)>;
+def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))),
+ (v4i32 (add (sub (zext (extract_high_v8i16 V128:$opA)),
+ (zext (extract_high_v8i16 V128:$opB))),
+ (AArch64vashr v4i32:$src, (i32 31))))),
+ (UABDLv8i16_v4i32 V128:$opA, V128:$opB)>;
+def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))),
+ (v2i64 (add (sub (zext (v2i32 V64:$opA)),
+ (zext (v2i32 V64:$opB))),
+ (AArch64vashr v2i64:$src, (i32 63))))),
+ (UABDLv2i32_v2i64 V64:$opA, V64:$opB)>;
+def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))),
+ (v2i64 (add (sub (zext (extract_high_v4i32 V128:$opA)),
+ (zext (extract_high_v4i32 V128:$opB))),
+ (AArch64vashr v2i64:$src, (i32 63))))),
+ (UABDLv4i32_v2i64 V128:$opA, V128:$opB)>;
+
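[Editor's note, not part of the imported patch: the log2-shuffle patterns above recognize an open-coded widening absolute difference written with the sign-mask identity |d| = (d + s) ^ s, where s is d arithmetically shifted right by width-1 (0 or all-ones). A scalar C++ check of the identity for one lane:]

    #include <cassert>
    #include <cstdint>

    // Widening absolute difference of two unsigned bytes via the same identity
    // the UABDL patterns match: d = zext(a) - zext(b); |d| = (d + s) ^ s.
    static uint16_t absDiffU8(uint8_t A, uint8_t B) {
      int16_t D = (int16_t)((uint16_t)A - (uint16_t)B);
      // In the pattern s comes from an arithmetic shift right by 15; computed
      // portably here as 0 for non-negative d and -1 (all-ones) otherwise.
      int16_t S = (D < 0) ? -1 : 0;
      return (uint16_t)((D + S) ^ S);
    }

    int main() {
      assert(absDiffU8(10, 3) == 7);
      assert(absDiffU8(3, 10) == 7);
      assert(absDiffU8(0, 255) == 255);
      return 0;
    }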
defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>;
def : Pat<(xor (v8i8 (AArch64vashr V64:$src, (i32 7))),
(v8i8 (add V64:$src, (AArch64vashr V64:$src, (i32 7))))),
@@ -2780,29 +2857,29 @@ defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>;
defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>;
defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>;
defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>;
-defm FABD : SIMDThreeSameVectorFP<1,1,0b11010,"fabd", int_aarch64_neon_fabd>;
-defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b11101,"facge",int_aarch64_neon_facge>;
-defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b11101,"facgt",int_aarch64_neon_facgt>;
-defm FADDP : SIMDThreeSameVectorFP<1,0,0b11010,"faddp",int_aarch64_neon_addp>;
-defm FADD : SIMDThreeSameVectorFP<0,0,0b11010,"fadd", fadd>;
-defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>;
-defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>;
-defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>;
-defm FDIV : SIMDThreeSameVectorFP<1,0,0b11111,"fdiv", fdiv>;
-defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b11000,"fmaxnmp", int_aarch64_neon_fmaxnmp>;
-defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b11000,"fmaxnm", int_aarch64_neon_fmaxnm>;
-defm FMAXP : SIMDThreeSameVectorFP<1,0,0b11110,"fmaxp", int_aarch64_neon_fmaxp>;
-defm FMAX : SIMDThreeSameVectorFP<0,0,0b11110,"fmax", AArch64fmax>;
-defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b11000,"fminnmp", int_aarch64_neon_fminnmp>;
-defm FMINNM : SIMDThreeSameVectorFP<0,1,0b11000,"fminnm", int_aarch64_neon_fminnm>;
-defm FMINP : SIMDThreeSameVectorFP<1,1,0b11110,"fminp", int_aarch64_neon_fminp>;
-defm FMIN : SIMDThreeSameVectorFP<0,1,0b11110,"fmin", AArch64fmin>;
+defm FABD : SIMDThreeSameVectorFP<1,1,0b010,"fabd", int_aarch64_neon_fabd>;
+defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>;
+defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>;
+defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_addp>;
+defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", fadd>;
+defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
+defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
+defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
+defm FDIV : SIMDThreeSameVectorFP<1,0,0b111,"fdiv", fdiv>;
+defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b000,"fmaxnmp", int_aarch64_neon_fmaxnmp>;
+defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b000,"fmaxnm", fmaxnum>;
+defm FMAXP : SIMDThreeSameVectorFP<1,0,0b110,"fmaxp", int_aarch64_neon_fmaxp>;
+defm FMAX : SIMDThreeSameVectorFP<0,0,0b110,"fmax", fmaxnan>;
+defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b000,"fminnmp", int_aarch64_neon_fminnmp>;
+defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", fminnum>;
+defm FMINP : SIMDThreeSameVectorFP<1,1,0b110,"fminp", int_aarch64_neon_fminp>;
+defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", fminnan>;
// NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the
// instruction expects the addend first, while the fma intrinsic puts it last.
-defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b11001, "fmla",
+defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b001, "fmla",
TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
-defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b11001, "fmls",
+defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b001, "fmls",
TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
// The following def pats catch the case where the LHS of an FMA is negated.
@@ -2816,11 +2893,11 @@ def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
(FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>;
-defm FMULX : SIMDThreeSameVectorFP<0,0,0b11011,"fmulx", int_aarch64_neon_fmulx>;
-defm FMUL : SIMDThreeSameVectorFP<1,0,0b11011,"fmul", fmul>;
-defm FRECPS : SIMDThreeSameVectorFP<0,0,0b11111,"frecps", int_aarch64_neon_frecps>;
-defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b11111,"frsqrts", int_aarch64_neon_frsqrts>;
-defm FSUB : SIMDThreeSameVectorFP<0,1,0b11010,"fsub", fsub>;
+defm FMULX : SIMDThreeSameVectorFP<0,0,0b011,"fmulx", int_aarch64_neon_fmulx>;
+defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", fmul>;
+defm FRECPS : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>;
+defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b111,"frsqrts", int_aarch64_neon_frsqrts>;
+defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", fsub>;
defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla",
TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >;
defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls",
@@ -2833,9 +2910,9 @@ defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_aarch64_neon_sabd>;
defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_aarch64_neon_shadd>;
defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>;
defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>;
-defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", int_aarch64_neon_smax>;
+defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", smax>;
defm SMINP : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_aarch64_neon_sminp>;
-defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", int_aarch64_neon_smin>;
+defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", smin>;
defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", int_aarch64_neon_sqadd>;
defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_aarch64_neon_sqdmulh>;
defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrdmulh>;
@@ -2852,9 +2929,9 @@ defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_aarch64_neon_uabd>;
defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_aarch64_neon_uhadd>;
defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>;
defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>;
-defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", int_aarch64_neon_umax>;
+defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", umax>;
defm UMINP : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_aarch64_neon_uminp>;
-defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", int_aarch64_neon_umin>;
+defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", umin>;
defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>;
defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>;
defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>;
@@ -2879,54 +2956,6 @@ defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn",
BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >;
defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;
-def : Pat<(v8i8 (smin V64:$Rn, V64:$Rm)),
- (SMINv8i8 V64:$Rn, V64:$Rm)>;
-def : Pat<(v4i16 (smin V64:$Rn, V64:$Rm)),
- (SMINv4i16 V64:$Rn, V64:$Rm)>;
-def : Pat<(v2i32 (smin V64:$Rn, V64:$Rm)),
- (SMINv2i32 V64:$Rn, V64:$Rm)>;
-def : Pat<(v16i8 (smin V128:$Rn, V128:$Rm)),
- (SMINv16i8 V128:$Rn, V128:$Rm)>;
-def : Pat<(v8i16 (smin V128:$Rn, V128:$Rm)),
- (SMINv8i16 V128:$Rn, V128:$Rm)>;
-def : Pat<(v4i32 (smin V128:$Rn, V128:$Rm)),
- (SMINv4i32 V128:$Rn, V128:$Rm)>;
-def : Pat<(v8i8 (smax V64:$Rn, V64:$Rm)),
- (SMAXv8i8 V64:$Rn, V64:$Rm)>;
-def : Pat<(v4i16 (smax V64:$Rn, V64:$Rm)),
- (SMAXv4i16 V64:$Rn, V64:$Rm)>;
-def : Pat<(v2i32 (smax V64:$Rn, V64:$Rm)),
- (SMAXv2i32 V64:$Rn, V64:$Rm)>;
-def : Pat<(v16i8 (smax V128:$Rn, V128:$Rm)),
- (SMAXv16i8 V128:$Rn, V128:$Rm)>;
-def : Pat<(v8i16 (smax V128:$Rn, V128:$Rm)),
- (SMAXv8i16 V128:$Rn, V128:$Rm)>;
-def : Pat<(v4i32 (smax V128:$Rn, V128:$Rm)),
- (SMAXv4i32 V128:$Rn, V128:$Rm)>;
-def : Pat<(v8i8 (umin V64:$Rn, V64:$Rm)),
- (UMINv8i8 V64:$Rn, V64:$Rm)>;
-def : Pat<(v4i16 (umin V64:$Rn, V64:$Rm)),
- (UMINv4i16 V64:$Rn, V64:$Rm)>;
-def : Pat<(v2i32 (umin V64:$Rn, V64:$Rm)),
- (UMINv2i32 V64:$Rn, V64:$Rm)>;
-def : Pat<(v16i8 (umin V128:$Rn, V128:$Rm)),
- (UMINv16i8 V128:$Rn, V128:$Rm)>;
-def : Pat<(v8i16 (umin V128:$Rn, V128:$Rm)),
- (UMINv8i16 V128:$Rn, V128:$Rm)>;
-def : Pat<(v4i32 (umin V128:$Rn, V128:$Rm)),
- (UMINv4i32 V128:$Rn, V128:$Rm)>;
-def : Pat<(v8i8 (umax V64:$Rn, V64:$Rm)),
- (UMAXv8i8 V64:$Rn, V64:$Rm)>;
-def : Pat<(v4i16 (umax V64:$Rn, V64:$Rm)),
- (UMAXv4i16 V64:$Rn, V64:$Rm)>;
-def : Pat<(v2i32 (umax V64:$Rn, V64:$Rm)),
- (UMAXv2i32 V64:$Rn, V64:$Rm)>;
-def : Pat<(v16i8 (umax V128:$Rn, V128:$Rm)),
- (UMAXv16i8 V128:$Rn, V128:$Rm)>;
-def : Pat<(v8i16 (umax V128:$Rn, V128:$Rm)),
- (UMAXv8i16 V128:$Rn, V128:$Rm)>;
-def : Pat<(v4i32 (umax V128:$Rn, V128:$Rm)),
- (UMAXv4i32 V128:$Rn, V128:$Rm)>;
def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
(BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
@@ -3052,6 +3081,14 @@ def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" #
"|cmlt.2d\t$dst, $src1, $src2}",
(CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+let Predicates = [HasNEON, HasFullFP16] in {
+def : InstAlias<"{fcmle\t$dst.4h, $src1.4h, $src2.4h" #
+ "|fcmle.4h\t$dst, $src1, $src2}",
+ (FCMGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{fcmle\t$dst.8h, $src1.8h, $src2.8h" #
+ "|fcmle.8h\t$dst, $src1, $src2}",
+ (FCMGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
+}
def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" #
"|fcmle.2s\t$dst, $src1, $src2}",
(FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
@@ -3062,6 +3099,14 @@ def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" #
"|fcmle.2d\t$dst, $src1, $src2}",
(FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+let Predicates = [HasNEON, HasFullFP16] in {
+def : InstAlias<"{fcmlt\t$dst.4h, $src1.4h, $src2.4h" #
+ "|fcmlt.4h\t$dst, $src1, $src2}",
+ (FCMGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{fcmlt\t$dst.8h, $src1.8h, $src2.8h" #
+ "|fcmlt.8h\t$dst, $src1, $src2}",
+ (FCMGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
+}
def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" #
"|fcmlt.2s\t$dst, $src1, $src2}",
(FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
@@ -3072,6 +3117,14 @@ def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" #
"|fcmlt.2d\t$dst, $src1, $src2}",
(FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+let Predicates = [HasNEON, HasFullFP16] in {
+def : InstAlias<"{facle\t$dst.4h, $src1.4h, $src2.4h" #
+ "|facle.4h\t$dst, $src1, $src2}",
+ (FACGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{facle\t$dst.8h, $src1.8h, $src2.8h" #
+ "|facle.8h\t$dst, $src1, $src2}",
+ (FACGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
+}
def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" #
"|facle.2s\t$dst, $src1, $src2}",
(FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
@@ -3082,6 +3135,14 @@ def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" #
"|facle.2d\t$dst, $src1, $src2}",
(FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+let Predicates = [HasNEON, HasFullFP16] in {
+def : InstAlias<"{faclt\t$dst.4h, $src1.4h, $src2.4h" #
+ "|faclt.4h\t$dst, $src1, $src2}",
+ (FACGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{faclt\t$dst.8h, $src1.8h, $src2.8h" #
+ "|faclt.8h\t$dst, $src1, $src2}",
+ (FACGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
+}
def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" #
"|faclt.2s\t$dst, $src1, $src2}",
(FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
@@ -3103,19 +3164,19 @@ defm CMGT : SIMDThreeScalarD<0, 0b00110, "cmgt", AArch64cmgt>;
defm CMHI : SIMDThreeScalarD<1, 0b00110, "cmhi", AArch64cmhi>;
defm CMHS : SIMDThreeScalarD<1, 0b00111, "cmhs", AArch64cmhs>;
defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>;
-defm FABD : SIMDThreeScalarSD<1, 1, 0b11010, "fabd", int_aarch64_sisd_fabd>;
+defm FABD : SIMDFPThreeScalar<1, 1, 0b010, "fabd", int_aarch64_sisd_fabd>;
def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(FABD64 FPR64:$Rn, FPR64:$Rm)>;
-defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b11101, "facge",
+defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b101, "facge",
int_aarch64_neon_facge>;
-defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b11101, "facgt",
+defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt",
int_aarch64_neon_facgt>;
-defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>;
-defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>;
-defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>;
-defm FMULX : SIMDThreeScalarSD<0, 0, 0b11011, "fmulx", int_aarch64_neon_fmulx>;
-defm FRECPS : SIMDThreeScalarSD<0, 0, 0b11111, "frecps", int_aarch64_neon_frecps>;
-defm FRSQRTS : SIMDThreeScalarSD<0, 1, 0b11111, "frsqrts", int_aarch64_neon_frsqrts>;
+defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
+defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
+defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
+defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx>;
+defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps>;
+defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts>;
defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>;
defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>;
defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
@@ -3198,35 +3259,35 @@ defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", AArch64cmgez>;
defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", AArch64cmgtz>;
defm CMLE : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", AArch64cmlez>;
defm CMLT : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", AArch64cmltz>;
-defm FCMEQ : SIMDCmpTwoScalarSD<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>;
-defm FCMGE : SIMDCmpTwoScalarSD<1, 1, 0b01100, "fcmge", AArch64fcmgez>;
-defm FCMGT : SIMDCmpTwoScalarSD<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>;
-defm FCMLE : SIMDCmpTwoScalarSD<1, 1, 0b01101, "fcmle", AArch64fcmlez>;
-defm FCMLT : SIMDCmpTwoScalarSD<0, 1, 0b01110, "fcmlt", AArch64fcmltz>;
-defm FCVTAS : SIMDTwoScalarSD< 0, 0, 0b11100, "fcvtas">;
-defm FCVTAU : SIMDTwoScalarSD< 1, 0, 0b11100, "fcvtau">;
-defm FCVTMS : SIMDTwoScalarSD< 0, 0, 0b11011, "fcvtms">;
-defm FCVTMU : SIMDTwoScalarSD< 1, 0, 0b11011, "fcvtmu">;
-defm FCVTNS : SIMDTwoScalarSD< 0, 0, 0b11010, "fcvtns">;
-defm FCVTNU : SIMDTwoScalarSD< 1, 0, 0b11010, "fcvtnu">;
-defm FCVTPS : SIMDTwoScalarSD< 0, 1, 0b11010, "fcvtps">;
-defm FCVTPU : SIMDTwoScalarSD< 1, 1, 0b11010, "fcvtpu">;
+defm FCMEQ : SIMDFPCmpTwoScalar<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>;
+defm FCMGE : SIMDFPCmpTwoScalar<1, 1, 0b01100, "fcmge", AArch64fcmgez>;
+defm FCMGT : SIMDFPCmpTwoScalar<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>;
+defm FCMLE : SIMDFPCmpTwoScalar<1, 1, 0b01101, "fcmle", AArch64fcmlez>;
+defm FCMLT : SIMDFPCmpTwoScalar<0, 1, 0b01110, "fcmlt", AArch64fcmltz>;
+defm FCVTAS : SIMDFPTwoScalar< 0, 0, 0b11100, "fcvtas">;
+defm FCVTAU : SIMDFPTwoScalar< 1, 0, 0b11100, "fcvtau">;
+defm FCVTMS : SIMDFPTwoScalar< 0, 0, 0b11011, "fcvtms">;
+defm FCVTMU : SIMDFPTwoScalar< 1, 0, 0b11011, "fcvtmu">;
+defm FCVTNS : SIMDFPTwoScalar< 0, 0, 0b11010, "fcvtns">;
+defm FCVTNU : SIMDFPTwoScalar< 1, 0, 0b11010, "fcvtnu">;
+defm FCVTPS : SIMDFPTwoScalar< 0, 1, 0b11010, "fcvtps">;
+defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu">;
def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
-defm FCVTZS : SIMDTwoScalarSD< 0, 1, 0b11011, "fcvtzs">;
-defm FCVTZU : SIMDTwoScalarSD< 1, 1, 0b11011, "fcvtzu">;
-defm FRECPE : SIMDTwoScalarSD< 0, 1, 0b11101, "frecpe">;
-defm FRECPX : SIMDTwoScalarSD< 0, 1, 0b11111, "frecpx">;
-defm FRSQRTE : SIMDTwoScalarSD< 1, 1, 0b11101, "frsqrte">;
+defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs">;
+defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu">;
+defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe">;
+defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx">;
+defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte">;
defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg",
UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
-defm SCVTF : SIMDTwoScalarCVTSD< 0, 0, 0b11101, "scvtf", AArch64sitof>;
+defm SCVTF : SIMDFPTwoScalarCVT< 0, 0, 0b11101, "scvtf", AArch64sitof>;
defm SQABS : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_aarch64_neon_sqabs>;
defm SQNEG : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_aarch64_neon_sqneg>;
defm SQXTN : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_aarch64_neon_scalar_sqxtn>;
defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_aarch64_neon_scalar_sqxtun>;
defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd",
int_aarch64_neon_suqadd>;
-defm UCVTF : SIMDTwoScalarCVTSD< 1, 0, 0b11101, "ucvtf", AArch64uitof>;
+defm UCVTF : SIMDFPTwoScalarCVT< 1, 0, 0b11101, "ucvtf", AArch64uitof>;
defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar_uqxtn>;
defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd",
int_aarch64_neon_usqadd>;
@@ -3390,8 +3451,6 @@ defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw",
BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>;
defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal",
int_aarch64_neon_uabd>;
-defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
- int_aarch64_neon_uabd>;
defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl",
BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>;
defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw",
@@ -3449,8 +3508,8 @@ defm : Neon_mulacc_widen_patterns<
// Patterns for 64-bit pmull
def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm),
(PMULLv1i64 V64:$Rn, V64:$Rm)>;
-def : Pat<(int_aarch64_neon_pmull64 (vector_extract (v2i64 V128:$Rn), (i64 1)),
- (vector_extract (v2i64 V128:$Rm), (i64 1))),
+def : Pat<(int_aarch64_neon_pmull64 (extractelt (v2i64 V128:$Rn), (i64 1)),
+ (extractelt (v2i64 V128:$Rm), (i64 1))),
(PMULLv2i64 V128:$Rn, V128:$Rm)>;
// CodeGen patterns for addhn and subhn instructions, which can actually be
@@ -3593,11 +3652,11 @@ defm CPY : SIMDScalarCPY<"cpy">;
//----------------------------------------------------------------------------
defm ADDP : SIMDPairwiseScalarD<0, 0b11011, "addp">;
-defm FADDP : SIMDPairwiseScalarSD<1, 0, 0b01101, "faddp">;
-defm FMAXNMP : SIMDPairwiseScalarSD<1, 0, 0b01100, "fmaxnmp">;
-defm FMAXP : SIMDPairwiseScalarSD<1, 0, 0b01111, "fmaxp">;
-defm FMINNMP : SIMDPairwiseScalarSD<1, 1, 0b01100, "fminnmp">;
-defm FMINP : SIMDPairwiseScalarSD<1, 1, 0b01111, "fminp">;
+defm FADDP : SIMDFPPairwiseScalar<0, 0b01101, "faddp">;
+defm FMAXNMP : SIMDFPPairwiseScalar<0, 0b01100, "fmaxnmp">;
+defm FMAXP : SIMDFPPairwiseScalar<0, 0b01111, "fmaxp">;
+defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">;
+defm FMINP : SIMDFPPairwiseScalar<1, 0b01111, "fminp">;
def : Pat<(v2i64 (AArch64saddv V128:$Rn)),
(INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>;
def : Pat<(v2i64 (AArch64uaddv V128:$Rn)),
@@ -3713,12 +3772,12 @@ defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
SDNodeXForm IdxXFORM> {
- def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v2i64 V128:$Rn),
+ def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v2i64 V128:$Rn),
imm:$idx))))),
(DUP V128:$Rn, (IdxXFORM imm:$idx))>;
- def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v1i64 V64:$Rn),
- imm:$idx))))),
+ def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v1i64 V64:$Rn),
+ imm:$idx))))),
(DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
}
@@ -3747,6 +3806,13 @@ def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))),
(i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>;
+def : Pat<(sext_inreg (i64 (anyext (i32 (vector_extract (v16i8 V128:$Rn),
+ VectorIndexB:$idx)))), i8),
+ (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(sext_inreg (i64 (anyext (i32 (vector_extract (v8i16 V128:$Rn),
+ VectorIndexH:$idx)))), i16),
+ (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>;
+
// Extracting i8 or i16 elements will have the zero-extend transformed to
// an 'and' mask by type legalization since neither i8 nor i16 are legal types
// for AArch64. Match these patterns here since UMOV already zeroes out the high
@@ -3784,6 +3850,11 @@ def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))),
(v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
(i64 FPR64:$Rn), dsub))>;
+def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
+ (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
+ (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+
def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))),
(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
@@ -3949,10 +4020,10 @@ defm UMAXV : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">;
defm UMINV : SIMDAcrossLanesBHS<1, 0b11010, "uminv">;
defm SADDLV : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">;
defm UADDLV : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">;
-defm FMAXNMV : SIMDAcrossLanesS<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>;
-defm FMAXV : SIMDAcrossLanesS<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>;
-defm FMINNMV : SIMDAcrossLanesS<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>;
-defm FMINV : SIMDAcrossLanesS<0b01111, 1, "fminv", int_aarch64_neon_fminv>;
+defm FMAXNMV : SIMDFPAcrossLanes<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>;
+defm FMAXV : SIMDFPAcrossLanes<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>;
+defm FMINNMV : SIMDFPAcrossLanes<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>;
+defm FMINV : SIMDFPAcrossLanes<0b01111, 1, "fminv", int_aarch64_neon_fminv>;
// Patterns for across-vector intrinsics, that have a node equivalent, that
// returns a vector (with only the low lane defined) instead of a scalar.
@@ -4199,15 +4270,23 @@ def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
// AdvSIMD FMOV
-def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1111, V128, fpimm8,
+def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1111, V128, fpimm8,
"fmov", ".2d",
[(set (v2f64 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
-def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1111, V64, fpimm8,
+def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1111, V64, fpimm8,
"fmov", ".2s",
[(set (v2f32 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>;
-def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1111, V128, fpimm8,
+def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1111, V128, fpimm8,
"fmov", ".4s",
[(set (v4f32 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
+let Predicates = [HasNEON, HasFullFP16] in {
+def FMOVv4f16_ns : SIMDModifiedImmVectorNoShift<0, 0, 1, 0b1111, V64, fpimm8,
+ "fmov", ".4h",
+ [(set (v4f16 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>;
+def FMOVv8f16_ns : SIMDModifiedImmVectorNoShift<1, 0, 1, 0b1111, V128, fpimm8,
+ "fmov", ".8h",
+ [(set (v8f16 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
+} // Predicates = [HasNEON, HasFullFP16]
// AdvSIMD MOVI
@@ -4235,7 +4314,7 @@ def : Pat<(v8i8 immAllOnesV), (MOVID (i32 255))>;
// The movi_edit node has the immediate value already encoded, so we use
// a plain imm0_255 in the pattern
let isReMaterializable = 1, isAsCheapAsAMove = 1 in
-def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1110, V128,
+def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128,
simdimmtype10,
"movi", ".2d",
[(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>;
@@ -4296,10 +4375,10 @@ def MOVIv4s_msl : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s",
(AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
// Per byte: 8b & 16b
-def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1110, V64, imm0_255,
+def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1110, V64, imm0_255,
"movi", ".8b",
[(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>;
-def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1110, V128, imm0_255,
+def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1110, V128, imm0_255,
"movi", ".16b",
[(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>;
@@ -4340,8 +4419,8 @@ def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s",
//----------------------------------------------------------------------------
let hasSideEffects = 0 in {
- defm FMLA : SIMDFPIndexedSDTied<0, 0b0001, "fmla">;
- defm FMLS : SIMDFPIndexedSDTied<0, 0b0101, "fmls">;
+ defm FMLA : SIMDFPIndexedTied<0, 0b0001, "fmla">;
+ defm FMLS : SIMDFPIndexedTied<0, 0b0101, "fmls">;
}
// NOTE: Operands are reordered in the FMLA/FMLS PatFrags because the
@@ -4349,18 +4428,18 @@ let hasSideEffects = 0 in {
// On the other hand, there are quite a few valid combinatorial options due to
// the commutativity of multiplication and the fact that (-x) * y = x * (-y).
-defm : SIMDFPIndexedSDTiedPatterns<"FMLA",
+defm : SIMDFPIndexedTiedPatterns<"FMLA",
TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>;
-defm : SIMDFPIndexedSDTiedPatterns<"FMLA",
+defm : SIMDFPIndexedTiedPatterns<"FMLA",
TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>;
-defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
+defm : SIMDFPIndexedTiedPatterns<"FMLS",
TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
-defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
+defm : SIMDFPIndexedTiedPatterns<"FMLS",
TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >;
-defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
+defm : SIMDFPIndexedTiedPatterns<"FMLS",
TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >;
-defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
+defm : SIMDFPIndexedTiedPatterns<"FMLS",
TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >;
multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> {
@@ -4424,7 +4503,9 @@ multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> {
(FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
V128:$Rm, VectorIndexS:$idx)>;
def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
- (vector_extract (v2f32 (fneg V64:$Rm)),
+ (vector_extract (v4f32 (insert_subvector undef,
+ (v2f32 (fneg V64:$Rm)),
+ (i32 0))),
VectorIndexS:$idx))),
(FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
(SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;
@@ -4442,8 +4523,8 @@ defm : FMLSIndexedAfterNegPatterns<
defm : FMLSIndexedAfterNegPatterns<
TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >;
-defm FMULX : SIMDFPIndexedSD<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>;
-defm FMUL : SIMDFPIndexedSD<0, 0b1001, "fmul", fmul>;
+defm FMULX : SIMDFPIndexed<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>;
+defm FMUL : SIMDFPIndexed<0, 0b1001, "fmul", fmul>;
def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
(FMULv2i32_indexed V64:$Rn,
@@ -4497,10 +4578,10 @@ def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
//----------------------------------------------------------------------------
// AdvSIMD scalar shift instructions
//----------------------------------------------------------------------------
-defm FCVTZS : SIMDScalarRShiftSD<0, 0b11111, "fcvtzs">;
-defm FCVTZU : SIMDScalarRShiftSD<1, 0b11111, "fcvtzu">;
-defm SCVTF : SIMDScalarRShiftSD<0, 0b11100, "scvtf">;
-defm UCVTF : SIMDScalarRShiftSD<1, 0b11100, "ucvtf">;
+defm FCVTZS : SIMDFPScalarRShift<0, 0b11111, "fcvtzs">;
+defm FCVTZU : SIMDFPScalarRShift<1, 0b11111, "fcvtzu">;
+defm SCVTF : SIMDFPScalarRShift<0, 0b11100, "scvtf">;
+defm UCVTF : SIMDFPScalarRShift<1, 0b11100, "ucvtf">;
// Codegen patterns for the above. We don't put these directly on the
// instructions because TableGen's type inference can't handle the truth.
// Having the same base pattern for fp <--> int totally freaks it out.
@@ -4573,7 +4654,7 @@ defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra",
//----------------------------------------------------------------------------
defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_aarch64_neon_vcvtfp2fxs>;
defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_aarch64_neon_vcvtfp2fxu>;
-defm SCVTF: SIMDVectorRShiftSDToFP<0, 0b11100, "scvtf",
+defm SCVTF: SIMDVectorRShiftToFP<0, 0b11100, "scvtf",
int_aarch64_neon_vcvtfxs2fp>;
defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn",
int_aarch64_neon_rshrn>;
@@ -4608,7 +4689,7 @@ defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll",
defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>;
defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra",
TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>;
-defm UCVTF : SIMDVectorRShiftSDToFP<1, 0b11100, "ucvtf",
+defm UCVTF : SIMDVectorRShiftToFP<1, 0b11100, "ucvtf",
int_aarch64_neon_vcvtfxu2fp>;
defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn",
int_aarch64_neon_uqrshrn>;
@@ -5133,10 +5214,10 @@ def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>;
def : Pat<(i64 (anyext GPR32:$src)),
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>;
-// When we need to explicitly zero-extend, we use an unsigned bitfield move
-// instruction (UBFM) on the enclosing super-reg.
+// When we need to explicitly zero-extend, we use a 32-bit MOV instruction and
+// then assert the extension has happened.
def : Pat<(i64 (zext GPR32:$src)),
- (UBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>;
+ (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
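+// For example, (i64 (zext GPR32:$src)) now selects to a plain 32-bit "mov"
+// (ORRWrs from WZR); an AArch64 write to a W register already zeroes bits
+// [63:32], so the SUBREG_TO_REG wrapper merely asserts that the extension
+// has happened.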
// To sign extend, we use a signed bitfield move instruction (SBFM) on the
// containing super-reg.
@@ -5801,6 +5882,21 @@ def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))),
(v16i8 (REV16v16i8 FPR128:$src))>;
}
+def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 0))),
+ (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 0))),
+ (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 0))),
+ (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 0))),
+ (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 0))),
+ (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 0))),
+ (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 0))),
+ (EXTRACT_SUBREG V128:$Rn, dsub)>;
+
def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))),
(EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))),
@@ -5852,6 +5948,45 @@ def : Pat<(i64 (int_aarch64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
def : Pat<(i64 (int_aarch64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
(URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+// Patterns for nontemporal/no-allocate stores.
+// We have to resort to tricks to turn a single-input store into a store pair,
+// because there is no single-input nontemporal store, only STNP.
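+// As a rough illustration, a nontemporal store of a v2i64 value in q0 ends up
+// as something like "mov d1, v0.d[1]" followed by "stnp d0, d1, [x0, #32]".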
+let Predicates = [IsLE] in {
+let AddedComplexity = 15 in {
+class NTStore128Pat<ValueType VT> :
+ Pat<(nontemporalstore (VT FPR128:$Rt),
+ (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
+ (STNPDi (EXTRACT_SUBREG FPR128:$Rt, dsub),
+ (CPYi64 FPR128:$Rt, (i64 1)),
+ GPR64sp:$Rn, simm7s8:$offset)>;
+
+def : NTStore128Pat<v2i64>;
+def : NTStore128Pat<v4i32>;
+def : NTStore128Pat<v8i16>;
+def : NTStore128Pat<v16i8>;
+
+class NTStore64Pat<ValueType VT> :
+ Pat<(nontemporalstore (VT FPR64:$Rt),
+ (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
+ (STNPSi (EXTRACT_SUBREG FPR64:$Rt, ssub),
+ (CPYi32 (SUBREG_TO_REG (i64 0), FPR64:$Rt, dsub), (i64 1)),
+ GPR64sp:$Rn, simm7s4:$offset)>;
+
+// FIXME: Shouldn't v1f64 loads/stores be promoted to v1i64?
+def : NTStore64Pat<v1f64>;
+def : NTStore64Pat<v1i64>;
+def : NTStore64Pat<v2i32>;
+def : NTStore64Pat<v4i16>;
+def : NTStore64Pat<v8i8>;
+
+def : Pat<(nontemporalstore GPR64:$Rt,
+ (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
+ (STNPWi (EXTRACT_SUBREG GPR64:$Rt, sub_32),
+ (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 32, 63), sub_32),
+ GPR64sp:$Rn, simm7s4:$offset)>;
+} // AddedComplexity = 15
+} // Predicates = [IsLE]
+
// Tail call return handling. These are all compiler pseudo-instructions,
// so no encoding information or anything like that.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 82f77a77ab5e..566aa2c9a9ba 100644
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -41,54 +41,85 @@ STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
"Number of load/store from unscaled generated");
+STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted");
+STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
+STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
cl::init(20), cl::Hidden);
-// Place holder while testing unscaled load/store combining
-static cl::opt<bool> EnableAArch64UnscaledMemOp(
- "aarch64-unscaled-mem-op", cl::Hidden,
- cl::desc("Allow AArch64 unscaled load/store combining"), cl::init(true));
+namespace llvm {
+void initializeAArch64LoadStoreOptPass(PassRegistry &);
+}
+
+#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
namespace {
+
+typedef struct LdStPairFlags {
+ // If a matching instruction is found, MergeForward is set to true if the
+ // merge is to remove the first instruction and replace the second with
+ // a pair-wise insn, and false if the reverse is true.
+ bool MergeForward;
+
+ // SExtIdx gives the index of the result of the load pair that must be
+ // extended. The value of SExtIdx assumes that the paired load produces the
+ // value in this order: (I, returned iterator), i.e., -1 means no value has
+ // to be extended, 0 means I, and 1 means the returned iterator.
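+ // For instance, when an LDRSWui is paired with a neighbouring LDRWui the
+ // merge forms an LDPWi, and SExtIdx records which of the two results still
+ // needs to be sign-extended afterwards.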
+ int SExtIdx;
+
+ LdStPairFlags() : MergeForward(false), SExtIdx(-1) {}
+
+ void setMergeForward(bool V = true) { MergeForward = V; }
+ bool getMergeForward() const { return MergeForward; }
+
+ void setSExtIdx(int V) { SExtIdx = V; }
+ int getSExtIdx() const { return SExtIdx; }
+
+} LdStPairFlags;
+
struct AArch64LoadStoreOpt : public MachineFunctionPass {
static char ID;
- AArch64LoadStoreOpt() : MachineFunctionPass(ID) {}
+ AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
+ initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
+ }
const AArch64InstrInfo *TII;
const TargetRegisterInfo *TRI;
+ const AArch64Subtarget *Subtarget;
// Scan the instructions looking for a load/store that can be combined
// with the current instruction into a load/store pair.
// Return the matching instruction if one is found, else MBB->end().
- // If a matching instruction is found, MergeForward is set to true if the
- // merge is to remove the first instruction and replace the second with
- // a pair-wise insn, and false if the reverse is true.
- // \p SExtIdx[out] gives the index of the result of the load pair that
- // must be extended. The value of SExtIdx assumes that the paired load
- // produces the value in this order: (I, returned iterator), i.e.,
- // -1 means no value has to be extended, 0 means I, and 1 means the
- // returned iterator.
MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
- bool &MergeForward, int &SExtIdx,
+ LdStPairFlags &Flags,
unsigned Limit);
+
+ // Scan the instructions looking for a store that writes to the address from
+ // which the current load instruction reads. Return true if one is found.
+ bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
+ MachineBasicBlock::iterator &StoreI);
+
// Merge the two instructions indicated into a single pair-wise instruction.
// If MergeForward is true, erase the first instruction and fold its
// operation into the second. If false, the reverse. Return the instruction
// following the first instruction (which may change during processing).
- // \p SExtIdx index of the result that must be extended for a paired load.
- // -1 means none, 0 means I, and 1 means Paired.
MachineBasicBlock::iterator
mergePairedInsns(MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator Paired, bool MergeForward,
- int SExtIdx);
+ MachineBasicBlock::iterator Paired,
+ const LdStPairFlags &Flags);
+
+ // Promote the load that reads directly from the address stored to.
+ MachineBasicBlock::iterator
+ promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
+ MachineBasicBlock::iterator StoreI);
// Scan the instruction list to find a base register update that can
// be combined with the current instruction (a load or store) using
// pre or post indexed addressing with writeback. Scan forwards.
MachineBasicBlock::iterator
findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit,
- int Value);
+ int UnscaledOffset);
// Scan the instruction list to find a base register update that can
// be combined with the current instruction (a load or store) using
@@ -96,97 +127,177 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
MachineBasicBlock::iterator
findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
- // Merge a pre-index base register update into a ld/st instruction.
- MachineBasicBlock::iterator
- mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator Update);
+ // Find an instruction that updates the base register of the ld/st
+ // instruction.
+ bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI,
+ unsigned BaseReg, int Offset);
- // Merge a post-index base register update into a ld/st instruction.
+ // Merge a pre- or post-index base register update into a ld/st instruction.
MachineBasicBlock::iterator
- mergePostIdxUpdateInsn(MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator Update);
+ mergeUpdateInsn(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Update, bool IsPreIdx);
+
+ // Find and merge foldable ldr/str instructions.
+ bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);
- bool optimizeBlock(MachineBasicBlock &MBB);
+ // Find and promote load instructions which read directly from a store.
+ bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
+
+ // Check if converting two narrow loads into a single wider load with
+ // bitfield extracts could be enabled.
+ bool enableNarrowLdMerge(MachineFunction &Fn);
+
+ bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);
bool runOnMachineFunction(MachineFunction &Fn) override;
const char *getPassName() const override {
- return "AArch64 load / store optimization pass";
+ return AARCH64_LOAD_STORE_OPT_NAME;
}
-
-private:
- int getMemSize(MachineInstr *MemMI);
};
char AArch64LoadStoreOpt::ID = 0;
} // namespace
-static bool isUnscaledLdst(unsigned Opc) {
+INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
+ AARCH64_LOAD_STORE_OPT_NAME, false, false)
+
+static bool isUnscaledLdSt(unsigned Opc) {
switch (Opc) {
default:
return false;
case AArch64::STURSi:
- return true;
case AArch64::STURDi:
- return true;
case AArch64::STURQi:
- return true;
+ case AArch64::STURBBi:
+ case AArch64::STURHHi:
case AArch64::STURWi:
- return true;
case AArch64::STURXi:
- return true;
case AArch64::LDURSi:
- return true;
case AArch64::LDURDi:
- return true;
case AArch64::LDURQi:
- return true;
case AArch64::LDURWi:
- return true;
case AArch64::LDURXi:
- return true;
case AArch64::LDURSWi:
+ case AArch64::LDURHHi:
+ case AArch64::LDURBBi:
+ case AArch64::LDURSBWi:
+ case AArch64::LDURSHWi:
return true;
}
}
-// Size in bytes of the data moved by an unscaled load or store
-int AArch64LoadStoreOpt::getMemSize(MachineInstr *MemMI) {
- switch (MemMI->getOpcode()) {
+static bool isUnscaledLdSt(MachineInstr *MI) {
+ return isUnscaledLdSt(MI->getOpcode());
+}
+
+static unsigned getBitExtrOpcode(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode.");
+ case AArch64::LDRBBui:
+ case AArch64::LDURBBi:
+ case AArch64::LDRHHui:
+ case AArch64::LDURHHi:
+ return AArch64::UBFMWri;
+ case AArch64::LDRSBWui:
+ case AArch64::LDURSBWi:
+ case AArch64::LDRSHWui:
+ case AArch64::LDURSHWi:
+ return AArch64::SBFMWri;
+ }
+}
+
+static bool isNarrowStore(unsigned Opc) {
+ switch (Opc) {
default:
- llvm_unreachable("Opcode has unknown size!");
+ return false;
+ case AArch64::STRBBui:
+ case AArch64::STURBBi:
+ case AArch64::STRHHui:
+ case AArch64::STURHHi:
+ return true;
+ }
+}
+
+static bool isNarrowStore(MachineInstr *MI) {
+ return isNarrowStore(MI->getOpcode());
+}
+
+static bool isNarrowLoad(unsigned Opc) {
+ switch (Opc) {
+ default:
+ return false;
+ case AArch64::LDRHHui:
+ case AArch64::LDURHHi:
+ case AArch64::LDRBBui:
+ case AArch64::LDURBBi:
+ case AArch64::LDRSHWui:
+ case AArch64::LDURSHWi:
+ case AArch64::LDRSBWui:
+ case AArch64::LDURSBWi:
+ return true;
+ }
+}
+
+static bool isNarrowLoad(MachineInstr *MI) {
+ return isNarrowLoad(MI->getOpcode());
+}
+
+// Scaling factor for a load or store: the per-element access size in bytes.
+static int getMemScale(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ default:
+ llvm_unreachable("Opcode has unknown scale!");
+ case AArch64::LDRBBui:
+ case AArch64::LDURBBi:
+ case AArch64::LDRSBWui:
+ case AArch64::LDURSBWi:
+ case AArch64::STRBBui:
+ case AArch64::STURBBi:
+ return 1;
+ case AArch64::LDRHHui:
+ case AArch64::LDURHHi:
+ case AArch64::LDRSHWui:
+ case AArch64::LDURSHWi:
+ case AArch64::STRHHui:
+ case AArch64::STURHHi:
+ return 2;
+ case AArch64::LDRSui:
+ case AArch64::LDURSi:
+ case AArch64::LDRSWui:
+ case AArch64::LDURSWi:
+ case AArch64::LDRWui:
+ case AArch64::LDURWi:
case AArch64::STRSui:
case AArch64::STURSi:
- return 4;
- case AArch64::STRDui:
- case AArch64::STURDi:
- return 8;
- case AArch64::STRQui:
- case AArch64::STURQi:
- return 16;
case AArch64::STRWui:
case AArch64::STURWi:
- return 4;
- case AArch64::STRXui:
- case AArch64::STURXi:
- return 8;
- case AArch64::LDRSui:
- case AArch64::LDURSi:
+ case AArch64::LDPSi:
+ case AArch64::LDPSWi:
+ case AArch64::LDPWi:
+ case AArch64::STPSi:
+ case AArch64::STPWi:
return 4;
case AArch64::LDRDui:
case AArch64::LDURDi:
+ case AArch64::LDRXui:
+ case AArch64::LDURXi:
+ case AArch64::STRDui:
+ case AArch64::STURDi:
+ case AArch64::STRXui:
+ case AArch64::STURXi:
+ case AArch64::LDPDi:
+ case AArch64::LDPXi:
+ case AArch64::STPDi:
+ case AArch64::STPXi:
return 8;
case AArch64::LDRQui:
case AArch64::LDURQi:
+ case AArch64::STRQui:
+ case AArch64::STURQi:
+ case AArch64::LDPQi:
+ case AArch64::STPQi:
return 16;
- case AArch64::LDRWui:
- case AArch64::LDURWi:
- return 4;
- case AArch64::LDRXui:
- case AArch64::LDURXi:
- return 8;
- case AArch64::LDRSWui:
- case AArch64::LDURSWi:
- return 4;
}
}
@@ -203,6 +314,10 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc,
case AArch64::STURDi:
case AArch64::STRQui:
case AArch64::STURQi:
+ case AArch64::STRBBui:
+ case AArch64::STURBBi:
+ case AArch64::STRHHui:
+ case AArch64::STURHHi:
case AArch64::STRWui:
case AArch64::STURWi:
case AArch64::STRXui:
@@ -219,11 +334,23 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc,
case AArch64::STURSi:
case AArch64::LDRSui:
case AArch64::LDURSi:
+ case AArch64::LDRHHui:
+ case AArch64::LDURHHi:
+ case AArch64::LDRBBui:
+ case AArch64::LDURBBi:
return Opc;
case AArch64::LDRSWui:
return AArch64::LDRWui;
case AArch64::LDURSWi:
return AArch64::LDURWi;
+ case AArch64::LDRSBWui:
+ return AArch64::LDRBBui;
+ case AArch64::LDRSHWui:
+ return AArch64::LDRHHui;
+ case AArch64::LDURSBWi:
+ return AArch64::LDURBBi;
+ case AArch64::LDURSHWi:
+ return AArch64::LDURHHi;
}
}
@@ -240,6 +367,14 @@ static unsigned getMatchingPairOpcode(unsigned Opc) {
case AArch64::STRQui:
case AArch64::STURQi:
return AArch64::STPQi;
+ case AArch64::STRBBui:
+ return AArch64::STRHHui;
+ case AArch64::STRHHui:
+ return AArch64::STRWui;
+ case AArch64::STURBBi:
+ return AArch64::STURHHi;
+ case AArch64::STURHHi:
+ return AArch64::STURWi;
case AArch64::STRWui:
case AArch64::STURWi:
return AArch64::STPWi;
@@ -264,6 +399,48 @@ static unsigned getMatchingPairOpcode(unsigned Opc) {
case AArch64::LDRSWui:
case AArch64::LDURSWi:
return AArch64::LDPSWi;
+ case AArch64::LDRHHui:
+ case AArch64::LDRSHWui:
+ return AArch64::LDRWui;
+ case AArch64::LDURHHi:
+ case AArch64::LDURSHWi:
+ return AArch64::LDURWi;
+ case AArch64::LDRBBui:
+ case AArch64::LDRSBWui:
+ return AArch64::LDRHHui;
+ case AArch64::LDURBBi:
+ case AArch64::LDURSBWi:
+ return AArch64::LDURHHi;
+ }
+}
+
+static unsigned isMatchingStore(MachineInstr *LoadInst,
+ MachineInstr *StoreInst) {
+ unsigned LdOpc = LoadInst->getOpcode();
+ unsigned StOpc = StoreInst->getOpcode();
+ switch (LdOpc) {
+ default:
+ llvm_unreachable("Unsupported load instruction!");
+ case AArch64::LDRBBui:
+ return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
+ StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
+ case AArch64::LDURBBi:
+ return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
+ StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
+ case AArch64::LDRHHui:
+ return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
+ StOpc == AArch64::STRXui;
+ case AArch64::LDURHHi:
+ return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
+ StOpc == AArch64::STURXi;
+ case AArch64::LDRWui:
+ return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
+ case AArch64::LDURWi:
+ return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
+ case AArch64::LDRXui:
+ return StOpc == AArch64::STRXui;
+ case AArch64::LDURXi:
+ return StOpc == AArch64::STURXi;
}
}
@@ -277,6 +454,10 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
return AArch64::STRDpre;
case AArch64::STRQui:
return AArch64::STRQpre;
+ case AArch64::STRBBui:
+ return AArch64::STRBBpre;
+ case AArch64::STRHHui:
+ return AArch64::STRHHpre;
case AArch64::STRWui:
return AArch64::STRWpre;
case AArch64::STRXui:
@@ -287,12 +468,38 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
return AArch64::LDRDpre;
case AArch64::LDRQui:
return AArch64::LDRQpre;
+ case AArch64::LDRBBui:
+ return AArch64::LDRBBpre;
+ case AArch64::LDRHHui:
+ return AArch64::LDRHHpre;
case AArch64::LDRWui:
return AArch64::LDRWpre;
case AArch64::LDRXui:
return AArch64::LDRXpre;
case AArch64::LDRSWui:
return AArch64::LDRSWpre;
+ case AArch64::LDPSi:
+ return AArch64::LDPSpre;
+ case AArch64::LDPSWi:
+ return AArch64::LDPSWpre;
+ case AArch64::LDPDi:
+ return AArch64::LDPDpre;
+ case AArch64::LDPQi:
+ return AArch64::LDPQpre;
+ case AArch64::LDPWi:
+ return AArch64::LDPWpre;
+ case AArch64::LDPXi:
+ return AArch64::LDPXpre;
+ case AArch64::STPSi:
+ return AArch64::STPSpre;
+ case AArch64::STPDi:
+ return AArch64::STPDpre;
+ case AArch64::STPQi:
+ return AArch64::STPQpre;
+ case AArch64::STPWi:
+ return AArch64::STPWpre;
+ case AArch64::STPXi:
+ return AArch64::STPXpre;
}
}
@@ -306,6 +513,10 @@ static unsigned getPostIndexedOpcode(unsigned Opc) {
return AArch64::STRDpost;
case AArch64::STRQui:
return AArch64::STRQpost;
+ case AArch64::STRBBui:
+ return AArch64::STRBBpost;
+ case AArch64::STRHHui:
+ return AArch64::STRHHpost;
case AArch64::STRWui:
return AArch64::STRWpost;
case AArch64::STRXui:
@@ -316,19 +527,111 @@ static unsigned getPostIndexedOpcode(unsigned Opc) {
return AArch64::LDRDpost;
case AArch64::LDRQui:
return AArch64::LDRQpost;
+ case AArch64::LDRBBui:
+ return AArch64::LDRBBpost;
+ case AArch64::LDRHHui:
+ return AArch64::LDRHHpost;
case AArch64::LDRWui:
return AArch64::LDRWpost;
case AArch64::LDRXui:
return AArch64::LDRXpost;
case AArch64::LDRSWui:
return AArch64::LDRSWpost;
+ case AArch64::LDPSi:
+ return AArch64::LDPSpost;
+ case AArch64::LDPSWi:
+ return AArch64::LDPSWpost;
+ case AArch64::LDPDi:
+ return AArch64::LDPDpost;
+ case AArch64::LDPQi:
+ return AArch64::LDPQpost;
+ case AArch64::LDPWi:
+ return AArch64::LDPWpost;
+ case AArch64::LDPXi:
+ return AArch64::LDPXpost;
+ case AArch64::STPSi:
+ return AArch64::STPSpost;
+ case AArch64::STPDi:
+ return AArch64::STPDpost;
+ case AArch64::STPQi:
+ return AArch64::STPQpost;
+ case AArch64::STPWi:
+ return AArch64::STPWpost;
+ case AArch64::STPXi:
+ return AArch64::STPXpost;
}
}
+static bool isPairedLdSt(const MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ default:
+ return false;
+ case AArch64::LDPSi:
+ case AArch64::LDPSWi:
+ case AArch64::LDPDi:
+ case AArch64::LDPQi:
+ case AArch64::LDPWi:
+ case AArch64::LDPXi:
+ case AArch64::STPSi:
+ case AArch64::STPDi:
+ case AArch64::STPQi:
+ case AArch64::STPWi:
+ case AArch64::STPXi:
+ return true;
+ }
+}
+
+static const MachineOperand &getLdStRegOp(const MachineInstr *MI,
+ unsigned PairedRegOp = 0) {
+ assert(PairedRegOp < 2 && "Unexpected register operand idx.");
+ unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
+ return MI->getOperand(Idx);
+}
+
+static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) {
+ unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
+ return MI->getOperand(Idx);
+}
+
+static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
+ unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
+ return MI->getOperand(Idx);
+}
+
+static bool isLdOffsetInRangeOfSt(MachineInstr *LoadInst,
+ MachineInstr *StoreInst) {
+ assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
+ int LoadSize = getMemScale(LoadInst);
+ int StoreSize = getMemScale(StoreInst);
+ int UnscaledStOffset = isUnscaledLdSt(StoreInst)
+ ? getLdStOffsetOp(StoreInst).getImm()
+ : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
+ int UnscaledLdOffset = isUnscaledLdSt(LoadInst)
+ ? getLdStOffsetOp(LoadInst).getImm()
+ : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
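+ // For example, "str w1, [x0, #4]" covers bytes [4, 8) and "ldrh w2, [x0, #6]"
+ // reads bytes [6, 8), so the load lies entirely within the stored value and
+ // this returns true.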
+ return (UnscaledStOffset <= UnscaledLdOffset) &&
+ (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
+}
+
+// Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI.
+static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
+ MachineInstr *Op1) {
+ assert(MI->memoperands_empty() && "expected a new machineinstr");
+ size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) +
+ (Op1->memoperands_end() - Op1->memoperands_begin());
+
+ MachineFunction *MF = MI->getParent()->getParent();
+ MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
+ MachineSDNode::mmo_iterator MemEnd =
+ std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
+ MemEnd = std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
+ MI->setMemRefs(MemBegin, MemEnd);
+}
+
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
- bool MergeForward, int SExtIdx) {
+ const LdStPairFlags &Flags) {
MachineBasicBlock::iterator NextI = I;
++NextI;
// If NextI is the second of the two instructions to be merged, we need
@@ -338,25 +641,26 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
if (NextI == Paired)
++NextI;
+ int SExtIdx = Flags.getSExtIdx();
unsigned Opc =
SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
- bool IsUnscaled = isUnscaledLdst(Opc);
- int OffsetStride =
- IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(I) : 1;
+ bool IsUnscaled = isUnscaledLdSt(Opc);
+ int OffsetStride = IsUnscaled ? getMemScale(I) : 1;
+ bool MergeForward = Flags.getMergeForward();
unsigned NewOpc = getMatchingPairOpcode(Opc);
// Insert our new paired instruction after whichever of the paired
// instructions MergeForward indicates.
MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
// Also based on MergeForward is from where we copy the base register operand
// so we get the flags compatible with the input code.
- MachineOperand &BaseRegOp =
- MergeForward ? Paired->getOperand(1) : I->getOperand(1);
+ const MachineOperand &BaseRegOp =
+ MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I);
// Which register is Rt and which is Rt2 depends on the offset order.
MachineInstr *RtMI, *Rt2MI;
- if (I->getOperand(2).getImm() ==
- Paired->getOperand(2).getImm() + OffsetStride) {
+ if (getLdStOffsetOp(I).getImm() ==
+ getLdStOffsetOp(Paired).getImm() + OffsetStride) {
RtMI = Paired;
Rt2MI = I;
// Here we swapped the assumption made for SExtIdx.
@@ -368,18 +672,135 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
RtMI = I;
Rt2MI = Paired;
}
- // Handle Unscaled
- int OffsetImm = RtMI->getOperand(2).getImm();
- if (IsUnscaled && EnableAArch64UnscaledMemOp)
- OffsetImm /= OffsetStride;
+
+ int OffsetImm = getLdStOffsetOp(RtMI).getImm();
+
+ if (isNarrowLoad(Opc)) {
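+ // As a rough illustration, on a little-endian target two adjacent halfword
+ // loads "ldrh w0, [x2]" and "ldrh w1, [x2, #2]" can become "ldr w1, [x2]"
+ // followed by "and w0, w1, #0xffff" for the low half and
+ // "ubfx w1, w1, #16, #16" for the high half.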
+ // Change the scaled offset from small to large type.
+ if (!IsUnscaled) {
+ assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
+ OffsetImm /= 2;
+ }
+ MachineInstr *RtNewDest = MergeForward ? I : Paired;
+ // When merging small (< 32 bit) loads for big-endian targets, the order of
+ // the component parts gets swapped.
+ if (!Subtarget->isLittleEndian())
+ std::swap(RtMI, Rt2MI);
+ // Construct the new load instruction.
+ MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;
+ NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+ TII->get(NewOpc))
+ .addOperand(getLdStRegOp(RtNewDest))
+ .addOperand(BaseRegOp)
+ .addImm(OffsetImm);
+
+ // Copy MachineMemOperands from the original loads.
+ concatenateMemOperands(NewMemMI, I, Paired);
+
+ DEBUG(
+ dbgs()
+ << "Creating the new load and extract. Replacing instructions:\n ");
+ DEBUG(I->print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG(Paired->print(dbgs()));
+ DEBUG(dbgs() << " with instructions:\n ");
+ DEBUG((NewMemMI)->print(dbgs()));
+
+ int Width = getMemScale(I) == 1 ? 8 : 16;
+ int LSBLow = 0;
+ int LSBHigh = Width;
+ int ImmsLow = LSBLow + Width - 1;
+ int ImmsHigh = LSBHigh + Width - 1;
+ MachineInstr *ExtDestMI = MergeForward ? Paired : I;
+ if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) {
+ // Create the bitfield extract for high bits.
+ BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+ TII->get(getBitExtrOpcode(Rt2MI)))
+ .addOperand(getLdStRegOp(Rt2MI))
+ .addReg(getLdStRegOp(RtNewDest).getReg())
+ .addImm(LSBHigh)
+ .addImm(ImmsHigh);
+ // Create the bitfield extract for low bits.
+ if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
+ // For unsigned, prefer to use AND for low bits.
+ BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+ TII->get(AArch64::ANDWri))
+ .addOperand(getLdStRegOp(RtMI))
+ .addReg(getLdStRegOp(RtNewDest).getReg())
+ .addImm(ImmsLow);
+ } else {
+ BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+ TII->get(getBitExtrOpcode(RtMI)))
+ .addOperand(getLdStRegOp(RtMI))
+ .addReg(getLdStRegOp(RtNewDest).getReg())
+ .addImm(LSBLow)
+ .addImm(ImmsLow);
+ }
+ } else {
+ // Create the bitfield extract for low bits.
+ if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
+ // For unsigned, prefer to use AND for low bits.
+ BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+ TII->get(AArch64::ANDWri))
+ .addOperand(getLdStRegOp(RtMI))
+ .addReg(getLdStRegOp(RtNewDest).getReg())
+ .addImm(ImmsLow);
+ } else {
+ BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+ TII->get(getBitExtrOpcode(RtMI)))
+ .addOperand(getLdStRegOp(RtMI))
+ .addReg(getLdStRegOp(RtNewDest).getReg())
+ .addImm(LSBLow)
+ .addImm(ImmsLow);
+ }
+
+ // Create the bitfield extract for high bits.
+ BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+ TII->get(getBitExtrOpcode(Rt2MI)))
+ .addOperand(getLdStRegOp(Rt2MI))
+ .addReg(getLdStRegOp(RtNewDest).getReg())
+ .addImm(LSBHigh)
+ .addImm(ImmsHigh);
+ }
+ DEBUG(dbgs() << " ");
+ DEBUG((BitExtMI1)->print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG((BitExtMI2)->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+
+ // Erase the old instructions.
+ I->eraseFromParent();
+ Paired->eraseFromParent();
+ return NextI;
+ }
// Construct the new instruction.
- MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint,
- I->getDebugLoc(), TII->get(NewOpc))
- .addOperand(RtMI->getOperand(0))
- .addOperand(Rt2MI->getOperand(0))
- .addOperand(BaseRegOp)
- .addImm(OffsetImm);
+ MachineInstrBuilder MIB;
+ if (isNarrowStore(Opc)) {
+ // Change the scaled offset from small to large type.
+ if (!IsUnscaled) {
+ assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
+ OffsetImm /= 2;
+ }
+ MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+ TII->get(NewOpc))
+ .addOperand(getLdStRegOp(I))
+ .addOperand(BaseRegOp)
+ .addImm(OffsetImm);
+ // Copy MachineMemOperands from the original stores.
+ concatenateMemOperands(MIB, I, Paired);
+ } else {
+ // Handle Unscaled
+ if (IsUnscaled)
+ OffsetImm /= OffsetStride;
+ MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+ TII->get(NewOpc))
+ .addOperand(getLdStRegOp(RtMI))
+ .addOperand(getLdStRegOp(Rt2MI))
+ .addOperand(BaseRegOp)
+ .addImm(OffsetImm);
+ }
+
(void)MIB;
// FIXME: Do we need/want to copy the mem operands from the source
@@ -439,13 +860,112 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
return NextI;
}
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
+ MachineBasicBlock::iterator StoreI) {
+ MachineBasicBlock::iterator NextI = LoadI;
+ ++NextI;
+
+ int LoadSize = getMemScale(LoadI);
+ int StoreSize = getMemScale(StoreI);
+ unsigned LdRt = getLdStRegOp(LoadI).getReg();
+ unsigned StRt = getLdStRegOp(StoreI).getReg();
+ bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
+
+ assert((IsStoreXReg ||
+ TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
+ "Unexpected RegClass");
+
+ MachineInstr *BitExtMI;
+ if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
+ // Remove the load if the destination register of the load is the same
+ // register as the stored value.
+ if (StRt == LdRt && LoadSize == 8) {
+ DEBUG(dbgs() << "Remove load instruction:\n ");
+ DEBUG(LoadI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ LoadI->eraseFromParent();
+ return NextI;
+ }
+ // Replace the load with a mov if the load and store are the same size.
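+ // For example, "str w1, [x0]" followed by "ldr w2, [x0]" becomes
+ // "mov w2, w1" (an ORRWrs from WZR).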
+ BitExtMI =
+ BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
+ TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
+ .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
+ .addReg(StRt)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+ } else {
+ // FIXME: Currently we disable this transformation on big-endian targets as
+ // performance and correctness have been verified only on little-endian.
+ if (!Subtarget->isLittleEndian())
+ return NextI;
+ bool IsUnscaled = isUnscaledLdSt(LoadI);
+ assert(IsUnscaled == isUnscaledLdSt(StoreI) && "Unsupported ld/st match");
+ assert(LoadSize <= StoreSize && "Invalid load size");
+ int UnscaledLdOffset = IsUnscaled
+ ? getLdStOffsetOp(LoadI).getImm()
+ : getLdStOffsetOp(LoadI).getImm() * LoadSize;
+ int UnscaledStOffset = IsUnscaled
+ ? getLdStOffsetOp(StoreI).getImm()
+ : getLdStOffsetOp(StoreI).getImm() * StoreSize;
+ int Width = LoadSize * 8;
+ int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
+ int Imms = Immr + Width - 1;
+ unsigned DestReg = IsStoreXReg
+ ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32,
+ &AArch64::GPR64RegClass)
+ : LdRt;
+
+ assert((UnscaledLdOffset >= UnscaledStOffset &&
+ (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
+ "Invalid offset");
+
+ Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
+ Imms = Immr + Width - 1;
+ if (UnscaledLdOffset == UnscaledStOffset) {
+ uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
+ | ((Immr) << 6) // immr
+ | ((Imms) << 0) // imms
+ ;
+
+ BitExtMI =
+ BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
+ TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
+ DestReg)
+ .addReg(StRt)
+ .addImm(AndMaskEncoded);
+ } else {
+ BitExtMI =
+ BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
+ TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
+ DestReg)
+ .addReg(StRt)
+ .addImm(Immr)
+ .addImm(Imms);
+ }
+ }
+
+ DEBUG(dbgs() << "Promoting load by replacing :\n ");
+ DEBUG(StoreI->print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG(LoadI->print(dbgs()));
+ DEBUG(dbgs() << " with instructions:\n ");
+ DEBUG(StoreI->print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG((BitExtMI)->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+
+ // Erase the old instructions.
+ LoadI->eraseFromParent();
+ return NextI;
+}
+
/// trackRegDefsUses - Remember what registers the specified instruction uses
/// and modifies.
-static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs,
+static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs,
BitVector &UsedRegs,
const TargetRegisterInfo *TRI) {
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ for (const MachineOperand &MO : MI->operands()) {
if (MO.isRegMask())
ModifiedRegs.setBitsNotInMask(MO.getRegMask());
@@ -464,16 +984,12 @@ static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs,
}
static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
- if (!IsUnscaled && (Offset > 63 || Offset < -64))
- return false;
- if (IsUnscaled) {
- // Convert the byte-offset used by unscaled into an "element" offset used
- // by the scaled pair load/store instructions.
- int ElemOffset = Offset / OffsetStride;
- if (ElemOffset > 63 || ElemOffset < -64)
- return false;
- }
- return true;
+ // Convert the byte-offset used by unscaled into an "element" offset used
+ // by the scaled pair load/store instructions.
+ if (IsUnscaled)
+ Offset /= OffsetStride;
+
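+ // For example, an unscaled 64-bit candidate (stride 8) at byte offset 504
+ // scales to element 63 and is accepted, while 512 scales to 64 and is
+ // rejected.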
+ return Offset <= 63 && Offset >= -64;
}
// Do alignment, specialized to power of 2 and for signed ints,
@@ -507,12 +1023,65 @@ static bool mayAlias(MachineInstr *MIa,
return false;
}
+bool AArch64LoadStoreOpt::findMatchingStore(
+ MachineBasicBlock::iterator I, unsigned Limit,
+ MachineBasicBlock::iterator &StoreI) {
+ MachineBasicBlock::iterator E = I->getParent()->begin();
+ MachineBasicBlock::iterator MBBI = I;
+ MachineInstr *FirstMI = I;
+ unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
+
+ // Track which registers have been modified and used between the first insn
+ // and the second insn.
+ BitVector ModifiedRegs, UsedRegs;
+ ModifiedRegs.resize(TRI->getNumRegs());
+ UsedRegs.resize(TRI->getNumRegs());
+
+ for (unsigned Count = 0; MBBI != E && Count < Limit;) {
+ --MBBI;
+ MachineInstr *MI = MBBI;
+ // Skip DBG_VALUE instructions. Otherwise debug info can affect the
+ // optimization by changing how far we scan.
+ if (MI->isDebugValue())
+ continue;
+ // Now that we know this is a real instruction, count it.
+ ++Count;
+
+ // If the load instruction reads directly from the address to which the
+ // store instruction writes and the stored value is not modified, we can
+ // promote the load. Since we do not handle stores with pre-/post-index,
+ // it's unnecessary to check if BaseReg is modified by the store itself.
+ if (MI->mayStore() && isMatchingStore(FirstMI, MI) &&
+ BaseReg == getLdStBaseOp(MI).getReg() &&
+ isLdOffsetInRangeOfSt(FirstMI, MI) &&
+ !ModifiedRegs[getLdStRegOp(MI).getReg()]) {
+ StoreI = MBBI;
+ return true;
+ }
+
+ if (MI->isCall())
+ return false;
+
+ // Update modified / uses register lists.
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+
+ // Otherwise, if the base register is modified, we have no match, so
+ // return early.
+ if (ModifiedRegs[BaseReg])
+ return false;
+
+ // If we encounter a store aliased with the load, return early.
+ if (MI->mayStore() && mayAlias(FirstMI, MI, TII))
+ return false;
+ }
+ return false;
+}
+
/// findMatchingInsn - Scan the instructions looking for a load/store that can
/// be combined with the current instruction into a load/store pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
- bool &MergeForward, int &SExtIdx,
- unsigned Limit) {
+ LdStPairFlags &Flags, unsigned Limit) {
MachineBasicBlock::iterator E = I->getParent()->end();
MachineBasicBlock::iterator MBBI = I;
MachineInstr *FirstMI = I;
@@ -520,21 +1089,27 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
unsigned Opc = FirstMI->getOpcode();
bool MayLoad = FirstMI->mayLoad();
- bool IsUnscaled = isUnscaledLdst(Opc);
- unsigned Reg = FirstMI->getOperand(0).getReg();
- unsigned BaseReg = FirstMI->getOperand(1).getReg();
- int Offset = FirstMI->getOperand(2).getImm();
+ bool IsUnscaled = isUnscaledLdSt(FirstMI);
+ unsigned Reg = getLdStRegOp(FirstMI).getReg();
+ unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
+ int Offset = getLdStOffsetOp(FirstMI).getImm();
+ bool IsNarrowStore = isNarrowStore(Opc);
+
+ // For narrow stores, find only the case where the stored value is WZR.
+ if (IsNarrowStore && Reg != AArch64::WZR)
+ return E;
// Early exit if the first instruction modifies the base register.
// e.g., ldr x0, [x0]
- // Early exit if the offset if not possible to match. (6 bits of positive
- // range, plus allow an extra one in case we find a later insn that matches
- // with Offset-1
if (FirstMI->modifiesRegister(BaseReg, TRI))
return E;
- int OffsetStride =
- IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(FirstMI) : 1;
- if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
+
+ // Early exit if the offset is not possible to match. (6 bits of positive
+ // range, plus allow an extra one in case we find a later insn that matches
+ // with Offset-1)
+ int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
+ if (!(isNarrowLoad(Opc) || IsNarrowStore) &&
+ !inBoundsForPair(IsUnscaled, Offset, OffsetStride))
return E;
// Track which registers have been modified and used between the first insn
@@ -557,18 +1132,19 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
++Count;
bool CanMergeOpc = Opc == MI->getOpcode();
- SExtIdx = -1;
+ Flags.setSExtIdx(-1);
if (!CanMergeOpc) {
bool IsValidLdStrOpc;
unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc);
- if (!IsValidLdStrOpc)
- continue;
+ assert(IsValidLdStrOpc &&
+ "Given Opc should be a Load or Store with an immediate");
// Opc will be the first instruction in the pair.
- SExtIdx = NonSExtOpc == (unsigned)Opc ? 1 : 0;
+ Flags.setSExtIdx(NonSExtOpc == (unsigned)Opc ? 1 : 0);
CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode());
}
- if (CanMergeOpc && MI->getOperand(2).isImm()) {
+ if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) {
+ assert(MI->mayLoadOrStore() && "Expected memory operation.");
// If we've found another instruction with the same opcode, check to see
// if the base and offset are compatible with our starting instruction.
// These instructions all have scaled immediate operands, so we just
@@ -579,8 +1155,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// Pairwise instructions have a 7-bit signed offset field. Single insns
// have a 12-bit unsigned offset field. To be a valid combine, the
// final offset must be in range.
- unsigned MIBaseReg = MI->getOperand(1).getReg();
- int MIOffset = MI->getOperand(2).getImm();
+ unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
+ int MIOffset = getLdStOffsetOp(MI).getImm();
if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
(Offset + OffsetStride == MIOffset))) {
int MinOffset = Offset < MIOffset ? Offset : MIOffset;
@@ -591,30 +1167,43 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
return E;
// If the resultant immediate offset of merging these instructions
// is out of range for a pairwise instruction, bail and keep looking.
- bool MIIsUnscaled = isUnscaledLdst(MI->getOpcode());
- if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
+ bool MIIsUnscaled = isUnscaledLdSt(MI);
+ bool IsNarrowLoad = isNarrowLoad(MI->getOpcode());
+ if (!IsNarrowLoad &&
+ !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
- if (MI->mayLoadOrStore())
- MemInsns.push_back(MI);
+ MemInsns.push_back(MI);
continue;
}
- // If the alignment requirements of the paired (scaled) instruction
- // can't express the offset of the unscaled input, bail and keep
- // looking.
- if (IsUnscaled && EnableAArch64UnscaledMemOp &&
- (alignTo(MinOffset, OffsetStride) != MinOffset)) {
- trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
- if (MI->mayLoadOrStore())
+
+ if (IsNarrowLoad || IsNarrowStore) {
+ // If the alignment requirements of the scaled wide load/store
+ // instruction can't express the offset of the scaled narrow
+ // input, bail and keep looking.
+ if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) {
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
MemInsns.push_back(MI);
- continue;
+ continue;
+ }
+ } else {
+ // If the alignment requirements of the paired (scaled) instruction
+ // can't express the offset of the unscaled input, bail and keep
+ // looking.
+ if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ MemInsns.push_back(MI);
+ continue;
+ }
}
// If the destination register of the loads is the same register, bail
// and keep looking. A load-pair instruction with both destination
// registers the same is UNPREDICTABLE and will result in an exception.
- if (MayLoad && Reg == MI->getOperand(0).getReg()) {
+ // For narrow stores, allow only when the stored value is the same
+ // (i.e., WZR).
+ if ((MayLoad && Reg == getLdStRegOp(MI).getReg()) ||
+ (IsNarrowStore && Reg != getLdStRegOp(MI).getReg())) {
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
- if (MI->mayLoadOrStore())
- MemInsns.push_back(MI);
+ MemInsns.push_back(MI);
continue;
}
@@ -622,10 +1211,10 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// the two instructions and none of the instructions between the second
// and first alias with the second, we can combine the second into the
// first.
- if (!ModifiedRegs[MI->getOperand(0).getReg()] &&
- !(MI->mayLoad() && UsedRegs[MI->getOperand(0).getReg()]) &&
+ if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
+ !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
!mayAlias(MI, MemInsns, TII)) {
- MergeForward = false;
+ Flags.setMergeForward(false);
return MBBI;
}
@@ -633,11 +1222,10 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// between the two instructions and none of the instructions between the
// first and the second alias with the first, we can combine the first
// into the second.
- if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] &&
- !(FirstMI->mayLoad() &&
- UsedRegs[FirstMI->getOperand(0).getReg()]) &&
+ if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
+ !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
!mayAlias(FirstMI, MemInsns, TII)) {
- MergeForward = true;
+ Flags.setMergeForward(true);
return MBBI;
}
// Unable to combine these instructions due to interference in between.
@@ -666,51 +1254,9 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
}
MachineBasicBlock::iterator
-AArch64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator Update) {
- assert((Update->getOpcode() == AArch64::ADDXri ||
- Update->getOpcode() == AArch64::SUBXri) &&
- "Unexpected base register update instruction to merge!");
- MachineBasicBlock::iterator NextI = I;
- // Return the instruction following the merged instruction, which is
- // the instruction following our unmerged load. Unless that's the add/sub
- // instruction we're merging, in which case it's the one after that.
- if (++NextI == Update)
- ++NextI;
-
- int Value = Update->getOperand(2).getImm();
- assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
- "Can't merge 1 << 12 offset into pre-indexed load / store");
- if (Update->getOpcode() == AArch64::SUBXri)
- Value = -Value;
-
- unsigned NewOpc = getPreIndexedOpcode(I->getOpcode());
- MachineInstrBuilder MIB =
- BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
- .addOperand(Update->getOperand(0))
- .addOperand(I->getOperand(0))
- .addOperand(I->getOperand(1))
- .addImm(Value);
- (void)MIB;
-
- DEBUG(dbgs() << "Creating pre-indexed load/store.");
- DEBUG(dbgs() << " Replacing instructions:\n ");
- DEBUG(I->print(dbgs()));
- DEBUG(dbgs() << " ");
- DEBUG(Update->print(dbgs()));
- DEBUG(dbgs() << " with instruction:\n ");
- DEBUG(((MachineInstr *)MIB)->print(dbgs()));
- DEBUG(dbgs() << "\n");
-
- // Erase the old instructions for the block.
- I->eraseFromParent();
- Update->eraseFromParent();
-
- return NextI;
-}
-
-MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn(
- MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update) {
+AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Update,
+ bool IsPreIdx) {
assert((Update->getOpcode() == AArch64::ADDXri ||
Update->getOpcode() == AArch64::SUBXri) &&
"Unexpected base register update instruction to merge!");
@@ -723,20 +1269,36 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn(
int Value = Update->getOperand(2).getImm();
assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
- "Can't merge 1 << 12 offset into post-indexed load / store");
+ "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
if (Update->getOpcode() == AArch64::SUBXri)
Value = -Value;
- unsigned NewOpc = getPostIndexedOpcode(I->getOpcode());
- MachineInstrBuilder MIB =
- BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
- .addOperand(Update->getOperand(0))
- .addOperand(I->getOperand(0))
- .addOperand(I->getOperand(1))
- .addImm(Value);
+ unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
+ : getPostIndexedOpcode(I->getOpcode());
+ MachineInstrBuilder MIB;
+ if (!isPairedLdSt(I)) {
+ // Non-paired instruction.
+ MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+ .addOperand(getLdStRegOp(Update))
+ .addOperand(getLdStRegOp(I))
+ .addOperand(getLdStBaseOp(I))
+ .addImm(Value);
+ } else {
+ // Paired instruction.
+ int Scale = getMemScale(I);
+ MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+ .addOperand(getLdStRegOp(Update))
+ .addOperand(getLdStRegOp(I, 0))
+ .addOperand(getLdStRegOp(I, 1))
+ .addOperand(getLdStBaseOp(I))
+ .addImm(Value / Scale);
+ }
(void)MIB;
- DEBUG(dbgs() << "Creating post-indexed load/store.");
+ if (IsPreIdx)
+ DEBUG(dbgs() << "Creating pre-indexed load/store.");
+ else
+ DEBUG(dbgs() << "Creating post-indexed load/store.");
DEBUG(dbgs() << " Replacing instructions:\n ");
DEBUG(I->print(dbgs()));
DEBUG(dbgs() << " ");
@@ -752,8 +1314,9 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn(
return NextI;
}
-static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg,
- int Offset) {
+bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI,
+ MachineInstr *MI,
+ unsigned BaseReg, int Offset) {
switch (MI->getOpcode()) {
default:
break;
@@ -769,44 +1332,65 @@ static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg,
// Watch out for 1 << 12 shifted value.
if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm()))
break;
- // If the instruction has the base register as source and dest and the
- // immediate will fit in a signed 9-bit integer, then we have a match.
- if (MI->getOperand(0).getReg() == BaseReg &&
- MI->getOperand(1).getReg() == BaseReg &&
- MI->getOperand(2).getImm() <= 255 &&
- MI->getOperand(2).getImm() >= -256) {
- // If we have a non-zero Offset, we check that it matches the amount
- // we're adding to the register.
- if (!Offset || Offset == MI->getOperand(2).getImm())
- return true;
+
+ // The update instruction source and destination register must be the
+ // same as the load/store base register.
+ if (MI->getOperand(0).getReg() != BaseReg ||
+ MI->getOperand(1).getReg() != BaseReg)
+ break;
+
+ bool IsPairedInsn = isPairedLdSt(MemMI);
+ int UpdateOffset = MI->getOperand(2).getImm();
+ // For non-paired load/store instructions, the immediate must fit in a
+ // signed 9-bit integer.
+ if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
+ break;
+
+ // For paired load/store instructions, the immediate must be a multiple of
+ // the scaling factor. The scaled offset must also fit into a signed 7-bit
+ // integer.
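+ // For example, "add x0, x0, #16" can later be folded into
+ // "ldp x1, x2, [x0], #16", but "add x0, x0, #12" cannot, since 12 is not a
+ // multiple of the 8-byte element size.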
+ if (IsPairedInsn) {
+ int Scale = getMemScale(MemMI);
+ if (UpdateOffset % Scale != 0)
+ break;
+
+ int ScaledOffset = UpdateOffset / Scale;
+ if (ScaledOffset > 63 || ScaledOffset < -64)
+ break;
}
+
+ // If we have a non-zero Offset, we check that it matches the amount
+ // we're adding to the register.
+ if (!Offset || Offset == MI->getOperand(2).getImm())
+ return true;
break;
}
return false;
}
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
- MachineBasicBlock::iterator I, unsigned Limit, int Value) {
+ MachineBasicBlock::iterator I, unsigned Limit, int UnscaledOffset) {
MachineBasicBlock::iterator E = I->getParent()->end();
MachineInstr *MemMI = I;
MachineBasicBlock::iterator MBBI = I;
- const MachineFunction &MF = *MemMI->getParent()->getParent();
- unsigned DestReg = MemMI->getOperand(0).getReg();
- unsigned BaseReg = MemMI->getOperand(1).getReg();
- int Offset = MemMI->getOperand(2).getImm() *
- TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize();
+ unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
+ int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);
- // If the base register overlaps the destination register, we can't
- // merge the update.
- if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
+ // Scan forward looking for post-index opportunities. Updating instructions
+ // can't be formed if the memory instruction doesn't have the offset we're
+ // looking for.
+ if (MIUnscaledOffset != UnscaledOffset)
return E;
- // Scan forward looking for post-index opportunities.
- // Updating instructions can't be formed if the memory insn already
- // has an offset other than the value we're looking for.
- if (Offset != Value)
- return E;
+ // If the base register overlaps a destination register, we can't
+ // merge the update.
+ bool IsPairedInsn = isPairedLdSt(MemMI);
+ for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
+ unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
+ if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
+ return E;
+ }
// Track which registers have been modified and used between the first insn
// (inclusive) and the second insn.
@@ -825,7 +1409,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
++Count;
// If we found a match, return it.
- if (isMatchingUpdateInsn(MI, BaseReg, Value))
+ if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset))
return MBBI;
// Update the status of what the instruction clobbered and used.
@@ -845,21 +1429,22 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
MachineBasicBlock::iterator E = I->getParent()->end();
MachineInstr *MemMI = I;
MachineBasicBlock::iterator MBBI = I;
- const MachineFunction &MF = *MemMI->getParent()->getParent();
- unsigned DestReg = MemMI->getOperand(0).getReg();
- unsigned BaseReg = MemMI->getOperand(1).getReg();
- int Offset = MemMI->getOperand(2).getImm();
- unsigned RegSize = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize();
+ unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
+ int Offset = getLdStOffsetOp(MemMI).getImm();
// If the load/store is the first instruction in the block, there's obviously
// not any matching update. Ditto if the memory offset isn't zero.
if (MBBI == B || Offset != 0)
return E;
- // If the base register overlaps the destination register, we can't
+ // If the base register overlaps a destination register, we can't
// merge the update.
- if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
- return E;
+ bool IsPairedInsn = isPairedLdSt(MemMI);
+ for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
+ unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
+ if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
+ return E;
+ }
// Track which registers have been modified and used between the first insn
// (inclusive) and the second insn.
@@ -878,7 +1463,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
++Count;
// If we found a match, return it.
- if (isMatchingUpdateInsn(MI, BaseReg, RegSize))
+ if (isMatchingUpdateInsn(I, MI, BaseReg, Offset))
return MBBI;
// Update the status of what the instruction clobbered and used.
@@ -892,17 +1477,101 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
return E;
}
-bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
+bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
+ MachineBasicBlock::iterator &MBBI) {
+ MachineInstr *MI = MBBI;
+ // If this is a volatile load, don't mess with it.
+ if (MI->hasOrderedMemoryRef())
+ return false;
+
+ // Make sure this is a reg+imm.
+ // FIXME: It is possible to extend it to handle reg+reg cases.
+ if (!getLdStOffsetOp(MI).isImm())
+ return false;
+
+ // Look backward up to ScanLimit instructions.
+ MachineBasicBlock::iterator StoreI;
+ if (findMatchingStore(MBBI, ScanLimit, StoreI)) {
+ ++NumLoadsFromStoresPromoted;
+ // Promote the load. Keeping the iterator straight is a
+ // pain, so we let the merge routine tell us what the next instruction
+ // is after it's done mucking about.
+ MBBI = promoteLoadFromStore(MBBI, StoreI);
+ return true;
+ }
+ return false;
+}
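// Illustrative sketch, not part of this patch: the value a promoted load ends
// up with is a shift-and-mask of the forwarded store data on a little-endian
// target, which is why the str/ldrh example further down becomes a single lsr.
// Offsets and widths are in bytes; the helper name is invented for
// illustration and assumes the load lies entirely within the store.
#include <cassert>
#include <cstdint>

static uint64_t forwardedLoadValue(uint64_t StoreVal, int StoreOff,
                                   int StoreBytes, int LoadOff, int LoadBytes) {
  assert(LoadOff >= StoreOff && LoadOff + LoadBytes <= StoreOff + StoreBytes &&
         "promotion requires the load to lie entirely within the store");
  unsigned ShiftBits = unsigned(LoadOff - StoreOff) * 8;        // -> lsr amount
  uint64_t Mask =
      LoadBytes >= 8 ? ~0ULL : ((1ULL << (unsigned(LoadBytes) * 8)) - 1);
  return (StoreVal >> ShiftBits) & Mask;                        // -> ubfx / and
}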
+
+bool AArch64LoadStoreOpt::tryToMergeLdStInst(
+ MachineBasicBlock::iterator &MBBI) {
+ MachineInstr *MI = MBBI;
+ MachineBasicBlock::iterator E = MI->getParent()->end();
+ // If this is a volatile load/store, don't mess with it.
+ if (MI->hasOrderedMemoryRef())
+ return false;
+
+ // Make sure this is a reg+imm (as opposed to an address reloc).
+ if (!getLdStOffsetOp(MI).isImm())
+ return false;
+
+ // Check if this load/store has a hint to avoid pair formation.
+ // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
+ if (TII->isLdStPairSuppressed(MI))
+ return false;
+
+ // Look ahead up to ScanLimit instructions for a pairable instruction.
+ LdStPairFlags Flags;
+ MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit);
+ if (Paired != E) {
+ if (isNarrowLoad(MI)) {
+ ++NumNarrowLoadsPromoted;
+ } else if (isNarrowStore(MI)) {
+ ++NumZeroStoresPromoted;
+ } else {
+ ++NumPairCreated;
+ if (isUnscaledLdSt(MI))
+ ++NumUnscaledPairCreated;
+ }
+
+ // Merge the loads into a pair. Keeping the iterator straight is a
+ // pain, so we let the merge routine tell us what the next instruction
+ // is after it's done mucking about.
+ MBBI = mergePairedInsns(MBBI, Paired, Flags);
+ return true;
+ }
+ return false;
+}
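// Illustrative sketch, not part of this patch: the basic shape findMatchingInsn()
// looks for is two same-width accesses off the same base register at directly
// adjacent offsets; the real code additionally checks volatility, intervening
// register writes, the pairing-suppression hint and the paired-immediate range.
// The struct and helper below are invented for illustration.
struct MemAccess { unsigned BaseReg; int OffsetBytes; int SizeBytes; };

static bool looksPairable(const MemAccess &A, const MemAccess &B) {
  if (A.BaseReg != B.BaseReg || A.SizeBytes != B.SizeBytes)
    return false;
  int Delta = B.OffsetBytes - A.OffsetBytes;
  return Delta == A.SizeBytes || Delta == -A.SizeBytes; // directly adjacent
}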
+
+bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
+ bool enableNarrowLdOpt) {
bool Modified = false;
- // Two tranformations to do here:
- // 1) Find loads and stores that can be merged into a single load or store
+ // Four transformations to do here:
+ // 1) Find loads that directly read from stores and promote them by
+ // replacing with mov instructions. If the store is wider than the load,
+ // the load will be replaced with a bitfield extract.
+ // e.g.,
+ // str w1, [x0, #4]
+ // ldrh w2, [x0, #6]
+ // ; becomes
+ // str w1, [x0, #4]
+ // lsr w2, w1, #16
+ // 2) Find narrow loads that can be converted into a single wider load
+ // with bitfield extract instructions.
+ // e.g.,
+ // ldrh w0, [x2]
+ // ldrh w1, [x2, #2]
+ // ; becomes
+ // ldr w0, [x2]
+ // ubfx w1, w0, #16, #16
+ // and w0, w0, #0xffff
+ // 3) Find loads and stores that can be merged into a single load or store
// pair instruction.
// e.g.,
// ldr x0, [x2]
// ldr x1, [x2, #8]
// ; becomes
// ldp x0, x1, [x2]
- // 2) Find base register updates that can be merged into the load or store
+ // 4) Find base register updates that can be merged into the load or store
// as a base-reg writeback.
// e.g.,
// ldr x0, [x2]
@@ -918,6 +1587,69 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
// Just move on to the next instruction.
++MBBI;
break;
+ // Scaled instructions.
+ case AArch64::LDRBBui:
+ case AArch64::LDRHHui:
+ case AArch64::LDRWui:
+ case AArch64::LDRXui:
+ // Unscaled instructions.
+ case AArch64::LDURBBi:
+ case AArch64::LDURHHi:
+ case AArch64::LDURWi:
+ case AArch64::LDURXi: {
+ if (tryToPromoteLoadFromStore(MBBI)) {
+ Modified = true;
+ break;
+ }
+ ++MBBI;
+ break;
+ }
+ // FIXME: Do the other instructions.
+ }
+ }
+
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ enableNarrowLdOpt && MBBI != E;) {
+ MachineInstr *MI = MBBI;
+ switch (MI->getOpcode()) {
+ default:
+ // Just move on to the next instruction.
+ ++MBBI;
+ break;
+ // Scaled instructions.
+ case AArch64::LDRBBui:
+ case AArch64::LDRHHui:
+ case AArch64::LDRSBWui:
+ case AArch64::LDRSHWui:
+ case AArch64::STRBBui:
+ case AArch64::STRHHui:
+ // Unscaled instructions.
+ case AArch64::LDURBBi:
+ case AArch64::LDURHHi:
+ case AArch64::LDURSBWi:
+ case AArch64::LDURSHWi:
+ case AArch64::STURBBi:
+ case AArch64::STURHHi: {
+ if (tryToMergeLdStInst(MBBI)) {
+ Modified = true;
+ break;
+ }
+ ++MBBI;
+ break;
+ }
+ // FIXME: Do the other instructions.
+ }
+ }
+
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ MBBI != E;) {
+ MachineInstr *MI = MBBI;
+ switch (MI->getOpcode()) {
+ default:
+ // Just move on to the next instruction.
+ ++MBBI;
+ break;
+ // Scaled instructions.
case AArch64::STRSui:
case AArch64::STRDui:
case AArch64::STRQui:
@@ -929,7 +1661,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
case AArch64::LDRXui:
case AArch64::LDRWui:
case AArch64::LDRSWui:
- // do the unscaled versions as well
+ // Unscaled instructions.
case AArch64::STURSi:
case AArch64::STURDi:
case AArch64::STURQi:
@@ -941,37 +1673,8 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
case AArch64::LDURWi:
case AArch64::LDURXi:
case AArch64::LDURSWi: {
- // If this is a volatile load/store, don't mess with it.
- if (MI->hasOrderedMemoryRef()) {
- ++MBBI;
- break;
- }
- // Make sure this is a reg+imm (as opposed to an address reloc).
- if (!MI->getOperand(2).isImm()) {
- ++MBBI;
- break;
- }
- // Check if this load/store has a hint to avoid pair formation.
- // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
- if (TII->isLdStPairSuppressed(MI)) {
- ++MBBI;
- break;
- }
- // Look ahead up to ScanLimit instructions for a pairable instruction.
- bool MergeForward = false;
- int SExtIdx = -1;
- MachineBasicBlock::iterator Paired =
- findMatchingInsn(MBBI, MergeForward, SExtIdx, ScanLimit);
- if (Paired != E) {
- // Merge the loads into a pair. Keeping the iterator straight is a
- // pain, so we let the merge routine tell us what the next instruction
- // is after it's done mucking about.
- MBBI = mergePairedInsns(MBBI, Paired, MergeForward, SExtIdx);
-
+ if (tryToMergeLdStInst(MBBI)) {
Modified = true;
- ++NumPairCreated;
- if (isUnscaledLdst(MI->getOpcode()))
- ++NumUnscaledPairCreated;
break;
}
++MBBI;
@@ -992,17 +1695,22 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
// Just move on to the next instruction.
++MBBI;
break;
+ // Scaled instructions.
case AArch64::STRSui:
case AArch64::STRDui:
case AArch64::STRQui:
case AArch64::STRXui:
case AArch64::STRWui:
+ case AArch64::STRHHui:
+ case AArch64::STRBBui:
case AArch64::LDRSui:
case AArch64::LDRDui:
case AArch64::LDRQui:
case AArch64::LDRXui:
case AArch64::LDRWui:
- // do the unscaled versions as well
+ case AArch64::LDRHHui:
+ case AArch64::LDRBBui:
+ // Unscaled instructions.
case AArch64::STURSi:
case AArch64::STURDi:
case AArch64::STURQi:
@@ -1012,25 +1720,41 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
case AArch64::LDURDi:
case AArch64::LDURQi:
case AArch64::LDURWi:
- case AArch64::LDURXi: {
+ case AArch64::LDURXi:
+ // Paired instructions.
+ case AArch64::LDPSi:
+ case AArch64::LDPSWi:
+ case AArch64::LDPDi:
+ case AArch64::LDPQi:
+ case AArch64::LDPWi:
+ case AArch64::LDPXi:
+ case AArch64::STPSi:
+ case AArch64::STPDi:
+ case AArch64::STPQi:
+ case AArch64::STPWi:
+ case AArch64::STPXi: {
// Make sure this is a reg+imm (as opposed to an address reloc).
- if (!MI->getOperand(2).isImm()) {
+ if (!getLdStOffsetOp(MI).isImm()) {
++MBBI;
break;
}
- // Look ahead up to ScanLimit instructions for a mergable instruction.
+ // Look forward to try to form a post-index instruction. For example,
+ // ldr x0, [x20]
+ // add x20, x20, #32
+ // merged into:
+ // ldr x0, [x20], #32
MachineBasicBlock::iterator Update =
findMatchingUpdateInsnForward(MBBI, ScanLimit, 0);
if (Update != E) {
// Merge the update into the ld/st.
- MBBI = mergePostIdxUpdateInsn(MBBI, Update);
+ MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
Modified = true;
++NumPostFolded;
break;
}
// Don't know how to handle pre/post-index versions, so move to the next
// instruction.
- if (isUnscaledLdst(Opc)) {
+ if (isUnscaledLdSt(Opc)) {
++MBBI;
break;
}
@@ -1043,28 +1767,25 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit);
if (Update != E) {
// Merge the update into the ld/st.
- MBBI = mergePreIdxUpdateInsn(MBBI, Update);
+ MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
Modified = true;
++NumPreFolded;
break;
}
+ // The immediate in the load/store is scaled by the size of the memory
+ // operation. The immediate in the add we're looking for,
+ // however, is not, so adjust here.
+ int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
// Look forward to try to find a post-index instruction. For example,
// ldr x1, [x0, #64]
// add x0, x0, #64
// merged into:
// ldr x1, [x0, #64]!
-
- // The immediate in the load/store is scaled by the size of the register
- // being loaded. The immediate in the add we're looking for,
- // however, is not, so adjust here.
- int Value = MI->getOperand(2).getImm() *
- TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent()))
- ->getSize();
- Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, Value);
+ Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, UnscaledOffset);
if (Update != E) {
// Merge the update into the ld/st.
- MBBI = mergePreIdxUpdateInsn(MBBI, Update);
+ MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
Modified = true;
++NumPreFolded;
break;
@@ -1081,13 +1802,24 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
return Modified;
}
+bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
+ bool ProfitableArch = Subtarget->isCortexA57();
+ // FIXME: The benefit from converting narrow loads into a wider load is
+ // microarchitecture-dependent, as it assumes that a single load with two
+ // bitfield extracts is cheaper than two narrow loads. Currently, this
+ // conversion is enabled only on Cortex-A57, where the performance benefit was
+ // verified.
+ return ProfitableArch && !Subtarget->requiresStrictAlign();
+}
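// Illustrative sketch, not part of this patch: the narrow-load merge gated by
// enableNarrowLdMerge() rebuilds two adjacent 16-bit loads from one 32-bit load,
// assuming a little-endian layout; the and/ubfx pair shown in the optimizeBlock()
// comment computes exactly these two halves. The helper name is invented for
// illustration.
#include <cstdint>

static void splitMergedWord(uint32_t Wide, uint16_t &Lo, uint16_t &Hi) {
  Lo = uint16_t(Wide & 0xFFFFu); // and w0, w0, #0xffff
  Hi = uint16_t(Wide >> 16);     // ubfx w1, w0, #16, #16
}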
+
bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
- TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo());
- TRI = Fn.getSubtarget().getRegisterInfo();
+ Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
+ TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
+ TRI = Subtarget->getRegisterInfo();
bool Modified = false;
+ bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
for (auto &MBB : Fn)
- Modified |= optimizeBlock(MBB);
+ Modified |= optimizeBlock(MBB, enableNarrowLdOpt);
return Modified;
}
@@ -1095,8 +1827,8 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
// loads and stores near one another?
-/// createARMLoadStoreOptimizationPass - returns an instance of the load / store
-/// optimization pass.
+/// createAArch64LoadStoreOptimizationPass - returns an instance of the
+/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
return new AArch64LoadStoreOpt();
}
diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp
index 580427ab3cc1..2b4cdf1083be 100644
--- a/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -207,9 +207,9 @@ bool AArch64MCInstLower::lowerOperand(const MachineOperand &MO,
void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
OutMI.setOpcode(MI->getOpcode());
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ for (const MachineOperand &MO : MI->operands()) {
MCOperand MCOp;
- if (lowerOperand(MI->getOperand(i), MCOp))
+ if (lowerOperand(MO, MCOp))
OutMI.addOperand(MCOp);
}
}
diff --git a/lib/Target/AArch64/AArch64MachineCombinerPattern.h b/lib/Target/AArch64/AArch64MachineCombinerPattern.h
deleted file mode 100644
index 4164b3364559..000000000000
--- a/lib/Target/AArch64/AArch64MachineCombinerPattern.h
+++ /dev/null
@@ -1,42 +0,0 @@
-//===- AArch64MachineCombinerPattern.h -===//
-//===- AArch64 instruction pattern supported by combiner -===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines instruction pattern supported by combiner
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINECOMBINERPATTERN_H
-#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINECOMBINERPATTERN_H
-
-namespace llvm {
-
-/// Enumeration of instruction pattern supported by machine combiner
-///
-///
-namespace MachineCombinerPattern {
-enum MC_PATTERN : int {
- MC_NONE = 0,
- MC_MULADDW_OP1 = 1,
- MC_MULADDW_OP2 = 2,
- MC_MULSUBW_OP1 = 3,
- MC_MULSUBW_OP2 = 4,
- MC_MULADDWI_OP1 = 5,
- MC_MULSUBWI_OP1 = 6,
- MC_MULADDX_OP1 = 7,
- MC_MULADDX_OP2 = 8,
- MC_MULSUBX_OP1 = 9,
- MC_MULSUBX_OP2 = 10,
- MC_MULADDXI_OP1 = 11,
- MC_MULSUBXI_OP1 = 12
-};
-} // end namespace MachineCombinerPattern
-} // end namespace llvm
-
-#endif
diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 536a8d0f97a0..318f83953505 100644
--- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -1,4 +1,4 @@
-//=- AArch64MachineFuctionInfo.h - AArch64 machine function info --*- C++ -*-=//
+//=- AArch64MachineFunctionInfo.h - AArch64 machine function info -*- C++ -*-=//
//
// The LLVM Compiler Infrastructure
//
@@ -42,7 +42,7 @@ class AArch64FunctionInfo : public MachineFunctionInfo {
unsigned ArgumentStackToRestore;
/// HasStackFrame - True if this function has a stack frame. Set by
- /// processFunctionBeforeCalleeSavedScan().
+ /// determineCalleeSaves().
bool HasStackFrame;
/// \brief Amount of stack frame size, not including callee-saved registers.
@@ -72,16 +72,22 @@ class AArch64FunctionInfo : public MachineFunctionInfo {
/// registers.
unsigned VarArgsFPRSize;
+ /// True if this function has a subset of CSRs that is handled explicitly via
+ /// copies.
+ bool IsSplitCSR;
+
public:
AArch64FunctionInfo()
: BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
- VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {}
+ VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0),
+ IsSplitCSR(false) {}
explicit AArch64FunctionInfo(MachineFunction &MF)
: BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
- VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {
+ VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0),
+ IsSplitCSR(false) {
(void)MF;
}
@@ -96,6 +102,9 @@ public:
bool hasStackFrame() const { return HasStackFrame; }
void setHasStackFrame(bool s) { HasStackFrame = s; }
+ bool isSplitCSR() const { return IsSplitCSR; }
+ void setIsSplitCSR(bool s) { IsSplitCSR = s; }
+
void setLocalStackSize(unsigned Size) { LocalStackSize = Size; }
unsigned getLocalStackSize() const { return LocalStackSize; }
diff --git a/lib/Target/AArch64/AArch64PromoteConstant.cpp b/lib/Target/AArch64/AArch64PromoteConstant.cpp
index e1b93bf07c89..79c09d9f058d 100644
--- a/lib/Target/AArch64/AArch64PromoteConstant.cpp
+++ b/lib/Target/AArch64/AArch64PromoteConstant.cpp
@@ -489,7 +489,7 @@ bool AArch64PromoteConstant::insertDefinitions(
for (const auto &IPI : InsertPts) {
// Create the load of the global variable.
- IRBuilder<> Builder(IPI.first->getParent(), IPI.first);
+ IRBuilder<> Builder(IPI.first);
LoadInst *LoadedCst = Builder.CreateLoad(PromotedGV);
DEBUG(dbgs() << "**********\n");
DEBUG(dbgs() << "New def: ");
@@ -540,7 +540,7 @@ bool AArch64PromoteConstant::runOnFunction(Function &F) {
bool LocalChange = false;
SmallPtrSet<Constant *, 8> AlreadyChecked;
- for (Instruction &I : inst_range(&F)) {
+ for (Instruction &I : instructions(&F)) {
// Traverse the operand, looking for constant vectors. Replace them by a
// load of a global variable of constant vector type.
for (Value *Op : I.operand_values()) {
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 841af55f7a65..32b4888f2f64 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -15,6 +15,7 @@
#include "AArch64RegisterInfo.h"
#include "AArch64FrameLowering.h"
#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
@@ -34,10 +35,6 @@ using namespace llvm;
#define GET_REGINFO_TARGET_DESC
#include "AArch64GenRegisterInfo.inc"
-static cl::opt<bool>
-ReserveX18("aarch64-reserve-x18", cl::Hidden,
- cl::desc("Reserve X18, making it unavailable as GPR"));
-
AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT)
: AArch64GenRegisterInfo(AArch64::LR), TT(TT) {}
@@ -50,10 +47,23 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return CSR_AArch64_NoRegs_SaveList;
if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg)
return CSR_AArch64_AllRegs_SaveList;
+ if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS)
+ return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() ?
+ CSR_AArch64_CXX_TLS_Darwin_PE_SaveList :
+ CSR_AArch64_CXX_TLS_Darwin_SaveList;
else
return CSR_AArch64_AAPCS_SaveList;
}
+const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy(
+ const MachineFunction *MF) const {
+ assert(MF && "Invalid MachineFunction pointer.");
+ if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
+ MF->getInfo<AArch64FunctionInfo>()->isSplitCSR())
+ return CSR_AArch64_CXX_TLS_Darwin_ViaCopy_SaveList;
+ return nullptr;
+}
+
const uint32_t *
AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
@@ -62,6 +72,8 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
return CSR_AArch64_NoRegs_RegMask;
if (CC == CallingConv::AnyReg)
return CSR_AArch64_AllRegs_RegMask;
+ if (CC == CallingConv::CXX_FAST_TLS)
+ return CSR_AArch64_CXX_TLS_Darwin_RegMask;
else
return CSR_AArch64_AAPCS_RegMask;
}
@@ -104,7 +116,7 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(AArch64::W29);
}
- if (TT.isOSDarwin() || ReserveX18) {
+ if (MF.getSubtarget<AArch64Subtarget>().isX18Reserved()) {
Reserved.set(AArch64::X18); // Platform register
Reserved.set(AArch64::W18);
}
@@ -131,7 +143,7 @@ bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF,
return true;
case AArch64::X18:
case AArch64::W18:
- return TT.isOSDarwin() || ReserveX18;
+ return MF.getSubtarget<AArch64Subtarget>().isX18Reserved();
case AArch64::FP:
case AArch64::W29:
return TFI->hasFP(MF) || TT.isOSDarwin();
@@ -186,29 +198,6 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
return false;
}
-bool AArch64RegisterInfo::canRealignStack(const MachineFunction &MF) const {
-
- if (MF.getFunction()->hasFnAttribute("no-realign-stack"))
- return false;
-
- return true;
-}
-
-// FIXME: share this with other backends with identical implementation?
-bool
-AArch64RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
- const MachineFrameInfo *MFI = MF.getFrameInfo();
- const AArch64FrameLowering *TFI = getFrameLowering(MF);
- const Function *F = MF.getFunction();
- unsigned StackAlign = TFI->getStackAlignment();
- bool requiresRealignment =
- ((MFI->getMaxAlignment() > StackAlign) ||
- F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
- Attribute::StackAlignment));
-
- return requiresRealignment && canRealignStack(MF);
-}
-
unsigned
AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const AArch64FrameLowering *TFI = getFrameLowering(MF);
@@ -424,10 +413,11 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
case AArch64::GPR64RegClassID:
case AArch64::GPR32commonRegClassID:
case AArch64::GPR64commonRegClassID:
- return 32 - 1 // XZR/SP
- - (TFI->hasFP(MF) || TT.isOSDarwin()) // FP
- - (TT.isOSDarwin() || ReserveX18) // X18 reserved as platform register
- - hasBasePointer(MF); // X19
+ return 32 - 1 // XZR/SP
+ - (TFI->hasFP(MF) || TT.isOSDarwin()) // FP
+ - MF.getSubtarget<AArch64Subtarget>()
+ .isX18Reserved() // X18 reserved as platform register
+ - hasBasePointer(MF); // X19
case AArch64::FPR8RegClassID:
case AArch64::FPR16RegClassID:
case AArch64::FPR32RegClassID:
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.h b/lib/Target/AArch64/AArch64RegisterInfo.h
index 8c379d926108..f33f788fd437 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -35,6 +35,8 @@ public:
/// Code Generation virtual methods...
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+ const MCPhysReg *
+ getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override;
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID) const override;
@@ -93,9 +95,6 @@ public:
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const override;
- // Base pointer (stack realignment) support.
- bool canRealignStack(const MachineFunction &MF) const;
- bool needsStackRealignment(const MachineFunction &MF) const override;
};
} // end namespace llvm
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td
index b2efca023372..a8c8b176efa9 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -407,7 +407,7 @@ def FPR128 : RegisterClass<"AArch64",
// The lower 16 vector registers. Some instructions can only take registers
// in this range.
def FPR128_lo : RegisterClass<"AArch64",
- [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16],
128, (trunc FPR128, 16)>;
// Pairs, triples, and quads of 64-bit vector registers.
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index 486efd6ce3a2..f6ee8cf47a6a 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -31,6 +31,11 @@ static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
"converter pass"), cl::init(true), cl::Hidden);
+// If the OS supports TBI, use this flag to enable it.
+static cl::opt<bool>
+UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
+ "an address is ignored"), cl::init(false), cl::Hidden);
+
AArch64Subtarget &
AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) {
// Determine default and user-specified characteristics
@@ -46,9 +51,11 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const TargetMachine &TM, bool LittleEndian)
: AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others),
- HasV8_1aOps(false), HasFPARMv8(false), HasNEON(false), HasCrypto(false),
- HasCRC(false), HasZeroCycleRegMove(false), HasZeroCycleZeroing(false),
- IsLittle(LittleEndian), CPUString(CPU), TargetTriple(TT), FrameLowering(),
+ HasV8_1aOps(false), HasV8_2aOps(false), HasFPARMv8(false), HasNEON(false),
+ HasCrypto(false), HasCRC(false), HasPerfMon(false), HasFullFP16(false),
+ HasZeroCycleRegMove(false), HasZeroCycleZeroing(false),
+ StrictAlign(false), ReserveX18(TT.isOSDarwin()), IsLittle(LittleEndian),
+ CPUString(CPU), TargetTriple(TT), FrameLowering(),
InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(),
TLInfo(TM, *this) {}
@@ -113,12 +120,30 @@ void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
// bi-directional scheduling. 253.perlbmk.
Policy.OnlyTopDown = false;
Policy.OnlyBottomUp = false;
+ // Enabling or disabling the latency heuristic is a close call: it seems to
+ // help nearly no benchmark on out-of-order architectures; on the other hand,
+ // it regresses register pressure on a few benchmarks.
+ if (isCyclone())
+ Policy.DisableLatencyHeuristic = true;
}
bool AArch64Subtarget::enableEarlyIfConversion() const {
return EnableEarlyIfConvert;
}
+bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
+ if (!UseAddressTopByteIgnored)
+ return false;
+
+ if (TargetTriple.isiOS()) {
+ unsigned Major, Minor, Micro;
+ TargetTriple.getiOSVersion(Major, Minor, Micro);
+ return Major >= 8;
+ }
+
+ return false;
+}
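// Illustrative sketch, not part of this patch: what supportsAddressTopByteIgnored()
// reports. When the OS enables TBI, bits 63:56 of a pointer are ignored during
// address translation, so a tagged pointer can be dereferenced without masking
// the tag away first. The tag value and helper below are made up for illustration.
#include <cstdint>

static int loadThroughTaggedPointer(const int *P) {
  uintptr_t Tagged = reinterpret_cast<uintptr_t>(P) | (0x5AULL << 56);
  const int *Q = reinterpret_cast<const int *>(Tagged);
  return *Q; // fine under TBI; would fault if the top byte took part in translation
}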
+
std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
if (!isCortexA57())
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 6bb069423060..1b8b9b27719c 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -33,17 +33,21 @@ class Triple;
class AArch64Subtarget : public AArch64GenSubtargetInfo {
protected:
- enum ARMProcFamilyEnum {Others, CortexA53, CortexA57, Cyclone};
+ enum ARMProcFamilyEnum {Others, CortexA35, CortexA53, CortexA57, Cyclone};
/// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
ARMProcFamilyEnum ARMProcFamily;
bool HasV8_1aOps;
+ bool HasV8_2aOps;
bool HasFPARMv8;
bool HasNEON;
bool HasCrypto;
bool HasCRC;
+ bool HasPerfMon;
+ bool HasFullFP16;
+ bool HasSPE;
// HasZeroCycleRegMove - Has zero-cycle register mov instructions.
bool HasZeroCycleRegMove;
@@ -51,6 +55,12 @@ protected:
// HasZeroCycleZeroing - Has zero-cycle zeroing instructions.
bool HasZeroCycleZeroing;
+ // StrictAlign - Disallow unaligned memory accesses.
+ bool StrictAlign;
+
+ // ReserveX18 - X18 is not available as a general purpose register.
+ bool ReserveX18;
+
bool IsLittle;
/// CPUString - String name of used CPU.
@@ -92,19 +102,30 @@ public:
const Triple &getTargetTriple() const { return TargetTriple; }
bool enableMachineScheduler() const override { return true; }
bool enablePostRAScheduler() const override {
- return isCortexA53() || isCortexA57();
+ return isGeneric() || isCortexA53() || isCortexA57();
}
bool hasV8_1aOps() const { return HasV8_1aOps; }
+ bool hasV8_2aOps() const { return HasV8_2aOps; }
bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; }
bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; }
+ bool requiresStrictAlign() const { return StrictAlign; }
+
+ bool isX18Reserved() const { return ReserveX18; }
bool hasFPARMv8() const { return HasFPARMv8; }
bool hasNEON() const { return HasNEON; }
bool hasCrypto() const { return HasCrypto; }
bool hasCRC() const { return HasCRC; }
+ /// CPU has TBI (top byte of addresses is ignored during HW address
+ /// translation) and OS enables it.
+ bool supportsAddressTopByteIgnored() const;
+
+ bool hasPerfMon() const { return HasPerfMon; }
+ bool hasFullFP16() const { return HasFullFP16; }
+ bool hasSPE() const { return HasSPE; }
bool isLittleEndian() const { return IsLittle; }
@@ -112,11 +133,13 @@ public:
bool isTargetIOS() const { return TargetTriple.isiOS(); }
bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
bool isTargetWindows() const { return TargetTriple.isOSWindows(); }
+ bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
+ bool isGeneric() const { return CPUString == "generic"; }
bool isCyclone() const { return CPUString == "cyclone"; }
bool isCortexA57() const { return CPUString == "cortex-a57"; }
bool isCortexA53() const { return CPUString == "cortex-a53"; }
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index db6e244337a7..c52c5544fc7e 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -203,7 +203,7 @@ public:
} // namespace
TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](Function &F) {
+ return TargetIRAnalysis([this](const Function &F) {
return TargetTransformInfo(AArch64TTIImpl(this, F));
});
}
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index e085cca35f1c..9af0e6444789 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -23,7 +23,7 @@ using namespace llvm;
/// \brief Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
-unsigned AArch64TTIImpl::getIntImmCost(int64_t Val) {
+int AArch64TTIImpl::getIntImmCost(int64_t Val) {
// Check if the immediate can be encoded within an instruction.
if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
return 0;
@@ -37,7 +37,7 @@ unsigned AArch64TTIImpl::getIntImmCost(int64_t Val) {
}
/// \brief Calculate the cost of materializing the given constant.
-unsigned AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -51,18 +51,18 @@ unsigned AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
// Split the constant into 64-bit chunks and calculate the cost for each
// chunk.
- unsigned Cost = 0;
+ int Cost = 0;
for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
int64_t Val = Tmp.getSExtValue();
Cost += getIntImmCost(Val);
}
// We need at least one instruction to materialize the constant.
- return std::max(1U, Cost);
+ return std::max(1, Cost);
}
-unsigned AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+int AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -118,17 +118,17 @@ unsigned AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
}
if (Idx == ImmIdx) {
- unsigned NumConstants = (BitSize + 63) / 64;
- unsigned Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
+ int NumConstants = (BitSize + 63) / 64;
+ int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
return (Cost <= NumConstants * TTI::TCC_Basic)
- ? static_cast<unsigned>(TTI::TCC_Free)
+ ? static_cast<int>(TTI::TCC_Free)
: Cost;
}
return AArch64TTIImpl::getIntImmCost(Imm, Ty);
}
-unsigned AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+int AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+ const APInt &Imm, Type *Ty) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -147,10 +147,10 @@ unsigned AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
case Intrinsic::smul_with_overflow:
case Intrinsic::umul_with_overflow:
if (Idx == 1) {
- unsigned NumConstants = (BitSize + 63) / 64;
- unsigned Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
+ int NumConstants = (BitSize + 63) / 64;
+ int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
return (Cost <= NumConstants * TTI::TCC_Basic)
- ? static_cast<unsigned>(TTI::TCC_Free)
+ ? static_cast<int>(TTI::TCC_Free)
: Cost;
}
break;
@@ -176,8 +176,7 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
return TTI::PSK_Software;
}
-unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
- Type *Src) {
+int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
@@ -187,7 +186,31 @@ unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
if (!SrcTy.isSimple() || !DstTy.isSimple())
return BaseT::getCastInstrCost(Opcode, Dst, Src);
- static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = {
+ static const TypeConversionCostTblEntry
+ ConversionTbl[] = {
+ { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
+
+ // The number of shll instructions for the extension.
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
+
// LowerVectorINT_TO_FP:
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
@@ -210,6 +233,16 @@ unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
+ // Complex: to v8f32
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
+
+ // Complex: to v16f32
+ { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
+
// Complex: to v2f64
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
@@ -250,22 +283,21 @@ unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
{ ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
};
- int Idx = ConvertCostTableLookup<MVT>(
- ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(),
- SrcTy.getSimpleVT());
- if (Idx != -1)
- return ConversionTbl[Idx].Cost;
+ if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
+ DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost;
return BaseT::getCastInstrCost(Opcode, Dst, Src);
}
-unsigned AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
- unsigned Index) {
+int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) {
assert(Val->isVectorTy() && "This must be a vector type");
if (Index != -1U) {
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
// This type is legalized to a scalar type.
if (!LT.second.isVector())
@@ -281,15 +313,15 @@ unsigned AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
}
// All other insert/extracts cost this much.
- return 2;
+ return 3;
}
-unsigned AArch64TTIImpl::getArithmeticInstrCost(
+int AArch64TTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo) {
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
@@ -300,10 +332,9 @@ unsigned AArch64TTIImpl::getArithmeticInstrCost(
// normally expanded to the sequence ADD + CMP + SELECT + SRA.
// The OperandValue properties may not be the same as those of the previous
// operation; conservatively assume OP_None.
- unsigned Cost =
- getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
+ int Cost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
@@ -331,7 +362,7 @@ unsigned AArch64TTIImpl::getArithmeticInstrCost(
}
}
-unsigned AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+int AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
// Address computations in vectorized code with non-consecutive addresses will
// likely result in more instructions compared to scalar code where the
// computation can more often be merged into the index mode. The resulting
@@ -346,19 +377,20 @@ unsigned AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
return 1;
}
-unsigned AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy) {
+int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
- // We don't lower vector selects well that are wider than the register width.
+ // We don't lower some vector selects well that are wider than the register
+ // width.
if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
// We would need this many instructions to hide the scalarization happening.
- const unsigned AmortizationCost = 20;
- static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+ const int AmortizationCost = 20;
+ static const TypeConversionCostTblEntry
VectorSelectTbl[] = {
- { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost },
- { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost },
- { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost },
+ { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
+ { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
+ { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
{ ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
{ ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
{ ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
@@ -367,20 +399,18 @@ unsigned AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
EVT SelCondTy = TLI->getValueType(DL, CondTy);
EVT SelValTy = TLI->getValueType(DL, ValTy);
if (SelCondTy.isSimple() && SelValTy.isSimple()) {
- int Idx =
- ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(),
- SelValTy.getSimpleVT());
- if (Idx != -1)
- return VectorSelectTbl[Idx].Cost;
+ if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
+ SelCondTy.getSimpleVT(),
+ SelValTy.getSimpleVT()))
+ return Entry->Cost;
}
}
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}
-unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
- unsigned Alignment,
- unsigned AddressSpace) {
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+ unsigned Alignment, unsigned AddressSpace) {
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
Src->getVectorElementType()->isIntegerTy(64)) {
@@ -389,7 +419,7 @@ unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
// practice on inlined memcpy code.
// We make v2i64 stores expensive so that we will only vectorize if there
// are 6 other instructions getting vectorized.
- unsigned AmortizationCost = 6;
+ int AmortizationCost = 6;
return LT.first * 2 * AmortizationCost;
}
@@ -407,16 +437,18 @@ unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
return LT.first;
}
-unsigned AArch64TTIImpl::getInterleavedMemoryOpCost(
- unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
- unsigned Alignment, unsigned AddressSpace) {
+int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace) {
assert(Factor >= 2 && "Invalid interleave factor");
assert(isa<VectorType>(VecTy) && "Expect a vector type");
if (Factor <= TLI->getMaxSupportedInterleaveFactor()) {
unsigned NumElts = VecTy->getVectorNumElements();
Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
- unsigned SubVecSize = DL.getTypeAllocSize(SubVecTy);
+ unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
// ldN/stN only support legal vector types of size 64 or 128 in bits.
if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
@@ -427,8 +459,8 @@ unsigned AArch64TTIImpl::getInterleavedMemoryOpCost(
Alignment, AddressSpace);
}
-unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
- unsigned Cost = 0;
+int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
+ int Cost = 0;
for (auto *I : Tys) {
if (!I->isVectorTy())
continue;
@@ -506,7 +538,7 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
case Intrinsic::aarch64_neon_ld4:
Info.ReadMem = true;
Info.WriteMem = false;
- Info.Vol = false;
+ Info.IsSimple = true;
Info.NumMemRefs = 1;
Info.PtrVal = Inst->getArgOperand(0);
break;
@@ -515,7 +547,7 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
case Intrinsic::aarch64_neon_st4:
Info.ReadMem = false;
Info.WriteMem = true;
- Info.Vol = false;
+ Info.IsSimple = true;
Info.NumMemRefs = 1;
Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
break;
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 444d3ccc15e1..ec58c4fe309f 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -48,7 +48,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
};
public:
- explicit AArch64TTIImpl(const AArch64TargetMachine *TM, Function &F)
+ explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
@@ -63,12 +63,11 @@ public:
/// @{
using BaseT::getIntImmCost;
- unsigned getIntImmCost(int64_t Val);
- unsigned getIntImmCost(const APInt &Imm, Type *Ty);
- unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty);
- unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
- Type *Ty);
+ int getIntImmCost(int64_t Val);
+ int getIntImmCost(const APInt &Imm, Type *Ty);
+ int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+ int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+ Type *Ty);
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
/// @}
@@ -76,6 +75,8 @@ public:
/// \name Vector TTI Implementations
/// @{
+ bool enableInterleavedAccessVectorization() { return true; }
+
unsigned getNumberOfRegisters(bool Vector) {
if (Vector) {
if (ST->hasNEON())
@@ -96,25 +97,25 @@ public:
unsigned getMaxInterleaveFactor(unsigned VF);
- unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
- unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+ int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
- unsigned getArithmeticInstrCost(
+ int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
- unsigned getAddressComputationCost(Type *Ty, bool IsComplex);
+ int getAddressComputationCost(Type *Ty, bool IsComplex);
- unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
- unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace);
+ int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace);
- unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys);
+ int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys);
void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
@@ -123,11 +124,9 @@ public:
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);
- unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
- unsigned Factor,
- ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace);
+ int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+ ArrayRef<unsigned> Indices, unsigned Alignment,
+ unsigned AddressSpace);
/// @}
};
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 38e8b4d9a938..394c8e78581f 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -43,7 +43,6 @@ class AArch64Operand;
class AArch64AsmParser : public MCTargetAsmParser {
private:
StringRef Mnemonic; ///< Instruction mnemonic.
- MCSubtargetInfo &STI;
// Map of register aliases registers via the .req directive.
StringMap<std::pair<bool, unsigned> > RegisterReqs;
@@ -101,6 +100,7 @@ private:
OperandMatchResultTy tryParseSysReg(OperandVector &Operands);
OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands);
OperandMatchResultTy tryParsePrefetch(OperandVector &Operands);
+ OperandMatchResultTy tryParsePSBHint(OperandVector &Operands);
OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands);
OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands);
OperandMatchResultTy tryParseFPImm(OperandVector &Operands);
@@ -115,16 +115,16 @@ public:
#define GET_OPERAND_DIAGNOSTIC_TYPES
#include "AArch64GenAsmMatcher.inc"
};
- AArch64AsmParser(MCSubtargetInfo &STI, MCAsmParser &Parser,
+ AArch64AsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
const MCInstrInfo &MII, const MCTargetOptions &Options)
- : MCTargetAsmParser(), STI(STI) {
+ : MCTargetAsmParser(Options, STI) {
MCAsmParserExtension::Initialize(Parser);
MCStreamer &S = getParser().getStreamer();
if (S.getTargetStreamer() == nullptr)
new AArch64TargetStreamer(S);
// Initialize the set of available features.
- setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+ setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
}
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
@@ -160,7 +160,8 @@ private:
k_Prefetch,
k_ShiftExtend,
k_FPImm,
- k_Barrier
+ k_Barrier,
+ k_PSBHint,
} Kind;
SMLoc StartLoc, EndLoc;
@@ -228,6 +229,12 @@ private:
unsigned Length;
};
+ struct PSBHintOp {
+ unsigned Val;
+ const char *Data;
+ unsigned Length;
+ };
+
struct ShiftExtendOp {
AArch64_AM::ShiftExtendType Type;
unsigned Amount;
@@ -251,6 +258,7 @@ private:
struct SysRegOp SysReg;
struct SysCRImmOp SysCRImm;
struct PrefetchOp Prefetch;
+ struct PSBHintOp PSBHint;
struct ShiftExtendOp ShiftExtend;
};
@@ -302,6 +310,9 @@ public:
case k_Prefetch:
Prefetch = o.Prefetch;
break;
+ case k_PSBHint:
+ PSBHint = o.PSBHint;
+ break;
case k_ShiftExtend:
ShiftExtend = o.ShiftExtend;
break;
@@ -393,6 +404,16 @@ public:
return Prefetch.Val;
}
+ unsigned getPSBHint() const {
+ assert(Kind == k_PSBHint && "Invalid access!");
+ return PSBHint.Val;
+ }
+
+ StringRef getPSBHintName() const {
+ assert(Kind == k_PSBHint && "Invalid access!");
+ return StringRef(PSBHint.Data, PSBHint.Length);
+ }
+
StringRef getPrefetchName() const {
assert(Kind == k_Prefetch && "Invalid access!");
return StringRef(Prefetch.Data, Prefetch.Length);
@@ -497,6 +518,15 @@ public:
return (Val % Scale) == 0 && Val >= 0 && (Val / Scale) < 0x1000;
}
+ bool isImm0_1() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= 0 && Val < 2);
+ }
bool isImm0_7() const {
if (!isImm())
return false;
@@ -876,12 +906,15 @@ public:
}
bool isMSRSystemRegister() const {
if (!isSysReg()) return false;
-
return SysReg.MSRReg != -1U;
}
- bool isSystemPStateField() const {
+ bool isSystemPStateFieldWithImm0_1() const {
if (!isSysReg()) return false;
-
+ return (SysReg.PStateField == AArch64PState::PAN ||
+ SysReg.PStateField == AArch64PState::UAO);
+ }
+ bool isSystemPStateFieldWithImm0_15() const {
+ if (!isSysReg() || isSystemPStateFieldWithImm0_1()) return false;
return SysReg.PStateField != -1U;
}
bool isReg() const override { return Kind == k_Register && !Reg.isVector; }
@@ -950,6 +983,7 @@ public:
}
bool isSysCR() const { return Kind == k_SysCR; }
bool isPrefetch() const { return Kind == k_Prefetch; }
+ bool isPSBHint() const { return Kind == k_PSBHint; }
bool isShiftExtend() const { return Kind == k_ShiftExtend; }
bool isShifter() const {
if (!isShiftExtend())
@@ -1175,8 +1209,10 @@ public:
template <unsigned NumRegs>
void addVectorList64Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- static unsigned FirstRegs[] = { AArch64::D0, AArch64::D0_D1,
- AArch64::D0_D1_D2, AArch64::D0_D1_D2_D3 };
+ static const unsigned FirstRegs[] = { AArch64::D0,
+ AArch64::D0_D1,
+ AArch64::D0_D1_D2,
+ AArch64::D0_D1_D2_D3 };
unsigned FirstReg = FirstRegs[NumRegs - 1];
Inst.addOperand(
@@ -1186,8 +1222,10 @@ public:
template <unsigned NumRegs>
void addVectorList128Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- static unsigned FirstRegs[] = { AArch64::Q0, AArch64::Q0_Q1,
- AArch64::Q0_Q1_Q2, AArch64::Q0_Q1_Q2_Q3 };
+ static const unsigned FirstRegs[] = { AArch64::Q0,
+ AArch64::Q0_Q1,
+ AArch64::Q0_Q1_Q2,
+ AArch64::Q0_Q1_Q2_Q3 };
unsigned FirstReg = FirstRegs[NumRegs - 1];
Inst.addOperand(
@@ -1304,6 +1342,12 @@ public:
Inst.addOperand(MCOperand::createImm(MCE->getValue() / 16));
}
+ void addImm0_1Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+ Inst.addOperand(MCOperand::createImm(MCE->getValue()));
+ }
+
void addImm0_7Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
@@ -1491,7 +1535,13 @@ public:
Inst.addOperand(MCOperand::createImm(SysReg.MSRReg));
}
- void addSystemPStateFieldOperands(MCInst &Inst, unsigned N) const {
+ void addSystemPStateFieldWithImm0_1Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+
+ Inst.addOperand(MCOperand::createImm(SysReg.PStateField));
+ }
+
+ void addSystemPStateFieldWithImm0_15Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createImm(SysReg.PStateField));
@@ -1507,6 +1557,11 @@ public:
Inst.addOperand(MCOperand::createImm(getPrefetch()));
}
+ void addPSBHintOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createImm(getPSBHint()));
+ }
+
void addShifterOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
unsigned Imm =
@@ -1703,6 +1758,19 @@ public:
return Op;
}
+ static std::unique_ptr<AArch64Operand> CreatePSBHint(unsigned Val,
+ StringRef Str,
+ SMLoc S,
+ MCContext &Ctx) {
+ auto Op = make_unique<AArch64Operand>(k_PSBHint, Ctx);
+ Op->PSBHint.Val = Val;
+ Op->PSBHint.Data = Str.data();
+ Op->PSBHint.Length = Str.size();
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
static std::unique_ptr<AArch64Operand>
CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, unsigned Val,
bool HasExplicitAmount, SMLoc S, SMLoc E, MCContext &Ctx) {
@@ -1776,6 +1844,10 @@ void AArch64Operand::print(raw_ostream &OS) const {
OS << "<prfop invalid #" << getPrefetch() << ">";
break;
}
+ case k_PSBHint: {
+ OS << getPSBHintName();
+ break;
+ }
case k_ShiftExtend: {
OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #"
<< getShiftExtendAmount();
@@ -1849,6 +1921,8 @@ static bool isValidVectorKind(StringRef Name) {
.Case(".h", true)
.Case(".s", true)
.Case(".d", true)
+ // Needed for fp16 scalar pairwise reductions
+ .Case(".2h", true)
.Default(false);
}
@@ -2016,7 +2090,7 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
bool Valid;
auto Mapper = AArch64PRFM::PRFMMapper();
StringRef Name =
- Mapper.toString(MCE->getValue(), STI.getFeatureBits(), Valid);
+ Mapper.toString(MCE->getValue(), getSTI().getFeatureBits(), Valid);
Operands.push_back(AArch64Operand::CreatePrefetch(prfop, Name,
S, getContext()));
return MatchOperand_Success;
@@ -2030,7 +2104,7 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
bool Valid;
auto Mapper = AArch64PRFM::PRFMMapper();
unsigned prfop =
- Mapper.fromString(Tok.getString(), STI.getFeatureBits(), Valid);
+ Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid);
if (!Valid) {
TokError("pre-fetch hint expected");
return MatchOperand_ParseFail;
@@ -2042,6 +2116,32 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
return MatchOperand_Success;
}
+/// tryParsePSBHint - Try to parse a PSB hint operand, mapped to the HINT instruction.
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ SMLoc S = getLoc();
+ const AsmToken &Tok = Parser.getTok();
+ if (Tok.isNot(AsmToken::Identifier)) {
+ TokError("invalid operand for instruction");
+ return MatchOperand_ParseFail;
+ }
+
+ bool Valid;
+ auto Mapper = AArch64PSBHint::PSBHintMapper();
+ unsigned psbhint =
+ Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid);
+ if (!Valid) {
+ TokError("invalid operand for instruction");
+ return MatchOperand_ParseFail;
+ }
+
+ Parser.Lex(); // Eat identifier token.
+ Operands.push_back(AArch64Operand::CreatePSBHint(psbhint, Tok.getString(),
+ S, getContext()));
+ return MatchOperand_Success;
+}
+
/// tryParseAdrpLabel - Parse and validate a source label for the ADRP
/// instruction.
AArch64AsmParser::OperandMatchResultTy
@@ -2439,6 +2539,13 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
} else if (!Op.compare_lower("cisw")) {
// SYS #0, C7, C14, #2
SYS_ALIAS(0, 7, 14, 2);
+ } else if (!Op.compare_lower("cvap")) {
+ if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) {
+ // SYS #3, C7, C12, #1
+ SYS_ALIAS(3, 7, 12, 1);
+ } else {
+ return TokError("DC CVAP requires ARMv8.2a");
+ }
} else {
return TokError("invalid operand for DC instruction");
}
@@ -2479,6 +2586,20 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
} else if (!Op.compare_lower("s12e0w")) {
// SYS #4, C7, C8, #7
SYS_ALIAS(4, 7, 8, 7);
+ } else if (!Op.compare_lower("s1e1rp")) {
+ if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) {
+ // SYS #0, C7, C9, #0
+ SYS_ALIAS(0, 7, 9, 0);
+ } else {
+ return TokError("AT S1E1RP requires ARMv8.2a");
+ }
+ } else if (!Op.compare_lower("s1e1wp")) {
+ if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) {
+ // SYS #0, C7, C9, #1
+ SYS_ALIAS(0, 7, 9, 1);
+ } else {
+ return TokError("AT S1E1WP requires ARMv8.2a");
+ }
} else {
return TokError("invalid operand for AT instruction");
}
@@ -2644,7 +2765,7 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
bool Valid;
auto Mapper = AArch64DB::DBarrierMapper();
StringRef Name =
- Mapper.toString(MCE->getValue(), STI.getFeatureBits(), Valid);
+ Mapper.toString(MCE->getValue(), getSTI().getFeatureBits(), Valid);
Operands.push_back( AArch64Operand::CreateBarrier(MCE->getValue(), Name,
ExprLoc, getContext()));
return MatchOperand_Success;
@@ -2658,7 +2779,7 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
bool Valid;
auto Mapper = AArch64DB::DBarrierMapper();
unsigned Opt =
- Mapper.fromString(Tok.getString(), STI.getFeatureBits(), Valid);
+ Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid);
if (!Valid) {
TokError("invalid barrier option name");
return MatchOperand_ParseFail;
@@ -2687,20 +2808,21 @@ AArch64AsmParser::tryParseSysReg(OperandVector &Operands) {
bool IsKnown;
auto MRSMapper = AArch64SysReg::MRSMapper();
- uint32_t MRSReg = MRSMapper.fromString(Tok.getString(), STI.getFeatureBits(),
- IsKnown);
+ uint32_t MRSReg = MRSMapper.fromString(Tok.getString(),
+ getSTI().getFeatureBits(), IsKnown);
assert(IsKnown == (MRSReg != -1U) &&
"register should be -1 if and only if it's unknown");
auto MSRMapper = AArch64SysReg::MSRMapper();
- uint32_t MSRReg = MSRMapper.fromString(Tok.getString(), STI.getFeatureBits(),
- IsKnown);
+ uint32_t MSRReg = MSRMapper.fromString(Tok.getString(),
+ getSTI().getFeatureBits(), IsKnown);
assert(IsKnown == (MSRReg != -1U) &&
"register should be -1 if and only if it's unknown");
auto PStateMapper = AArch64PState::PStateMapper();
uint32_t PStateField =
- PStateMapper.fromString(Tok.getString(), STI.getFeatureBits(), IsKnown);
+ PStateMapper.fromString(Tok.getString(),
+ getSTI().getFeatureBits(), IsKnown);
assert(IsKnown == (PStateField != -1U) &&
"register should be -1 if and only if it's unknown");
@@ -3151,7 +3273,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
if (Operands.size() < 2 ||
!static_cast<AArch64Operand &>(*Operands[1]).isReg())
- return true;
+ return Error(Loc, "Only valid when first operand is register");
bool IsXReg =
AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
@@ -3183,7 +3305,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
}
// If it is a label or an imm that cannot fit in a movz, put it into CP.
const MCExpr *CPLoc =
- getTargetStreamer().addConstantPoolEntry(SubExprVal, IsXReg ? 8 : 4);
+ getTargetStreamer().addConstantPoolEntry(SubExprVal, IsXReg ? 8 : 4, Loc);
Operands.push_back(AArch64Operand::CreateImm(CPLoc, S, E, Ctx));
return false;
}
@@ -3601,6 +3723,8 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) {
return Error(Loc, "index must be a multiple of 8 in range [0, 32760].");
case Match_InvalidMemoryIndexed16:
return Error(Loc, "index must be a multiple of 16 in range [0, 65520].");
+ case Match_InvalidImm0_1:
+ return Error(Loc, "immediate must be an integer in range [0, 1].");
case Match_InvalidImm0_7:
return Error(Loc, "immediate must be an integer in range [0, 7].");
case Match_InvalidImm0_15:
@@ -3912,7 +4036,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
AArch64Operand &ImmOp = static_cast<AArch64Operand &>(*Operands[2]);
if (RegOp.isReg() && ImmOp.isFPImm() && ImmOp.getFPImm() == (unsigned)-1) {
unsigned zreg =
- AArch64MCRegisterClasses[AArch64::FPR32RegClassID].contains(
+ !AArch64MCRegisterClasses[AArch64::FPR64RegClassID].contains(
RegOp.getReg())
? AArch64::WZR
: AArch64::XZR;
@@ -3929,10 +4053,27 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
// If that fails, try against the alternate table containing long-form NEON:
// "fadd v0.2s, v1.2s, v2.2s"
- if (MatchResult != Match_Success)
+ if (MatchResult != Match_Success) {
+ // But first, save the short-form match result: we can use it in case the
+ // long-form match also fails.
+ auto ShortFormNEONErrorInfo = ErrorInfo;
+ auto ShortFormNEONMatchResult = MatchResult;
+
MatchResult =
MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 0);
+    // If both matches failed, and the long-form match failed on the mnemonic
+    // suffix token operand, the short-form match failure is probably more
+    // relevant: use it instead.
+ if (MatchResult == Match_InvalidOperand && ErrorInfo == 1 &&
+ Operands.size() > 1 && ((AArch64Operand &)*Operands[1]).isToken() &&
+ ((AArch64Operand &)*Operands[1]).isTokenSuffix()) {
+ MatchResult = ShortFormNEONMatchResult;
+ ErrorInfo = ShortFormNEONErrorInfo;
+ }
+ }
+
switch (MatchResult) {
case Match_Success: {
// Perform range checking and other semantic validations
@@ -3944,7 +4085,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return true;
Inst.setLoc(IDLoc);
- Out.EmitInstruction(Inst, STI);
+ Out.EmitInstruction(Inst, getSTI());
return false;
}
case Match_MissingFeature: {
@@ -3966,6 +4107,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return showMatchError(IDLoc, MatchResult);
case Match_InvalidOperand: {
SMLoc ErrorLoc = IDLoc;
+
if (ErrorInfo != ~0ULL) {
if (ErrorInfo >= Operands.size())
return Error(IDLoc, "too few operands for instruction");
@@ -4011,6 +4153,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_InvalidMemoryIndexed8SImm7:
case Match_InvalidMemoryIndexed16SImm7:
case Match_InvalidMemoryIndexedSImm9:
+ case Match_InvalidImm0_1:
case Match_InvalidImm0_7:
case Match_InvalidImm0_15:
case Match_InvalidImm0_31:
@@ -4083,7 +4226,7 @@ bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
if (getParser().parseExpression(Value))
return true;
- getParser().getStreamer().EmitValue(Value, Size);
+ getParser().getStreamer().EmitValue(Value, Size, L);
if (getLexer().is(AsmToken::EndOfStatement))
break;
@@ -4155,7 +4298,7 @@ bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) {
Inst.setOpcode(AArch64::TLSDESCCALL);
Inst.addOperand(MCOperand::createExpr(Expr));
- getParser().getStreamer().EmitInstruction(Inst, STI);
+ getParser().getStreamer().EmitInstruction(Inst, getSTI());
return false;
}
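The two-table matching change above follows a general pattern: try the short-form table first, fall back to the long-form NEON table, and keep whichever diagnostic is more specific. Below is a minimal standalone C++ sketch of that pattern; it is not the LLVM API, and the stand-in matcher functions and the "operand 1 is the suffix token" convention are assumptions made only for illustration.

#include <cstddef>
#include <iostream>
#include <string>

enum MatchResult { Success, InvalidOperand };

struct MatchOutcome {
  MatchResult Result;
  std::size_t ErrorOperand; // index of the offending operand, if any
};

// Stand-ins for MatchInstructionImpl run against the two variant tables.
static MatchOutcome matchShortForm(const std::string &) { return {InvalidOperand, 2}; }
static MatchOutcome matchLongForm(const std::string &)  { return {InvalidOperand, 1}; }

static MatchOutcome matchWithFallback(const std::string &Asm) {
  MatchOutcome Short = matchShortForm(Asm);
  if (Short.Result == Success)
    return Short;

  MatchOutcome Long = matchLongForm(Asm);
  // If the long form only tripped over the mnemonic suffix token (operand 1
  // in this sketch), the short-form diagnostic is the more useful one.
  if (Long.Result == InvalidOperand && Long.ErrorOperand == 1)
    return Short;
  return Long;
}

int main() {
  MatchOutcome R = matchWithFallback("fadd s0, s1, s2");
  std::cout << "error on operand " << R.ErrorOperand << "\n"; // prints 2
  return 0;
}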
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index db9fb0e775df..f1f968e73123 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -1516,6 +1516,10 @@ static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst,
uint64_t pstate_field = (op1 << 3) | op2;
+ if ((pstate_field == AArch64PState::PAN ||
+ pstate_field == AArch64PState::UAO) && crm > 1)
+ return Fail;
+
Inst.addOperand(MCOperand::createImm(pstate_field));
Inst.addOperand(MCOperand::createImm(crm));
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index 7f56c2cf6bb8..d8a810824370 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -19,6 +19,7 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -55,7 +56,7 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
unsigned Opcode = MI->getOpcode();
if (Opcode == AArch64::SYSxt)
- if (printSysAlias(MI, O)) {
+ if (printSysAlias(MI, STI, O)) {
printAnnotation(O, Annot);
return;
}
@@ -269,7 +270,7 @@ struct LdStNInstrDesc {
int NaturalOffset;
};
-static LdStNInstrDesc LdStNInstInfo[] = {
+static const LdStNInstrDesc LdStNInstInfo[] = {
{ AArch64::LD1i8, "ld1", ".b", 1, true, 0 },
{ AArch64::LD1i16, "ld1", ".h", 1, true, 0 },
{ AArch64::LD1i32, "ld1", ".s", 1, true, 0 },
@@ -612,7 +613,7 @@ static LdStNInstrDesc LdStNInstInfo[] = {
{ AArch64::ST4Fourv2s_POST, "st4", ".2s", 1, false, 32 },
};
-static LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) {
+static const LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) {
unsigned Idx;
for (Idx = 0; Idx != array_lengthof(LdStNInstInfo); ++Idx)
if (LdStNInstInfo[Idx].Opcode == Opcode)
@@ -641,7 +642,7 @@ void AArch64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
return;
}
- if (LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) {
+ if (const LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) {
O << "\t" << LdStDesc->Mnemonic << LdStDesc->Layout << '\t';
// Now onto the operands: first a vector list with possible lane
@@ -674,7 +675,9 @@ void AArch64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
AArch64InstPrinter::printInst(MI, O, Annot, STI);
}
-bool AArch64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) {
+bool AArch64InstPrinter::printSysAlias(const MCInst *MI,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
#ifndef NDEBUG
unsigned Opcode = MI->getOpcode();
assert(Opcode == AArch64::SYSxt && "Invalid opcode for SYS alias!");
@@ -729,6 +732,11 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) {
if (Op1Val == 3 && Op2Val == 1)
Asm = "dc\tcvau";
break;
+ case 12:
+ if (Op1Val == 3 && Op2Val == 1 &&
+ (STI.getFeatureBits()[AArch64::HasV8_2aOps]))
+ Asm = "dc\tcvap";
+ break;
case 14:
if (Op1Val == 3 && Op2Val == 1)
Asm = "dc\tcivac";
@@ -773,6 +781,21 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) {
break;
}
break;
+ case 9:
+ switch (Op1Val) {
+ default:
+ break;
+ case 0:
+ if (STI.getFeatureBits()[AArch64::HasV8_2aOps]) {
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "at\ts1e1rp"; break;
+ case 1: Asm = "at\ts1e1wp"; break;
+ }
+ }
+ break;
+ }
}
} else if (CnVal == 8) {
// TLBI aliases
@@ -1122,6 +1145,19 @@ void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum,
O << '#' << prfop;
}
+void AArch64InstPrinter::printPSBHintOp(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned psbhintop = MI->getOperand(OpNum).getImm();
+ bool Valid;
+ StringRef Name =
+ AArch64PSBHint::PSBHintMapper().toString(psbhintop, STI.getFeatureBits(), Valid);
+ if (Valid)
+ O << Name;
+ else
+ O << '#' << psbhintop;
+}
+
void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
index 15dee978e229..ea68d9848b42 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -15,14 +15,10 @@
#define LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H
#include "MCTargetDesc/AArch64MCTargetDesc.h"
-#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCInstPrinter.h"
-#include "llvm/MC/MCSubtargetInfo.h"
namespace llvm {
-class MCOperand;
-
class AArch64InstPrinter : public MCInstPrinter {
public:
AArch64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
@@ -48,7 +44,8 @@ public:
unsigned AltIdx = AArch64::NoRegAltName);
protected:
- bool printSysAlias(const MCInst *MI, raw_ostream &O);
+ bool printSysAlias(const MCInst *MI, const MCSubtargetInfo &STI,
+ raw_ostream &O);
// Operand printers
void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
@@ -122,6 +119,9 @@ protected:
void printPrefetchOp(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printPSBHintOp(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
void printFPImmOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index ed24343a6f2a..648b1dfc8c5e 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -364,6 +364,32 @@ static inline float getFPImmFloat(unsigned Imm) {
return FPUnion.F;
}
+/// getFP16Imm - Return an 8-bit floating-point version of the 16-bit
+/// floating-point value. If the value cannot be represented as an 8-bit
+/// floating-point value, then return -1.
+static inline int getFP16Imm(const APInt &Imm) {
+ uint32_t Sign = Imm.lshr(15).getZExtValue() & 1;
+ int32_t Exp = (Imm.lshr(10).getSExtValue() & 0x1f) - 15; // -14 to 15
+ int32_t Mantissa = Imm.getZExtValue() & 0x3ff; // 10 bits
+
+ // We can handle 4 bits of mantissa.
+ // mantissa = (16+UInt(e:f:g:h))/16.
+ if (Mantissa & 0x3f)
+ return -1;
+ Mantissa >>= 6;
+
+ // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+ if (Exp < -3 || Exp > 4)
+ return -1;
+ Exp = ((Exp+3) & 0x7) ^ 4;
+
+ return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+}
+
+static inline int getFP16Imm(const APFloat &FPImm) {
+ return getFP16Imm(FPImm.bitcastToAPInt());
+}
+
/// getFP32Imm - Return an 8-bit floating-point version of the 32-bit
/// floating-point value. If the value cannot be represented as an 8-bit
/// floating-point value, then return -1.
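The getFP16Imm helpers above reuse the existing 8-bit ARM floating-point immediate layout (1 sign bit, 3 exponent bits, 4 fraction bits). The following is a minimal self-contained sketch of the same bit manipulation operating on raw IEEE half-precision bits rather than an APInt, with a worked value; the helper name and the use of uint16_t are illustrative assumptions, not part of the patch.

#include <cassert>
#include <cstdint>

static int encodeFP16Imm(uint16_t Bits) {
  uint32_t Sign = (Bits >> 15) & 1;
  int32_t Exp = ((Bits >> 10) & 0x1f) - 15; // unbiased exponent
  int32_t Mantissa = Bits & 0x3ff;          // 10-bit fraction

  // Only the top 4 fraction bits can be represented.
  if (Mantissa & 0x3f)
    return -1;
  Mantissa >>= 6;

  // Only 3 exponent bits are available: exp must lie in [-3, 4].
  if (Exp < -3 || Exp > 4)
    return -1;
  Exp = ((Exp + 3) & 0x7) ^ 4;

  return (static_cast<int>(Sign) << 7) | (Exp << 4) | Mantissa;
}

int main() {
  // 1.0 in IEEE half is 0x3C00 and encodes as immediate 0x70; a value such
  // as 0x3555 (approximately 1/3) has low fraction bits set and is rejected.
  assert(encodeFP16Imm(0x3C00) == 0x70);
  assert(encodeFP16Imm(0x3555) == -1);
  return 0;
}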
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 16d53569b231..d26604f5765d 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -128,10 +128,9 @@ public:
/// This is one of the functions used to emit data into an ELF section, so the
/// AArch64 streamer overrides it to add the appropriate mapping symbol ($d)
/// if necessary.
- void EmitValueImpl(const MCExpr *Value, unsigned Size,
- const SMLoc &Loc) override {
+ void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override {
EmitDataMappingSymbol();
- MCELFStreamer::EmitValueImpl(Value, Size);
+ MCELFStreamer::EmitValueImpl(Value, Size, Loc);
}
private:
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 921c4b94a729..fbce26e1d9a1 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -48,10 +48,6 @@ AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() {
UseDataRegionDirectives = true;
ExceptionsType = ExceptionHandling::DwarfCFI;
-
- // AArch64 Darwin doesn't have the baggage of X86/ARM, so it's fine to use
- // LShr instead of AShr.
- UseLogicalShr = true;
}
const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol(
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index 28703419514a..a540f49866a9 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -85,13 +85,13 @@ void AArch64MCExpr::visitUsedExpr(MCStreamer &Streamer) const {
Streamer.visitUsedExpr(*getSubExpr());
}
-MCSection *AArch64MCExpr::findAssociatedSection() const {
+MCFragment *AArch64MCExpr::findAssociatedFragment() const {
llvm_unreachable("FIXME: what goes here?");
}
bool AArch64MCExpr::evaluateAsRelocatableImpl(MCValue &Res,
- const MCAsmLayout *Layout,
- const MCFixup *Fixup) const {
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
if (!getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup))
return false;
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index 1165314e4105..db36a65564ce 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -149,11 +149,10 @@ public:
void visitUsedExpr(MCStreamer &Streamer) const override;
- MCSection *findAssociatedSection() const override;
+ MCFragment *findAssociatedFragment() const override;
- bool evaluateAsRelocatableImpl(MCValue &Res,
- const MCAsmLayout *Layout,
- const MCFixup *Fixup) const override;
+ bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const override;
void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
@@ -162,7 +161,6 @@ public:
}
static bool classof(const AArch64MCExpr *) { return true; }
-
};
} // end namespace llvm
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index 741b273073e4..61c96f1d93c1 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -90,9 +90,11 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
Log2Size = llvm::Log2_32(4);
// This encompasses the relocation for the whole 21-bit value.
switch (Sym->getKind()) {
- default:
- Asm.getContext().reportFatalError(Fixup.getLoc(),
- "ADR/ADRP relocations must be GOT relative");
+ default: {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "ADR/ADRP relocations must be GOT relative");
+ return false;
+ }
case MCSymbolRefExpr::VK_PAGE:
RelocType = unsigned(MachO::ARM64_RELOC_PAGE21);
return true;
@@ -170,25 +172,25 @@ void AArch64MachObjectWriter::recordRelocation(
// assembler local symbols. If we got here, that's not what we have,
// so complain loudly.
if (Kind == AArch64::fixup_aarch64_pcrel_branch19) {
- Asm.getContext().reportFatalError(Fixup.getLoc(),
- "conditional branch requires assembler-local"
- " label. '" +
- Target.getSymA()->getSymbol().getName() +
- "' is external.");
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "conditional branch requires assembler-local"
+ " label. '" +
+ Target.getSymA()->getSymbol().getName() +
+ "' is external.");
return;
}
// 14-bit branch relocations should only target internal labels, and so
// should never get here.
if (Kind == AArch64::fixup_aarch64_pcrel_branch14) {
- Asm.getContext().reportFatalError(Fixup.getLoc(),
- "Invalid relocation on conditional branch!");
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "Invalid relocation on conditional branch!");
return;
}
if (!getAArch64FixupKindMachOInfo(Fixup, Type, Target.getSymA(), Log2Size,
- Asm)) {
- Asm.getContext().reportFatalError(Fixup.getLoc(), "unknown AArch64 fixup kind!");
+ Asm)) {
+ Asm.getContext().reportError(Fixup.getLoc(), "unknown AArch64 fixup kind!");
return;
}
@@ -200,8 +202,9 @@ void AArch64MachObjectWriter::recordRelocation(
Type = MachO::ARM64_RELOC_UNSIGNED;
if (IsPCRel) {
- Asm.getContext().reportFatalError(Fixup.getLoc(),
- "PC relative absolute relocation!");
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "PC relative absolute relocation!");
+ return;
// FIXME: x86_64 sets the type to a branch reloc here. Should we do
// something similar?
@@ -229,16 +232,20 @@ void AArch64MachObjectWriter::recordRelocation(
Writer->addRelocation(A_Base, Fragment->getParent(), MRE);
return;
} else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
- Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None)
+ Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) {
// Otherwise, neither symbol can be modified.
- Asm.getContext().reportFatalError(Fixup.getLoc(),
- "unsupported relocation of modified symbol");
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "unsupported relocation of modified symbol");
+ return;
+ }
// We don't support PCrel relocations of differences.
- if (IsPCRel)
- Asm.getContext().reportFatalError(Fixup.getLoc(),
- "unsupported pc-relative relocation of "
- "difference");
+ if (IsPCRel) {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "unsupported pc-relative relocation of "
+ "difference");
+ return;
+ }
// AArch64 always uses external relocations. If there is no symbol to use as
// a base address (a local symbol with no preceding non-local symbol),
@@ -246,20 +253,26 @@ void AArch64MachObjectWriter::recordRelocation(
//
// FIXME: We should probably just synthesize an external symbol and use
// that.
- if (!A_Base)
- Asm.getContext().reportFatalError(
+ if (!A_Base) {
+ Asm.getContext().reportError(
Fixup.getLoc(),
"unsupported relocation of local symbol '" + A->getName() +
"'. Must have non-local symbol earlier in section.");
- if (!B_Base)
- Asm.getContext().reportFatalError(
+ return;
+ }
+ if (!B_Base) {
+ Asm.getContext().reportError(
Fixup.getLoc(),
"unsupported relocation of local symbol '" + B->getName() +
"'. Must have non-local symbol earlier in section.");
+ return;
+ }
- if (A_Base == B_Base && A_Base)
- Asm.getContext().reportFatalError(Fixup.getLoc(),
- "unsupported relocation with identical base");
+ if (A_Base == B_Base && A_Base) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "unsupported relocation with identical base");
+ return;
+ }
Value += (!A->getFragment() ? 0 : Writer->getSymbolAddress(*A, Layout)) -
(!A_Base || !A_Base->getFragment() ? 0 : Writer->getSymbolAddress(
@@ -309,10 +322,12 @@ void AArch64MachObjectWriter::recordRelocation(
// we need to preserve and merge with the new Target? How about
// the FixedValue?
if (!Symbol->getVariableValue()->evaluateAsRelocatable(Target, &Layout,
- &Fixup))
- Asm.getContext().reportFatalError(Fixup.getLoc(),
- "unable to resolve variable '" +
- Symbol->getName() + "'");
+ &Fixup)) {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "unable to resolve variable '" +
+ Symbol->getName() + "'");
+ return;
+ }
return recordRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
FixedValue);
}
@@ -337,11 +352,13 @@ void AArch64MachObjectWriter::recordRelocation(
Value +=
Layout.getSymbolOffset(*Symbol) - Layout.getSymbolOffset(*Base);
} else if (Symbol->isInSection()) {
- if (!CanUseLocalRelocation)
- Asm.getContext().reportFatalError(
+ if (!CanUseLocalRelocation) {
+ Asm.getContext().reportError(
Fixup.getLoc(),
"unsupported relocation of local symbol '" + Symbol->getName() +
"'. Must have non-local symbol earlier in section.");
+ return;
+ }
// Adjust the relocation to be section-relative.
// The index is the section ordinal (1-based).
const MCSection &Sec = Symbol->getSection();
@@ -361,9 +378,10 @@ void AArch64MachObjectWriter::recordRelocation(
return;
}
}
- Asm.getContext().reportFatalError(Fixup.getLoc(),
+ Asm.getContext().reportError(Fixup.getLoc(),
"unsupported relocation of variable '" +
Symbol->getName() + "'");
+ return;
}
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
index 52b000d15b8d..3e86a42d5be6 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
@@ -26,8 +26,9 @@ AArch64TargetStreamer::~AArch64TargetStreamer() {}
// The constant pool handling is shared by all AArch64TargetStreamer
// implementations.
const MCExpr *AArch64TargetStreamer::addConstantPoolEntry(const MCExpr *Expr,
- unsigned Size) {
- return ConstantPools->addEntry(Streamer, Expr, Size);
+ unsigned Size,
+ SMLoc Loc) {
+ return ConstantPools->addEntry(Streamer, Expr, Size, Loc);
}
void AArch64TargetStreamer::emitCurrentConstantPool() {
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
index fcc0d053f6e2..51432830f795 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
@@ -24,7 +24,7 @@ public:
/// Callback used to implement the ldr= pseudo.
/// Add a new entry to the constant pool for the current section and return an
/// MCExpr that can be used to refer to the constant pool location.
- const MCExpr *addConstantPoolEntry(const MCExpr *, unsigned Size);
+ const MCExpr *addConstantPoolEntry(const MCExpr *, unsigned Size, SMLoc Loc);
  /// Callback used to implement the .ltorg directive.
/// Emit contents of constant pool for the current section.
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index ee85b65bf39a..78f5289ec26d 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -146,11 +146,22 @@ const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStateMappings
// v8.1a "Privileged Access Never" extension-specific PStates
{"pan", PAN, {AArch64::HasV8_1aOps}},
+
+ // v8.2a
+ {"uao", UAO, {AArch64::HasV8_2aOps}},
};
AArch64PState::PStateMapper::PStateMapper()
: AArch64NamedImmMapper(PStateMappings, 0) {}
+const AArch64NamedImmMapper::Mapping AArch64PSBHint::PSBHintMapper::PSBHintMappings[] = {
+ // v8.2a "Statistical Profiling" extension-specific PSB operand
+ {"csync", CSync, {AArch64::FeatureSPE}},
+};
+
+AArch64PSBHint::PSBHintMapper::PSBHintMapper()
+ : AArch64NamedImmMapper(PSBHintMappings, 0) {}
+
const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSMappings[] = {
{"mdccsr_el0", MDCCSR_EL0, {}},
{"dbgdtrrx_el0", DBGDTRRX_EL0, {}},
@@ -192,6 +203,7 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSMappings[] = {
{"id_aa64isar1_el1", ID_A64ISAR1_EL1, {}},
{"id_aa64mmfr0_el1", ID_A64MMFR0_EL1, {}},
{"id_aa64mmfr1_el1", ID_A64MMFR1_EL1, {}},
+ {"id_aa64mmfr2_el1", ID_A64MMFR2_EL1, {AArch64::HasV8_2aOps}},
{"mvfr0_el1", MVFR0_EL1, {}},
{"mvfr1_el1", MVFR1_EL1, {}},
{"mvfr2_el1", MVFR2_EL1, {}},
@@ -275,9 +287,6 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::MSRMapper::MSRMappings[] = {
{"icc_sgi1r_el1", ICC_SGI1R_EL1, {}},
{"icc_asgi1r_el1", ICC_ASGI1R_EL1, {}},
{"icc_sgi0r_el1", ICC_SGI0R_EL1, {}},
-
- // v8.1a "Privileged Access Never" extension-specific system registers
- {"pan", PAN, {AArch64::HasV8_1aOps}},
};
AArch64SysReg::MSRMapper::MSRMapper() {
@@ -804,6 +813,24 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegMappings
{"cntv_cval_el02", CNTV_CVAL_EL02, {AArch64::HasV8_1aOps}},
{"spsr_el12", SPSR_EL12, {AArch64::HasV8_1aOps}},
{"elr_el12", ELR_EL12, {AArch64::HasV8_1aOps}},
+
+ // v8.2a registers
+ {"uao", UAO, {AArch64::HasV8_2aOps}},
+
+ // v8.2a "Statistical Profiling extension" registers
+ {"pmblimitr_el1", PMBLIMITR_EL1, {AArch64::FeatureSPE}},
+ {"pmbptr_el1", PMBPTR_EL1, {AArch64::FeatureSPE}},
+ {"pmbsr_el1", PMBSR_EL1, {AArch64::FeatureSPE}},
+ {"pmbidr_el1", PMBIDR_EL1, {AArch64::FeatureSPE}},
+ {"pmscr_el2", PMSCR_EL2, {AArch64::FeatureSPE}},
+ {"pmscr_el12", PMSCR_EL12, {AArch64::FeatureSPE}},
+ {"pmscr_el1", PMSCR_EL1, {AArch64::FeatureSPE}},
+ {"pmsicr_el1", PMSICR_EL1, {AArch64::FeatureSPE}},
+ {"pmsirr_el1", PMSIRR_EL1, {AArch64::FeatureSPE}},
+ {"pmsfcr_el1", PMSFCR_EL1, {AArch64::FeatureSPE}},
+ {"pmsevfr_el1", PMSEVFR_EL1, {AArch64::FeatureSPE}},
+ {"pmslatfr_el1", PMSLATFR_EL1, {AArch64::FeatureSPE}},
+ {"pmsidr_el1", PMSIDR_EL1, {AArch64::FeatureSPE}},
};
uint32_t
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 7e42f8e3601e..f649cb9b8a8d 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -337,7 +337,9 @@ namespace AArch64AT {
S12E1R = 0x63c4, // 01 100 0111 1000 100
S12E1W = 0x63c5, // 01 100 0111 1000 101
S12E0R = 0x63c6, // 01 100 0111 1000 110
- S12E0W = 0x63c7 // 01 100 0111 1000 111
+ S12E0W = 0x63c7, // 01 100 0111 1000 111
+ S1E1RP = 0x43c8, // 01 000 0111 1001 000
+ S1E1WP = 0x43c9 // 01 000 0111 1001 001
};
struct ATMapper : AArch64NamedImmMapper {
@@ -463,6 +465,9 @@ namespace AArch64PState {
// v8.1a "Privileged Access Never" extension-specific PStates
PAN = 0x04,
+
+ // v8.2a "User Access Override" extension-specific PStates
+ UAO = 0x03
};
struct PStateMapper : AArch64NamedImmMapper {
@@ -473,6 +478,21 @@ namespace AArch64PState {
}
+namespace AArch64PSBHint {
+ enum PSBHintValues {
+ Invalid = -1,
+ // v8.2a "Statistical Profiling" extension-specific PSB operands
+ CSync = 0x11, // psb csync = hint #0x11
+ };
+
+ struct PSBHintMapper : AArch64NamedImmMapper {
+ const static Mapping PSBHintMappings[];
+
+ PSBHintMapper();
+ };
+
+}
+
namespace AArch64SE {
enum ShiftExtSpecifiers {
Invalid = -1,
@@ -594,6 +614,7 @@ namespace AArch64SysReg {
ID_A64ISAR1_EL1 = 0xc031, // 11 000 0000 0110 001
ID_A64MMFR0_EL1 = 0xc038, // 11 000 0000 0111 000
ID_A64MMFR1_EL1 = 0xc039, // 11 000 0000 0111 001
+ ID_A64MMFR2_EL1 = 0xc03a, // 11 000 0000 0111 010
MVFR0_EL1 = 0xc018, // 11 000 0000 0011 000
MVFR1_EL1 = 0xc019, // 11 000 0000 0011 001
MVFR2_EL1 = 0xc01a, // 11 000 0000 0011 010
@@ -1190,6 +1211,24 @@ namespace AArch64SysReg {
SPSR_EL12 = 0xea00, // 11 101 0100 0000 000
ELR_EL12 = 0xea01, // 11 101 0100 0000 001
+ // v8.2a registers
+ UAO = 0xc214, // 11 000 0100 0010 100
+
+ // v8.2a "Statistical Profiling extension" registers
+ PMBLIMITR_EL1 = 0xc4d0, // 11 000 1001 1010 000
+ PMBPTR_EL1 = 0xc4d1, // 11 000 1001 1010 001
+ PMBSR_EL1 = 0xc4d3, // 11 000 1001 1010 011
+ PMBIDR_EL1 = 0xc4d7, // 11 000 1001 1010 111
+ PMSCR_EL2 = 0xe4c8, // 11 100 1001 1001 000
+ PMSCR_EL12 = 0xecc8, // 11 101 1001 1001 000
+ PMSCR_EL1 = 0xc4c8, // 11 000 1001 1001 000
+ PMSICR_EL1 = 0xc4ca, // 11 000 1001 1001 010
+ PMSIRR_EL1 = 0xc4cb, // 11 000 1001 1001 011
+ PMSFCR_EL1 = 0xc4cc, // 11 000 1001 1001 100
+ PMSEVFR_EL1 = 0xc4cd, // 11 000 1001 1001 101
+ PMSLATFR_EL1 = 0xc4ce, // 11 000 1001 1001 110
+ PMSIDR_EL1 = 0xc4cf, // 11 000 1001 1001 111
+
// Cyclone specific system registers
CPM_IOACC_CTL_EL3 = 0xff90,
};
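The new PSBHintMapper declared above follows the same AArch64NamedImmMapper protocol as the prefetch, barrier and PState mappers: fromString() resolves a name to its immediate subject to the subtarget's feature bits, and toString() does the reverse. A small sketch of the expected call shape follows; the wrapper function is hypothetical and assumes the AArch64 target include paths and that the caller already holds the subtarget's FeatureBitset.

#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/SubtargetFeature.h"

// Hypothetical helper mirroring the asm-parser call site: "csync" resolves
// to AArch64PSBHint::CSync (0x11) only when AArch64::FeatureSPE is enabled;
// otherwise Valid is set to false.
static unsigned lookupPSBHint(llvm::StringRef Name,
                              const llvm::FeatureBitset &Features,
                              bool &Valid) {
  auto Mapper = llvm::AArch64PSBHint::PSBHintMapper();
  return Mapper.fromString(Name, Features, Valid);
}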
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 0a05d25189b0..8c3cb567fc7e 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -44,15 +44,21 @@ FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
FunctionPass *createSIFixControlFlowLiveIntervalsPass();
-FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm);
+FunctionPass *createSIFixSGPRCopiesPass();
FunctionPass *createSIFixSGPRLiveRangesPass();
FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
FunctionPass *createSIInsertWaits(TargetMachine &tm);
-FunctionPass *createSIPrepareScratchRegs();
+
+ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
+void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
+extern char &AMDGPUAnnotateKernelFeaturesID;
void initializeSIFoldOperandsPass(PassRegistry &);
extern char &SIFoldOperandsID;
+void initializeSIFixSGPRCopiesPass(PassRegistry &);
+extern char &SIFixSGPRCopiesID;
+
void initializeSILowerI1CopiesPass(PassRegistry &);
extern char &SILowerI1CopiesID;
@@ -64,6 +70,8 @@ FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST);
Pass *createAMDGPUStructurizeCFGPass();
FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
ModulePass *createAMDGPUAlwaysInlinePass();
+ModulePass *createAMDGPUOpenCLImageTypeLoweringPass();
+FunctionPass *createAMDGPUAnnotateUniformValues();
void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&);
extern char &SIFixControlFlowLiveIntervalsID;
@@ -71,6 +79,8 @@ extern char &SIFixControlFlowLiveIntervalsID;
void initializeSIFixSGPRLiveRangesPass(PassRegistry&);
extern char &SIFixSGPRLiveRangesID;
+void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&);
+extern char &AMDGPUAnnotateUniformValuesPassID;
extern Target TheAMDGPUTarget;
extern Target TheGCNTarget;
@@ -85,8 +95,6 @@ enum TargetIndex {
};
}
-#define END_OF_TEXT_LABEL_NAME "EndOfTextLabel"
-
} // End namespace llvm
namespace ShaderType {
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 68b50504ee44..d4af8d2e48d1 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -108,6 +108,11 @@ def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature <"unsafe-ds-offset-fol
"true",
"Force using DS instruction immediate offsets on SI">;
+def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global",
+ "FlatForGlobal",
+ "true",
+  "Force generation of flat instructions for global memory accesses">;
+
def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
"FlatAddressSpace",
"true",
@@ -272,9 +277,14 @@ def isSICI : Predicate<
"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS"
>, AssemblerPredicate<"FeatureGCN1Encoding">;
+def isVI : Predicate <
+ "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
+ AssemblerPredicate<"FeatureGCN3Encoding">;
+
class PredicateControl {
Predicate SubtargetPredicate;
Predicate SIAssemblerPredicate = isSICI;
+ Predicate VIAssemblerPredicate = isVI;
list<Predicate> AssemblerPredicates = [];
Predicate AssemblerPredicate = TruePredicate;
list<Predicate> OtherPredicates = [];
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
new file mode 100644
index 000000000000..378183927242
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -0,0 +1,126 @@
+//===-- AMDGPUAnnotateKernelFeatures.cpp ----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass adds target attributes to functions that use intrinsics
+/// affecting calling convention lowering.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+
+#define DEBUG_TYPE "amdgpu-annotate-kernel-features"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUAnnotateKernelFeatures : public ModulePass {
+private:
+ void addAttrToCallers(Function *Intrin, StringRef AttrName);
+ bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>);
+
+public:
+ static char ID;
+
+ AMDGPUAnnotateKernelFeatures() : ModulePass(ID) { }
+ bool runOnModule(Module &M) override;
+ const char *getPassName() const override {
+ return "AMDGPU Annotate Kernel Features";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ ModulePass::getAnalysisUsage(AU);
+ }
+};
+
+}
+
+char AMDGPUAnnotateKernelFeatures::ID = 0;
+
+char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
+
+
+INITIALIZE_PASS_BEGIN(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
+ "Add AMDGPU function attributes", false, false)
+INITIALIZE_PASS_END(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
+ "Add AMDGPU function attributes", false, false)
+
+
+void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin,
+ StringRef AttrName) {
+ SmallPtrSet<Function *, 4> SeenFuncs;
+
+ for (User *U : Intrin->users()) {
+ // CallInst is the only valid user for an intrinsic.
+ CallInst *CI = cast<CallInst>(U);
+
+ Function *CallingFunction = CI->getParent()->getParent();
+ if (SeenFuncs.insert(CallingFunction).second)
+ CallingFunction->addFnAttr(AttrName);
+ }
+}
+
+bool AMDGPUAnnotateKernelFeatures::addAttrsForIntrinsics(
+ Module &M,
+ ArrayRef<StringRef[2]> IntrinsicToAttr) {
+ bool Changed = false;
+
+ for (const StringRef *Arr : IntrinsicToAttr) {
+ if (Function *Fn = M.getFunction(Arr[0])) {
+ addAttrToCallers(Fn, Arr[1]);
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
+ Triple TT(M.getTargetTriple());
+
+ static const StringRef IntrinsicToAttr[][2] = {
+ // .x omitted
+ { "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" },
+ { "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" },
+
+ // .x omitted
+ { "llvm.r600.read.tidig.y", "amdgpu-work-item-id-y" },
+ { "llvm.r600.read.tidig.z", "amdgpu-work-item-id-z" }
+
+ };
+
+ static const StringRef HSAIntrinsicToAttr[][2] = {
+ { "llvm.r600.read.local.size.x", "amdgpu-dispatch-ptr" },
+ { "llvm.r600.read.local.size.y", "amdgpu-dispatch-ptr" },
+ { "llvm.r600.read.local.size.z", "amdgpu-dispatch-ptr" },
+
+ { "llvm.r600.read.global.size.x", "amdgpu-dispatch-ptr" },
+ { "llvm.r600.read.global.size.y", "amdgpu-dispatch-ptr" },
+ { "llvm.r600.read.global.size.z", "amdgpu-dispatch-ptr" },
+ { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" }
+ };
+
+ // TODO: Intrinsics that require queue ptr.
+
+ // We do not need to note the x workitem or workgroup id because they are
+ // always initialized.
+
+ bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr);
+ if (TT.getOS() == Triple::AMDHSA)
+ Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr);
+
+ return Changed;
+}
+
+ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
+ return new AMDGPUAnnotateKernelFeatures();
+}
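For reference, the pass is normally scheduled by the AMDGPU target pass pipeline; the standalone legacy-pass-manager form below is only an illustrative sketch of how it could be driven by hand.

#include "AMDGPU.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"

// Run the annotation pass over a module. Afterwards, any function that calls
// e.g. llvm.r600.read.tgid.y carries the "amdgpu-work-group-id-y" attribute.
static void annotateKernelFeatures(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createAMDGPUAnnotateKernelFeaturesPass());
  PM.run(M);
}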
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
new file mode 100644
index 000000000000..dfddc345f286
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -0,0 +1,84 @@
+//===-- AMDGPUAnnotateUniformValues.cpp -----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass adds amdgpu.uniform metadata to IR values so this information
+/// can be used during instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "amdgpu-annotate-uniform"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUAnnotateUniformValues : public FunctionPass,
+ public InstVisitor<AMDGPUAnnotateUniformValues> {
+ DivergenceAnalysis *DA;
+
+public:
+ static char ID;
+ AMDGPUAnnotateUniformValues() :
+ FunctionPass(ID) { }
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+ const char *getPassName() const override { return "AMDGPU Annotate Uniform Values"; }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DivergenceAnalysis>();
+ AU.setPreservesAll();
+ }
+
+ void visitLoadInst(LoadInst &I);
+
+};
+
+} // End anonymous namespace
+
+INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
+ "Add AMDGPU uniform metadata", false, false)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
+ "Add AMDGPU uniform metadata", false, false)
+
+char AMDGPUAnnotateUniformValues::ID = 0;
+
+void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
+ Value *Ptr = I.getPointerOperand();
+ if (!DA->isUniform(Ptr))
+ return;
+
+ if (Instruction *PtrI = dyn_cast<Instruction>(Ptr))
+ PtrI->setMetadata("amdgpu.uniform", MDNode::get(I.getContext(), {}));
+}
+
+bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) {
+ return false;
+}
+
+bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
+ DA = &getAnalysis<DivergenceAnalysis>();
+ visit(F);
+
+ return true;
+}
+
+FunctionPass *
+llvm::createAMDGPUAnnotateUniformValues() {
+ return new AMDGPUAnnotateUniformValues();
+}
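Downstream code can test for the annotation with a plain named-metadata lookup on the pointer-producing instruction. The helper below is an illustrative sketch only; instruction selection consumes the metadata through its own predicates.

#include "llvm/IR/Instructions.h"

// Returns true if the load's address was marked uniform by the pass above.
// Note the pass tags the instruction that produces the pointer, not the load.
static bool isAnnotatedUniform(const llvm::LoadInst &LI) {
  const auto *PtrI =
      llvm::dyn_cast<llvm::Instruction>(LI.getPointerOperand());
  return PtrI && PtrI->getMetadata("amdgpu.uniform") != nullptr;
}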
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 0a5309b16ee5..ba71dc05a8fc 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -100,14 +100,63 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
}
}
-void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
-
- // This label is used to mark the end of the .text section.
- const TargetLoweringObjectFile &TLOF = getObjFileLowering();
- OutStreamer->SwitchSection(TLOF.getTextSection());
- MCSymbol *EndOfTextLabel =
- OutContext.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
- OutStreamer->EmitLabel(EndOfTextLabel);
+void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
+ const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
+ if (MFI->isKernel() && STM.isAmdHsaOS()) {
+ AMDGPUTargetStreamer *TS =
+ static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
+ TS->EmitAMDGPUSymbolType(CurrentFnSym->getName(),
+ ELF::STT_AMDGPU_HSA_KERNEL);
+ }
+
+ AsmPrinter::EmitFunctionEntryLabel();
+}
+
+static bool isModuleLinkage(const GlobalValue *GV) {
+ switch (GV->getLinkage()) {
+ case GlobalValue::InternalLinkage:
+ case GlobalValue::CommonLinkage:
+ return true;
+ case GlobalValue::ExternalLinkage:
+ return false;
+ default: llvm_unreachable("unknown linkage type");
+ }
+}
+
+void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+
+ if (TM.getTargetTriple().getOS() != Triple::AMDHSA) {
+ AsmPrinter::EmitGlobalVariable(GV);
+ return;
+ }
+
+ if (GV->isDeclaration() || GV->getLinkage() == GlobalValue::PrivateLinkage) {
+ AsmPrinter::EmitGlobalVariable(GV);
+ return;
+ }
+
+ // Group segment variables aren't emitted in HSA.
+ if (AMDGPU::isGroupSegment(GV))
+ return;
+
+ AMDGPUTargetStreamer *TS =
+ static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
+ if (isModuleLinkage(GV)) {
+ TS->EmitAMDGPUHsaModuleScopeGlobal(GV->getName());
+ } else {
+ TS->EmitAMDGPUHsaProgramScopeGlobal(GV->getName());
+ }
+
+ const DataLayout &DL = getDataLayout();
+ OutStreamer->PushSection();
+ OutStreamer->SwitchSection(
+ getObjFileLowering().SectionForGlobal(GV, *Mang, TM));
+ MCSymbol *GVSym = getSymbol(GV);
+ const Constant *C = GV->getInitializer();
+ OutStreamer->EmitLabel(GVSym);
+ EmitGlobalConstant(DL, C);
+ OutStreamer->PopSection();
}
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
@@ -125,8 +174,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
SIProgramInfo KernelInfo;
if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ getSIProgramInfo(KernelInfo, MF);
if (!STM.isAmdHsaOS()) {
- getSIProgramInfo(KernelInfo, MF);
EmitProgramInfoSI(MF, KernelInfo);
}
// Emit directives
@@ -165,6 +214,23 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
false);
OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
false);
+
+ OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
+ Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)),
+ false);
+ OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
+ Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)),
+ false);
+ OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
+ Twine(G_00B84C_TGID_Y_EN(KernelInfo.ComputePGMRSrc2)),
+ false);
+ OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
+ Twine(G_00B84C_TGID_Z_EN(KernelInfo.ComputePGMRSrc2)),
+ false);
+ OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
+ Twine(G_00B84C_TIDIG_COMP_CNT(KernelInfo.ComputePGMRSrc2)),
+ false);
+
} else {
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
OutStreamer->emitRawComment(
@@ -278,27 +344,30 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
unsigned width = 0;
bool isSGPR = false;
- if (!MO.isReg()) {
+ if (!MO.isReg())
continue;
- }
+
unsigned reg = MO.getReg();
- if (reg == AMDGPU::VCC || reg == AMDGPU::VCC_LO ||
- reg == AMDGPU::VCC_HI) {
+ switch (reg) {
+ case AMDGPU::EXEC:
+ case AMDGPU::SCC:
+ case AMDGPU::M0:
+ continue;
+
+ case AMDGPU::VCC:
+ case AMDGPU::VCC_LO:
+ case AMDGPU::VCC_HI:
VCCUsed = true;
continue;
- } else if (reg == AMDGPU::FLAT_SCR ||
- reg == AMDGPU::FLAT_SCR_LO ||
- reg == AMDGPU::FLAT_SCR_HI) {
+
+ case AMDGPU::FLAT_SCR:
+ case AMDGPU::FLAT_SCR_LO:
+ case AMDGPU::FLAT_SCR_HI:
FlatUsed = true;
continue;
- }
- switch (reg) {
- default: break;
- case AMDGPU::SCC:
- case AMDGPU::EXEC:
- case AMDGPU::M0:
- continue;
+ default:
+ break;
}
if (AMDGPU::SReg_32RegClass.contains(reg)) {
@@ -348,11 +417,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
}
}
- if (VCCUsed)
+ if (VCCUsed || FlatUsed)
MaxSGPR += 2;
- if (FlatUsed)
+ if (FlatUsed) {
MaxSGPR += 2;
+ // 2 additional for VI+.
+ if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ MaxSGPR += 2;
+ }
// We found the maximum register index. They start at 0, so add one to get the
// number of registers.
@@ -368,6 +441,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
}
+ if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) {
+ LLVMContext &Ctx = MF.getFunction()->getContext();
+ Ctx.emitError("too many user SGPRs used");
+ }
+
ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;
ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8;
// Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
@@ -419,18 +497,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
S_00B848_PRIV(ProgInfo.Priv) |
S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
- S_00B848_IEEE_MODE(ProgInfo.DebugMode) |
+ S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
+ // 0 = X, 1 = XY, 2 = XYZ
+ unsigned TIDIGCompCnt = 0;
+ if (MFI->hasWorkItemIDZ())
+ TIDIGCompCnt = 2;
+ else if (MFI->hasWorkItemIDY())
+ TIDIGCompCnt = 1;
+
ProgInfo.ComputePGMRSrc2 =
S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
- S_00B84C_USER_SGPR(MFI->NumUserSGPRs) |
- S_00B84C_TGID_X_EN(1) |
- S_00B84C_TGID_Y_EN(1) |
- S_00B84C_TGID_Z_EN(1) |
- S_00B84C_TG_SIZE_EN(1) |
- S_00B84C_TIDIG_COMP_CNT(2) |
- S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks);
+ S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
+ S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
+ S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
+ S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
+ S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
+ S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
+ S_00B84C_EXCP_EN_MSB(0) |
+ S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) |
+ S_00B84C_EXCP_EN(0);
}
static unsigned getRsrcReg(unsigned ShaderType) {
@@ -491,14 +578,53 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
header.compute_pgm_resource_registers =
KernelInfo.ComputePGMRSrc1 |
(KernelInfo.ComputePGMRSrc2 << 32);
- header.code_properties =
- AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR |
- AMD_CODE_PROPERTY_IS_PTR64;
+ header.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
+
+ if (MFI->hasPrivateSegmentBuffer()) {
+ header.code_properties |=
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
+ }
+
+ if (MFI->hasDispatchPtr())
+ header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
+
+ if (MFI->hasQueuePtr())
+ header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
+
+ if (MFI->hasKernargSegmentPtr())
+ header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
+
+ if (MFI->hasDispatchID())
+ header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
+
+ if (MFI->hasFlatScratchInit())
+ header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
+
+ // TODO: Private segment size
+
+ if (MFI->hasGridWorkgroupCountX()) {
+ header.code_properties |=
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X;
+ }
+
+ if (MFI->hasGridWorkgroupCountY()) {
+ header.code_properties |=
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y;
+ }
+
+ if (MFI->hasGridWorkgroupCountZ()) {
+ header.code_properties |=
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z;
+ }
+
header.kernarg_segment_byte_size = MFI->ABIArgOffset;
header.wavefront_sgpr_count = KernelInfo.NumSGPR;
header.workitem_vgpr_count = KernelInfo.NumVGPR;
-
+ header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
+ header.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
AMDGPUTargetStreamer *TS =
static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 345af9b85e15..817cbfc0c0eb 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -99,7 +99,9 @@ public:
void EmitFunctionBodyStart() override;
- void EmitEndOfAsmFile(Module &M) override;
+ void EmitFunctionEntryLabel() override;
+
+ void EmitGlobalVariable(const GlobalVariable *GV) override;
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
unsigned AsmVariant, const char *ExtraCode,
diff --git a/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp b/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp
new file mode 100644
index 000000000000..2f6b3022dd6e
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp
@@ -0,0 +1,26 @@
+//===-- AMDGPUDiagnosticInfoUnsupported.cpp -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUDiagnosticInfoUnsupported.h"
+
+using namespace llvm;
+
+DiagnosticInfoUnsupported::DiagnosticInfoUnsupported(
+ const Function &Fn,
+ const Twine &Desc,
+ DiagnosticSeverity Severity)
+ : DiagnosticInfo(getKindID(), Severity),
+ Description(Desc),
+ Fn(Fn) { }
+
+int DiagnosticInfoUnsupported::KindID = 0;
+
+void DiagnosticInfoUnsupported::print(DiagnosticPrinter &DP) const {
+ DP << "unsupported " << getDescription() << " in " << Fn.getName();
+}
diff --git a/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h b/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h
new file mode 100644
index 000000000000..0fd37e1ede6b
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h
@@ -0,0 +1,48 @@
+//===-- AMDGPUDiagnosticInfoUnsupported.h - Error reporting -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUDIAGNOSTICINFOUNSUPPORTED_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUDIAGNOSTICINFOUNSUPPORTED_H
+
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+
+namespace llvm {
+
+/// Diagnostic information for unimplemented or unsupported feature reporting.
+class DiagnosticInfoUnsupported : public DiagnosticInfo {
+private:
+ const Twine &Description;
+ const Function &Fn;
+
+ static int KindID;
+
+ static int getKindID() {
+ if (KindID == 0)
+ KindID = llvm::getNextAvailablePluginDiagnosticKind();
+ return KindID;
+ }
+
+public:
+ DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc,
+ DiagnosticSeverity Severity = DS_Error);
+
+ const Function &getFunction() const { return Fn; }
+ const Twine &getDescription() const { return Description; }
+
+ void print(DiagnosticPrinter &DP) const override;
+
+ static bool classof(const DiagnosticInfo *DI) {
+ return DI->getKind() == getKindID();
+ }
+};
+
+}
+
+#endif
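A sketch of raising the new diagnostic from target code follows. The helper and the message text are illustrative only; severity defaults to DS_Error as declared above, and construction and reporting are kept in one full expression so the Twine backing the description stays alive while it is printed.

#include "AMDGPUDiagnosticInfoUnsupported.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"

// Report an unsupported construct against the given function.
static void reportUnsupported(const llvm::Function &Fn) {
  Fn.getContext().diagnose(
      llvm::DiagnosticInfoUnsupported(Fn, "dynamic alloca"));
}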
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
index 8175786fb9b1..4d84d281d998 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
@@ -71,9 +71,15 @@ unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const {
}
/// \returns The number of registers allocated for \p FI.
-int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
- int FI) const {
+int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF,
+ int FI,
+ unsigned &FrameReg) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
+
+ // Fill in FrameReg output argument.
+ FrameReg = RI->getFrameRegister(MF);
+
// Start the offset at 2 so we don't overwrite work group information.
// XXX: We should only do this when the shader actually uses this
// information.
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
index 9f31be1af794..257a3da40589 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -8,14 +8,12 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Interface to describe a layout of a stack frame on a AMDIL target
-/// machine.
+/// \brief Interface to describe a layout of a stack frame on an AMDGPU target.
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H
-#define LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H
-#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/Target/TargetFrameLowering.h"
namespace llvm {
@@ -34,7 +32,8 @@ public:
/// \returns The number of 32-bit sub-registers that are used when storing
/// values to the stack.
unsigned getStackWidth(const MachineFunction &MF) const;
- int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const override;
const SpillSlot *
getCalleeSavedSpillSlots(unsigned &NumEntries) const override;
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
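Callers go through the generic TargetFrameLowering interface, so the override above now also reports which register the returned offset is relative to. The sketch below shows the call shape; the wrapper function and variable names are illustrative assumptions.

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/Target/TargetFrameLowering.h"

// Query the frame lowering for the offset of frame index FI; FrameReg is
// filled in with the base register the offset is measured from.
static int frameOffsetFor(const llvm::MachineFunction &MF,
                          const llvm::TargetFrameLowering &TFL, int FI) {
  unsigned FrameReg = 0;
  int Offset = TFL.getFrameIndexReference(MF, FI, FrameReg);
  return Offset;
}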
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 64c54ccb31ff..b33040b4d06a 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -11,6 +11,8 @@
/// \brief Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//
+
+#include "AMDGPUDiagnosticInfoUnsupported.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUISelLowering.h" // For AMDGPUISD
#include "AMDGPURegisterInfo.h"
@@ -20,9 +22,9 @@
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/Function.h"
@@ -40,12 +42,14 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
// Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
// make the right decision when generating code for different targets.
const AMDGPUSubtarget *Subtarget;
+
public:
AMDGPUDAGToDAGISel(TargetMachine &TM);
virtual ~AMDGPUDAGToDAGISel();
bool runOnMachineFunction(MachineFunction &MF) override;
SDNode *Select(SDNode *N) override;
const char *getPassName() const override;
+ void PreprocessISelDAG() override;
void PostprocessISelDAG() override;
private:
@@ -91,7 +95,7 @@ private:
bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
SDValue &Offset1) const;
- void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
+ bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
SDValue &SOffset, SDValue &Offset, SDValue &Offen,
SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
SDValue &TFE) const;
@@ -108,6 +112,16 @@ private:
SDValue &TFE) const;
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
SDValue &Offset, SDValue &GLC) const;
+ bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
+ bool &Imm) const;
+ bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
+ bool &Imm) const;
+ bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
+ bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
+ bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
+ bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
+ bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
+ bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const;
SDNode *SelectAddrSpaceCast(SDNode *N);
bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
@@ -273,6 +287,23 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
return N;
}
+static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
+ switch (NumVectorElts) {
+ case 1:
+ return AMDGPU::SReg_32RegClassID;
+ case 2:
+ return AMDGPU::SReg_64RegClassID;
+ case 4:
+ return AMDGPU::SReg_128RegClassID;
+ case 8:
+ return AMDGPU::SReg_256RegClassID;
+ case 16:
+ return AMDGPU::SReg_512RegClassID;
+ }
+
+ llvm_unreachable("invalid vector size");
+}
+
SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
unsigned int Opc = N->getOpcode();
if (N->isMachineOpcode()) {
@@ -306,38 +337,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
EVT EltVT = VT.getVectorElementType();
assert(EltVT.bitsEq(MVT::i32));
if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
- bool UseVReg = true;
- for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
- U != E; ++U) {
- if (!U->isMachineOpcode()) {
- continue;
- }
- const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
- if (!RC) {
- continue;
- }
- if (static_cast<const SIRegisterInfo *>(TRI)->isSGPRClass(RC)) {
- UseVReg = false;
- }
- }
- switch(NumVectorElts) {
- case 1: RegClassID = UseVReg ? AMDGPU::VGPR_32RegClassID :
- AMDGPU::SReg_32RegClassID;
- break;
- case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID :
- AMDGPU::SReg_64RegClassID;
- break;
- case 4: RegClassID = UseVReg ? AMDGPU::VReg_128RegClassID :
- AMDGPU::SReg_128RegClassID;
- break;
- case 8: RegClassID = UseVReg ? AMDGPU::VReg_256RegClassID :
- AMDGPU::SReg_256RegClassID;
- break;
- case 16: RegClassID = UseVReg ? AMDGPU::VReg_512RegClassID :
- AMDGPU::SReg_512RegClassID;
- break;
- default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
- }
+ RegClassID = selectSGPRVectorRegClassID(NumVectorElts);
} else {
// BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
// that adds a 128 bits reg copy when going through TwoAddressInstructions
@@ -455,98 +455,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
N->getValueType(0), Ops);
}
-
- case ISD::LOAD: {
- LoadSDNode *LD = cast<LoadSDNode>(N);
- SDLoc SL(N);
- EVT VT = N->getValueType(0);
-
- if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) {
- N = glueCopyToM0(N);
- break;
- }
-
- // To simplify the TableGen patters, we replace all i64 loads with
- // v2i32 loads. Alternatively, we could promote i64 loads to v2i32
- // during DAG legalization, however, so places (ExpandUnalignedLoad)
- // in the DAG legalizer assume that if i64 is legal, so doing this
- // promotion early can cause problems.
-
- SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(),
- LD->getBasePtr(), LD->getMemOperand());
- SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL,
- MVT::i64, NewLoad);
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1));
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast);
- SDNode *Load = glueCopyToM0(NewLoad.getNode());
- SelectCode(Load);
- N = BitCast.getNode();
- break;
- }
-
+ case ISD::LOAD:
case ISD::STORE: {
- // Handle i64 stores here for the same reason mentioned above for loads.
- StoreSDNode *ST = cast<StoreSDNode>(N);
- SDValue Value = ST->getValue();
- if (Value.getValueType() == MVT::i64 && !ST->isTruncatingStore()) {
-
- SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
- MVT::v2i32, Value);
- SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue,
- ST->getBasePtr(), ST->getMemOperand());
-
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore);
-
- if (NewValue.getOpcode() == ISD::BITCAST) {
- Select(NewStore.getNode());
- return SelectCode(NewValue.getNode());
- }
-
- // getNode() may fold the bitcast if its input was another bitcast. If that
- // happens we should only select the new store.
- N = NewStore.getNode();
- }
-
N = glueCopyToM0(N);
break;
}
- case AMDGPUISD::REGISTER_LOAD: {
- if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
- break;
- SDValue Addr, Offset;
-
- SDLoc DL(N);
- SelectADDRIndirect(N->getOperand(1), Addr, Offset);
- const SDValue Ops[] = {
- Addr,
- Offset,
- CurDAG->getTargetConstant(0, DL, MVT::i32),
- N->getOperand(0),
- };
- return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, DL,
- CurDAG->getVTList(MVT::i32, MVT::i64,
- MVT::Other),
- Ops);
- }
- case AMDGPUISD::REGISTER_STORE: {
- if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
- break;
- SDValue Addr, Offset;
- SelectADDRIndirect(N->getOperand(2), Addr, Offset);
- SDLoc DL(N);
- const SDValue Ops[] = {
- N->getOperand(1),
- Addr,
- Offset,
- CurDAG->getTargetConstant(0, DL, MVT::i32),
- N->getOperand(0),
- };
- return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, DL,
- CurDAG->getVTList(MVT::Other),
- Ops);
- }
-
case AMDGPUISD::BFE_I32:
case AMDGPUISD::BFE_U32: {
if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
@@ -575,7 +489,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
return getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, SDLoc(N),
N->getOperand(0), OffsetVal, WidthVal);
-
}
case AMDGPUISD::DIV_SCALE: {
return SelectDIV_SCALE(N);
@@ -601,7 +514,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
return SelectCode(N);
}
-
bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) {
assert(AS != 0 && "Use checkPrivateAddress instead.");
if (!Ptr)
@@ -681,7 +593,7 @@ bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const {
if (checkPrivateAddress(N->getMemOperand())) {
if (MMO) {
const PseudoSourceValue *PSV = MMO->getPseudoValue();
- if (PSV && PSV == PseudoSourceValue::getConstantPool()) {
+ if (PSV && PSV->isConstantPool()) {
return true;
}
}
@@ -847,7 +759,8 @@ SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
unsigned Opc
= (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;
- // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
+ // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
+ // omod
SDValue Ops[8];
SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
@@ -883,15 +796,39 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
Offset = N1;
return true;
}
- }
+ } else if (Addr.getOpcode() == ISD::SUB) {
+ // sub C, x -> add (sub 0, x), C
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
+ int64_t ByteOffset = C->getSExtValue();
+ if (isUInt<16>(ByteOffset)) {
+ SDLoc DL(Addr);
+ SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
+
+ // XXX - This is kind of hacky. Create a dummy sub node so we can check
+ // the known bits in isDSOffsetLegal. We need to emit the selected node
+ // here, so this is thrown away.
+ SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
+ Zero, Addr.getOperand(1));
+
+ if (isDSOffsetLegal(Sub, ByteOffset, 16)) {
+ MachineSDNode *MachineSub
+ = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32,
+ Zero, Addr.getOperand(1));
+
+ Base = SDValue(MachineSub, 0);
+ Offset = Addr.getOperand(0);
+ return true;
+ }
+ }
+ }
+ } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
+ // If we have a constant address, prefer to put the constant into the
+ // offset. This can save moves to load the constant address since multiple
+ // operations can share the zero base address register, and enables merging
+ // into read2 / write2 instructions.
- SDLoc DL(Addr);
+ SDLoc DL(Addr);
- // If we have a constant address, prefer to put the constant into the
- // offset. This can save moves to load the constant address since multiple
- // operations can share the zero base address register, and enables merging
- // into read2 / write2 instructions.
- if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
if (isUInt<16>(CAddr->getZExtValue())) {
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
@@ -904,10 +841,11 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
// default case
Base = Addr;
- Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+ Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
return true;
}
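
The sub-to-add rewrite used by both DS addressing selectors above relies only on unsigned wraparound: C - x and (0 - x) + C are bit-identical, so the constant can move into the 16-bit DS offset field while the negated register becomes the base. A minimal standalone sketch of that identity (illustrative names, not code from this patch):

#include <cstdint>

// C - X rewritten as (0 - X) + C: Base plays the role of the V_SUB_I32_e32
// result and C becomes the immediate offset encoded in the DS instruction.
static uint32_t dsAddressViaSub(uint32_t C, uint32_t X) {
  uint32_t Base = 0u - X;
  return Base + C;
}
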
+// TODO: If offset is too big, put low 16-bit into offset.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
SDValue &Offset0,
SDValue &Offset1) const {
@@ -926,9 +864,35 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
return true;
}
- }
-
- if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
+ } else if (Addr.getOpcode() == ISD::SUB) {
+ // sub C, x -> add (sub 0, x), C
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
+ unsigned DWordOffset0 = C->getZExtValue() / 4;
+ unsigned DWordOffset1 = DWordOffset0 + 1;
+
+ if (isUInt<8>(DWordOffset0)) {
+ SDLoc DL(Addr);
+ SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
+
+ // XXX - This is kind of hacky. Create a dummy sub node so we can check
+ // the known bits in isDSOffsetLegal. We need to emit the selected node
+ // here, so this is thrown away.
+ SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
+ Zero, Addr.getOperand(1));
+
+ if (isDSOffsetLegal(Sub, DWordOffset1, 8)) {
+ MachineSDNode *MachineSub
+ = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32,
+ Zero, Addr.getOperand(1));
+
+ Base = SDValue(MachineSub, 0);
+ Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
+ Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
+ return true;
+ }
+ }
+ }
+ } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
unsigned DWordOffset0 = CAddr->getZExtValue() / 4;
unsigned DWordOffset1 = DWordOffset0 + 1;
assert(4 * DWordOffset0 == CAddr->getZExtValue());
@@ -956,12 +920,16 @@ static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) {
return isUInt<12>(Imm->getZExtValue());
}
-void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
+bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
SDValue &VAddr, SDValue &SOffset,
SDValue &Offset, SDValue &Offen,
SDValue &Idxen, SDValue &Addr64,
SDValue &GLC, SDValue &SLC,
SDValue &TFE) const {
+  // Subtarget prefers to use flat instructions
+ if (Subtarget->useFlatForGlobal())
+ return false;
+
SDLoc DL(Addr);
GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
@@ -994,14 +962,14 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
if (isLegalMUBUFImmOffset(C1)) {
Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
- return;
+ return true;
} else if (isUInt<32>(C1->getZExtValue())) {
// Illegal offset, store it in soffset.
Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
0);
- return;
+ return true;
}
}
@@ -1013,7 +981,7 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
Ptr = N0;
VAddr = N1;
Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
- return;
+ return true;
}
// default case -> offset
@@ -1021,6 +989,7 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
Ptr = Addr;
Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+ return true;
}
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
@@ -1033,8 +1002,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return false;
- SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
- GLC, SLC, TFE);
+ if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
+ GLC, SLC, TFE))
+ return false;
ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
if (C->getSExtValue()) {
@@ -1052,8 +1022,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &VAddr, SDValue &SOffset,
- SDValue &Offset,
- SDValue &SLC) const {
+ SDValue &Offset,
+ SDValue &SLC) const {
SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1);
SDValue GLC, TFE;
@@ -1066,36 +1036,10 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
SDLoc DL(Addr);
MachineFunction &MF = CurDAG->getMachineFunction();
- const SIRegisterInfo *TRI =
- static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const SITargetLowering& Lowering =
- *static_cast<const SITargetLowering*>(getTargetLowering());
-
- unsigned ScratchOffsetReg =
- TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
- Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass,
- ScratchOffsetReg, MVT::i32);
- SDValue Sym0 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD0", MVT::i32);
- SDValue ScratchRsrcDword0 =
- SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym0), 0);
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- SDValue Sym1 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD1", MVT::i32);
- SDValue ScratchRsrcDword1 =
- SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0);
-
- const SDValue RsrcOps[] = {
- CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
- ScratchRsrcDword0,
- CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
- ScratchRsrcDword1,
- CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
- };
- SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
- MVT::v2i32, RsrcOps), 0);
- Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0);
- SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
- MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32);
+ Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
+ SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32);
// (add n0, c1)
if (CurDAG->isBaseWithConstantOffset(Addr)) {
@@ -1126,8 +1070,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
const SIInstrInfo *TII =
static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
- SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
- GLC, SLC, TFE);
+ if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
+ GLC, SLC, TFE))
+ return false;
if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
!cast<ConstantSDNode>(Idxen)->getSExtValue() &&
@@ -1153,18 +1098,134 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE);
}
+///
+/// \param EncodedOffset This is the immediate value that will be encoded
+/// directly into the instruction. On SI/CI the \p EncodedOffset
+/// will be in units of dwords and on VI+ it will be units of bytes.
+static bool isLegalSMRDImmOffset(const AMDGPUSubtarget *ST,
+ int64_t EncodedOffset) {
+ return ST->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ?
+ isUInt<8>(EncodedOffset) : isUInt<20>(EncodedOffset);
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
+ SDValue &Offset, bool &Imm) const {
+
+ // FIXME: Handle non-constant offsets.
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
+ if (!C)
+ return false;
+
+ SDLoc SL(ByteOffsetNode);
+ AMDGPUSubtarget::Generation Gen = Subtarget->getGeneration();
+ int64_t ByteOffset = C->getSExtValue();
+ int64_t EncodedOffset = Gen < AMDGPUSubtarget::VOLCANIC_ISLANDS ?
+ ByteOffset >> 2 : ByteOffset;
+
+ if (isLegalSMRDImmOffset(Subtarget, EncodedOffset)) {
+ Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
+ Imm = true;
+ return true;
+ }
+
+ if (!isUInt<32>(EncodedOffset) || !isUInt<32>(ByteOffset))
+ return false;
+
+ if (Gen == AMDGPUSubtarget::SEA_ISLANDS && isUInt<32>(EncodedOffset)) {
+ // 32-bit Immediates are supported on Sea Islands.
+ Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
+ } else {
+ SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
+ Offset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32,
+ C32Bit), 0);
+ }
+ Imm = false;
+ return true;
+}
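
As a worked example of the encoding rule documented above (a sketch under the SI/CI-versus-VI split stated in the comment, not code from the patch): a byte offset of 256 encodes as 64 dwords on SI/CI and fits the 8-bit field, while VI encodes the 256 bytes directly against a 20-bit limit.

#include <cstdint>

// Does a byte offset fit the SMRD immediate field? SI/CI encode dwords
// (ByteOffset / 4) in 8 bits; VI and later encode bytes in 20 bits.
static bool fitsSMRDImm(bool IsVIOrLater, int64_t ByteOffset) {
  int64_t Encoded = IsVIOrLater ? ByteOffset : ByteOffset >> 2;
  int64_t Limit = IsVIOrLater ? (int64_t(1) << 20) : (int64_t(1) << 8);
  return Encoded >= 0 && Encoded < Limit;
}
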
+
+bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
+ SDValue &Offset, bool &Imm) const {
+
+ SDLoc SL(Addr);
+ if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ SDValue N0 = Addr.getOperand(0);
+ SDValue N1 = Addr.getOperand(1);
+
+ if (SelectSMRDOffset(N1, Offset, Imm)) {
+ SBase = N0;
+ return true;
+ }
+ }
+ SBase = Addr;
+ Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
+ Imm = true;
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
+ SDValue &Offset) const {
+ bool Imm;
+ return SelectSMRD(Addr, SBase, Offset, Imm) && Imm;
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
+ SDValue &Offset) const {
+
+ if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS)
+ return false;
+
+ bool Imm;
+ if (!SelectSMRD(Addr, SBase, Offset, Imm))
+ return false;
+
+ return !Imm && isa<ConstantSDNode>(Offset);
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
+ SDValue &Offset) const {
+ bool Imm;
+ return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm &&
+ !isa<ConstantSDNode>(Offset);
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
+ SDValue &Offset) const {
+ bool Imm;
+ return SelectSMRDOffset(Addr, Offset, Imm) && Imm;
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
+ SDValue &Offset) const {
+ if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS)
+ return false;
+
+ bool Imm;
+ if (!SelectSMRDOffset(Addr, Offset, Imm))
+ return false;
+
+ return !Imm && isa<ConstantSDNode>(Offset);
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr,
+ SDValue &Offset) const {
+ bool Imm;
+ return SelectSMRDOffset(Addr, Offset, Imm) && !Imm &&
+ !isa<ConstantSDNode>(Offset);
+}
+
// FIXME: This is incorrect and only enough to be able to compile.
SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N);
SDLoc DL(N);
+ const MachineFunction &MF = CurDAG->getMachineFunction();
+ DiagnosticInfoUnsupported NotImplemented(*MF.getFunction(),
+ "addrspacecast not implemented");
+ CurDAG->getContext()->diagnose(NotImplemented);
+
assert(Subtarget->hasFlatAddressSpace() &&
"addrspacecast only supported with flat address space!");
- assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
- ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) &&
- "Cannot cast address space to / from constant address!");
-
assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) &&
"Can only cast to / from flat address space!");
@@ -1190,7 +1251,6 @@ SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32));
}
-
if (DestSize > SrcSize) {
assert(SrcSize == 32 && DestSize == 64);
@@ -1371,6 +1431,65 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
return SelectVOP3Mods(In, Src, SrcMods);
}
+void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
+ bool Modified = false;
+
+ // XXX - Other targets seem to be able to do this without a worklist.
+ SmallVector<LoadSDNode *, 8> LoadsToReplace;
+ SmallVector<StoreSDNode *, 8> StoresToReplace;
+
+ for (SDNode &Node : CurDAG->allnodes()) {
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(&Node)) {
+ EVT VT = LD->getValueType(0);
+ if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD)
+ continue;
+
+      // To simplify the TableGen patterns, we replace all i64 loads with
+      // v2i32 loads. Alternatively, we could promote i64 loads to v2i32
+      // during DAG legalization; however, some places (ExpandUnalignedLoad)
+      // in the DAG legalizer assume that if i64 is legal, i64 loads are as
+      // well, so doing this promotion early can cause problems.
+ LoadsToReplace.push_back(LD);
+ } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(&Node)) {
+ // Handle i64 stores here for the same reason mentioned above for loads.
+ SDValue Value = ST->getValue();
+ if (Value.getValueType() != MVT::i64 || ST->isTruncatingStore())
+ continue;
+ StoresToReplace.push_back(ST);
+ }
+ }
+
+ for (LoadSDNode *LD : LoadsToReplace) {
+ SDLoc SL(LD);
+
+ SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SL, LD->getChain(),
+ LD->getBasePtr(), LD->getMemOperand());
+ SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL,
+ MVT::i64, NewLoad);
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 0), BitCast);
+ Modified = true;
+ }
+
+ for (StoreSDNode *ST : StoresToReplace) {
+ SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(ST),
+ MVT::v2i32, ST->getValue());
+ const SDValue StoreOps[] = {
+ ST->getChain(),
+ NewValue,
+ ST->getBasePtr(),
+ ST->getOffset()
+ };
+
+ CurDAG->UpdateNodeOperands(ST, StoreOps);
+ Modified = true;
+ }
+
+ // XXX - Is this necessary?
+ if (Modified)
+ CurDAG->RemoveDeadNodes();
+}
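
The new PreprocessISelDAG hook leans on a simple bit-level equivalence: an i64 load produces the same bytes as a v2i32 load of the same address followed by a bitcast. A plain C++ sketch of that equivalence (illustrative only, not part of the patch):

#include <cstdint>
#include <cstring>

// Two 32-bit halves reinterpreted as one 64-bit value carry the same bits as
// a direct 64-bit load from the same address; the DAG rewrite above is this
// plus an ISD::BITCAST node, so only v2i32 load patterns are needed.
static uint64_t loadAsTwoDwords(const void *P) {
  uint32_t Halves[2];
  std::memcpy(Halves, P, sizeof(Halves)); // the "v2i32" load
  uint64_t V;
  std::memcpy(&V, Halves, sizeof(V));     // the "bitcast" back to i64
  return V;
}
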
+
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
const AMDGPUTargetLowering& Lowering =
*static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 3a65f3b56146..222f63161be5 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -15,6 +15,7 @@
#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
+#include "AMDGPUDiagnosticInfoUnsupported.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPURegisterInfo.h"
@@ -27,50 +28,9 @@
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/DiagnosticPrinter.h"
using namespace llvm;
-namespace {
-
-/// Diagnostic information for unimplemented or unsupported feature reporting.
-class DiagnosticInfoUnsupported : public DiagnosticInfo {
-private:
- const Twine &Description;
- const Function &Fn;
-
- static int KindID;
-
- static int getKindID() {
- if (KindID == 0)
- KindID = llvm::getNextAvailablePluginDiagnosticKind();
- return KindID;
- }
-
-public:
- DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc,
- DiagnosticSeverity Severity = DS_Error)
- : DiagnosticInfo(getKindID(), Severity),
- Description(Desc),
- Fn(Fn) { }
-
- const Function &getFunction() const { return Fn; }
- const Twine &getDescription() const { return Description; }
-
- void print(DiagnosticPrinter &DP) const override {
- DP << "unsupported " << getDescription() << " in " << Fn.getName();
- }
-
- static bool classof(const DiagnosticInfo *DI) {
- return DI->getKind() == getKindID();
- }
-};
-
-int DiagnosticInfoUnsupported::KindID = 0;
-}
-
-
static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State) {
@@ -113,6 +73,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
setOperationAction(ISD::BRIND, MVT::Other, Expand);
+ // This is totally unsupported, just custom lower to produce an error.
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+
// We need to custom lower some of the intrinsics
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
@@ -352,7 +315,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Custom);
- setOperationAction(ISD::UDIVREM, VT, Custom);
+ setOperationAction(ISD::UDIVREM, VT, Expand);
setOperationAction(ISD::ADDC, VT, Expand);
setOperationAction(ISD::SUBC, VT, Expand);
setOperationAction(ISD::ADDE, VT, Expand);
@@ -429,12 +392,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setSelectIsExpensive(false);
PredictableSelectIsExpensive = false;
- // There are no integer divide instructions, and these expand to a pretty
- // large sequence of instructions.
- setIntDivIsCheap(false);
- setPow2SDivIsCheap(false);
setFsqrtIsCheap(true);
+ // We want to find all load dependencies for long chains of stores to enable
+ // merging into very wide vectors. The problem is with vectors with > 4
+ // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
+ // vectors are a legal type, even though we have to split the loads
+ // usually. When we can more precisely specify load legality per address
+ // space, we should be able to make FindBetterChain/MergeConsecutiveStores
+ // smarter so that they can figure out what to do in 2 iterations without all
+ // N > 4 stores on the same chain.
+ GatherAllAliasesMaxDepth = 16;
+
// FIXME: Need to really handle these.
MaxStoresPerMemcpy = 4096;
MaxStoresPerMemmove = 4096;
@@ -534,6 +503,18 @@ bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
return true;
}
+bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
+ // There are few operations which truly have vector input operands. Any vector
+ // operation is going to involve operations on each component, and a
+ // build_vector will be a copy per element, so it always makes sense to use a
+ // build_vector input in place of the extracted element to avoid a copy into a
+ // super register.
+ //
+ // We should probably only do this if all users are extracts only, but this
+ // should be the common case.
+ return true;
+}
+
bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
// Truncate is just accessing a subregister.
return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0);
@@ -617,6 +598,15 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
return SDValue();
}
+SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const {
+ const Function &Fn = *DAG.getMachineFunction().getFunction();
+
+ DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "dynamic alloca");
+ DAG.getContext()->diagnose(NoDynamicAlloca);
+ return SDValue();
+}
+
SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -643,6 +633,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
}
return Op;
}
@@ -892,7 +883,9 @@ SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
unsigned FrameIndex = FIN->getIndex();
- unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
+ unsigned IgnoredFrameReg;
+ unsigned Offset =
+ TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
Op.getValueType());
}
@@ -1043,9 +1036,6 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(1),
Op.getOperand(2));
- case AMDGPUIntrinsic::AMDGPU_brev:
- return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1));
-
case Intrinsic::AMDGPU_class:
return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
Op.getOperand(1), Op.getOperand(2));
@@ -1057,6 +1047,8 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name.
return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1));
+ case AMDGPUIntrinsic::AMDGPU_brev: // Legacy name
+ return DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(1));
}
}
@@ -1077,6 +1069,7 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT VT = Op.getValueType();
+ // TODO: Should this propagate fast-math-flags?
SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT,
DAG.getConstantFP(1.0f, DL, MVT::f32),
Op.getOperand(1));
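
The LRP lowering excerpted here starts by forming the (1.0 - a) term; assuming the usual reading of the intrinsic as a linear blend, the sequence computes the following scalar expression (hedged sketch, not the patch's code):

// lrp(a, x, y) = a * x + (1.0 - a) * y, built in the DAG as an FSUB, an FMUL
// and a final multiply-add.
static float lrpScalar(float A, float X, float Y) {
  return A * X + (1.0f - A) * Y;
}
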
@@ -1167,45 +1160,6 @@ SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL,
return SDValue();
}
-// FIXME: Remove this when combines added to DAGCombiner.
-SDValue AMDGPUTargetLowering::CombineIMinMax(SDLoc DL,
- EVT VT,
- SDValue LHS,
- SDValue RHS,
- SDValue True,
- SDValue False,
- SDValue CC,
- SelectionDAG &DAG) const {
- if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
- return SDValue();
-
- ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
- switch (CCOpcode) {
- case ISD::SETULE:
- case ISD::SETULT: {
- unsigned Opc = (LHS == True) ? ISD::UMIN : ISD::UMAX;
- return DAG.getNode(Opc, DL, VT, LHS, RHS);
- }
- case ISD::SETLE:
- case ISD::SETLT: {
- unsigned Opc = (LHS == True) ? ISD::SMIN : ISD::SMAX;
- return DAG.getNode(Opc, DL, VT, LHS, RHS);
- }
- case ISD::SETGT:
- case ISD::SETGE: {
- unsigned Opc = (LHS == True) ? ISD::SMAX : ISD::SMIN;
- return DAG.getNode(Opc, DL, VT, LHS, RHS);
- }
- case ISD::SETUGE:
- case ISD::SETUGT: {
- unsigned Opc = (LHS == True) ? ISD::UMAX : ISD::UMIN;
- return DAG.getNode(Opc, DL, VT, LHS, RHS);
- }
- default:
- return SDValue();
- }
-}
-
SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op,
SelectionDAG &DAG) const {
LoadSDNode *Load = cast<LoadSDNode>(Op);
@@ -1260,7 +1214,8 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
EVT PtrVT = BasePtr.getValueType();
EVT MemVT = Load->getMemoryVT();
SDLoc SL(Op);
- MachinePointerInfo SrcValue(Load->getMemOperand()->getValue());
+
+ const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
EVT LoVT, HiVT;
EVT LoMemVT, HiMemVT;
@@ -1269,23 +1224,27 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT);
+
+ unsigned Size = LoMemVT.getStoreSize();
+ unsigned BaseAlign = Load->getAlignment();
+ unsigned HiAlign = MinAlign(BaseAlign, Size);
+
SDValue LoLoad
= DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
Load->getChain(), BasePtr,
SrcValue,
LoMemVT, Load->isVolatile(), Load->isNonTemporal(),
- Load->isInvariant(), Load->getAlignment());
+ Load->isInvariant(), BaseAlign);
SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
- DAG.getConstant(LoMemVT.getStoreSize(), SL,
- PtrVT));
+ DAG.getConstant(Size, SL, PtrVT));
SDValue HiLoad
= DAG.getExtLoad(Load->getExtensionType(), SL, HiVT,
Load->getChain(), HiPtr,
SrcValue.getWithOffset(LoMemVT.getStoreSize()),
HiMemVT, Load->isVolatile(), Load->isNonTemporal(),
- Load->isInvariant(), Load->getAlignment());
+ Load->isInvariant(), HiAlign);
SDValue Ops[] = {
DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
@@ -1415,7 +1374,11 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
DAG.getConstant(LoMemVT.getStoreSize(), SL,
PtrVT));
- MachinePointerInfo SrcValue(Store->getMemOperand()->getValue());
+ const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
+ unsigned BaseAlign = Store->getAlignment();
+ unsigned Size = LoMemVT.getStoreSize();
+ unsigned HiAlign = MinAlign(BaseAlign, Size);
+
SDValue LoStore
= DAG.getTruncStore(Chain, SL, Lo,
BasePtr,
@@ -1423,15 +1386,15 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
LoMemVT,
Store->isNonTemporal(),
Store->isVolatile(),
- Store->getAlignment());
+ BaseAlign);
SDValue HiStore
= DAG.getTruncStore(Chain, SL, Hi,
HiPtr,
- SrcValue.getWithOffset(LoMemVT.getStoreSize()),
+ SrcValue.getWithOffset(Size),
HiMemVT,
Store->isNonTemporal(),
Store->isVolatile(),
- Store->getAlignment());
+ HiAlign);
return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
}
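
Both split routines now narrow the second half's alignment with MinAlign(BaseAlign, Size): the high half lives Size bytes past the base, so only the largest power of two dividing both values is guaranteed. A standalone sketch of that computation (assumes the usual lowest-set-bit formulation, not copied from LLVM's headers):

#include <cstdint>

// Largest power of two dividing both A and B, i.e. the lowest set bit of
// A | B. For a 16-byte aligned base split at offset 8 this yields 8.
static uint64_t minAlignSketch(uint64_t A, uint64_t B) {
  uint64_t Bits = A | B;
  return Bits & (~Bits + 1);
}
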
@@ -1529,7 +1492,7 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
Store->getValue().getValueType().isVector()) {
- return ScalarizeVectorStore(Op, DAG);
+ return SplitVectorStore(Op, DAG);
}
EVT MemVT = Store->getMemoryVT();
@@ -1630,6 +1593,7 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
// float fb = (float)ib;
SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
+ // TODO: Should this propagate fast-math-flags?
// float fq = native_divide(fa, fb);
SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
@@ -1940,6 +1904,8 @@ SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
SDValue X = Op.getOperand(0);
SDValue Y = Op.getOperand(1);
+ // TODO: Should this propagate fast-math-flags?
+
SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
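
The FREM expansion in this hunk is the textbook remainder formula; a scalar sketch of the same arithmetic (illustrative, default rounding assumed):

#include <cmath>

// rem = x - trunc(x / y) * y, mirroring the FDIV -> FTRUNC -> FMUL -> FSUB
// chain built above.
static double fremSketch(double X, double Y) {
  return X - std::trunc(X / Y) * Y;
}
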
@@ -1968,6 +1934,7 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
+ // TODO: Should this propagate fast-math-flags?
return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}
@@ -2045,6 +2012,8 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
+ // TODO: Should this propagate fast-math-flags?
+
SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
@@ -2074,6 +2043,8 @@ SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const
SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X);
+ // TODO: Should this propagate fast-math-flags?
+
SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T);
SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff);
@@ -2184,6 +2155,7 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
+ // TODO: Should this propagate fast-math-flags?
return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}
@@ -2206,7 +2178,7 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
DAG.getConstant(32, SL, MVT::i32));
-
+ // TODO: Should this propagate fast-math-flags?
return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
}
@@ -2231,6 +2203,7 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
DAG.getConstant(1, DL, MVT::i32));
SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi);
+ // TODO: Should this propagate fast-math-flags?
FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi,
DAG.getConstantFP(4294967296.0f, DL, MVT::f32)); // 2^32
return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi);
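
The UINT_TO_FP path shown here splits the 64-bit source into 32-bit halves and recombines them in floating point; the equivalent scalar arithmetic (sketch only):

#include <cstdint>

// (float)hi * 2^32 + (float)lo, matching the FMUL by 4294967296.0f and the
// final FADD in the lowering above.
static float uint64ToFloatSketch(uint64_t V) {
  float Lo = static_cast<float>(static_cast<uint32_t>(V));
  float Hi = static_cast<float>(static_cast<uint32_t>(V >> 32));
  return Hi * 4294967296.0f + Lo;
}
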
@@ -2257,7 +2230,7 @@ SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
MVT::f64);
SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
MVT::f64);
-
+ // TODO: Should this propagate fast-math-flags?
SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
@@ -2511,12 +2484,6 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
if (VT == MVT::f32)
return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
-
- // TODO: Implement min / max Evergreen instructions.
- if (VT == MVT::i32 &&
- Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
- return CombineIMinMax(DL, VT, LHS, RHS, True, False, CC, DAG);
- }
}
break;
@@ -2652,20 +2619,14 @@ bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const {
if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
return CFP->isExactlyValue(1.0);
}
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
- return C->isAllOnesValue();
- }
- return false;
+ return isAllOnesConstant(Op);
}
bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const {
if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
return CFP->getValueAPF().isZero();
}
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
- return C->isNullValue();
- }
- return false;
+ return isNullConstant(Op);
}
SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
@@ -2738,7 +2699,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BFE_I32)
NODE_NAME_CASE(BFI)
NODE_NAME_CASE(BFM)
- NODE_NAME_CASE(BREV)
NODE_NAME_CASE(MUL_U24)
NODE_NAME_CASE(MUL_I24)
NODE_NAME_CASE(MAD_U24)
@@ -2893,8 +2853,7 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
return 1;
unsigned SignBits = 32 - Width->getZExtValue() + 1;
- ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1));
- if (!Offset || !Offset->isNullValue())
+ if (!isNullConstant(Op.getOperand(1)))
return SignBits;
// TODO: Could probably figure something out with non-0 offsets.
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 478b2035fd75..7314cc050ba5 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -138,6 +138,7 @@ public:
bool storeOfVectorConstantIsCheap(EVT MemVT,
unsigned NumElem,
unsigned AS) const override;
+ bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override;
bool isCheapToSpeculateCttz() const override;
bool isCheapToSpeculateCtlz() const override;
@@ -149,6 +150,9 @@ public:
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const;
+
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
void ReplaceNodeResults(SDNode * N,
@@ -165,14 +169,6 @@ public:
SDValue False,
SDValue CC,
DAGCombinerInfo &DCI) const;
- SDValue CombineIMinMax(SDLoc DL,
- EVT VT,
- SDValue LHS,
- SDValue RHS,
- SDValue True,
- SDValue False,
- SDValue CC,
- SelectionDAG &DAG) const;
const char* getTargetNodeName(unsigned Opcode) const override;
@@ -216,7 +212,7 @@ public:
/// \brief Helper function that returns the byte offset of the given
/// type of implicit parameter.
- unsigned getImplicitParameterOffset(const AMDGPUMachineFunction *MFI,
+ uint32_t getImplicitParameterOffset(const AMDGPUMachineFunction *MFI,
const ImplicitParameter Param) const;
};
@@ -267,7 +263,6 @@ enum NodeType : unsigned {
BFE_I32, // Extract range of bits with sign extension to 32-bits.
BFI, // (src0 & src1) | (~src0 & src2)
BFM, // Insert a range of bits into a 32-bit word.
- BREV, // Reverse bits.
MUL_U24,
MUL_I24,
MAD_U24,
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index 15a3d543a68c..a266e711af5b 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -164,11 +164,6 @@ MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl(
// TODO: Implement this function
return nullptr;
}
-bool AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
- ArrayRef<unsigned> Ops) const {
- // TODO: Implement this function
- return false;
-}
bool
AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
unsigned Reg, bool UnfoldLoad,
@@ -312,7 +307,9 @@ int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
return -1;
}
- Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexOffset(MF, -1);
+ unsigned IgnoredFrameReg;
+ Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexReference(
+ MF, -1, IgnoredFrameReg);
return getIndirectIndexBegin(MF) + Offset;
}
@@ -367,3 +364,14 @@ int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
return MCOp;
}
+
+ArrayRef<std::pair<int, const char *>>
+AMDGPUInstrInfo::getSerializableTargetIndices() const {
+ static const std::pair<int, const char *> TargetIndices[] = {
+ {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
+ {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
+ {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
+ {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
+ {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
+ return makeArrayRef(TargetIndices);
+}
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index 86d3962b3856..53e8b23b3d62 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -103,8 +103,6 @@ public:
/// read or write or -1 if indirect addressing is not used by this program.
int getIndirectIndexEnd(const MachineFunction &MF) const;
- bool canFoldMemoryOperand(const MachineInstr *MI,
- ArrayRef<unsigned> Ops) const override;
bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
SmallVectorImpl<MachineInstr *> &NewMIs) const override;
@@ -147,6 +145,9 @@ public:
return get(pseudoToMCOpcode(Opcode));
}
+ ArrayRef<std::pair<int, const char *>>
+ getSerializableTargetIndices() const override;
+
//===---------------------------------------------------------------------===//
  // Pure virtual functions to be implemented by sub-classes.
//===---------------------------------------------------------------------===//
@@ -195,6 +196,7 @@ public:
};
namespace AMDGPU {
+ LLVM_READONLY
int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex);
} // End namespace AMDGPU
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index b413897d9d23..70e589c28429 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -191,8 +191,6 @@ def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
-def AMDGPUbrev : SDNode<"AMDGPUISD::BREV", SDTIntUnaryOp>;
-
// Signed and unsigned 24-bit multiply. The highest 8 bits are ignored when
// performing the multiply. The result is a 32-bit value.
def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
index 72cab39277c6..11f6139deddd 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -514,7 +514,7 @@ class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul>
class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx,
SubRegIndex sub_reg>
: Pat<
- (sub_type (vector_extract vec_type:$src, sub_idx)),
+ (sub_type (extractelt vec_type:$src, sub_idx)),
(EXTRACT_SUBREG $src, sub_reg)
>;
@@ -522,7 +522,7 @@ class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx,
class Insert_Element <ValueType elem_type, ValueType vec_type,
int sub_idx, SubRegIndex sub_reg>
: Pat <
- (vector_insert vec_type:$vec, elem_type:$elem, sub_idx),
+ (insertelt vec_type:$vec, elem_type:$elem, sub_idx),
(INSERT_SUBREG $vec, $elem, sub_reg)
>;
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/lib/Target/AMDGPU/AMDGPUIntrinsics.td
index ab489cd2a4ab..1de3546485b1 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsics.td
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsics.td
@@ -69,8 +69,8 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_barrier_local : Intrinsic<[], [], []>;
- def int_AMDGPU_barrier_global : Intrinsic<[], [], []>;
+ def int_AMDGPU_barrier_local : Intrinsic<[], [], [IntrConvergent]>;
+ def int_AMDGPU_barrier_global : Intrinsic<[], [], [IntrConvergent]>;
}
// Legacy names for compatibility.
diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 20831460b933..dfc652f31da5 100644
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -61,7 +61,7 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
MCOp = MCOperand::createImm(MO.getImm());
break;
case MachineOperand::MO_Register:
- MCOp = MCOperand::createReg(MO.getReg());
+ MCOp = MCOperand::createReg(AMDGPU::getMCReg(MO.getReg(), ST));
break;
case MachineOperand::MO_MachineBasicBlock:
MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(
@@ -73,13 +73,6 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(Sym, Ctx));
break;
}
- case MachineOperand::MO_TargetIndex: {
- assert(MO.getIndex() == AMDGPU::TI_CONSTDATA_START);
- MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
- const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx);
- MCOp = MCOperand::createExpr(Expr);
- break;
- }
case MachineOperand::MO_ExternalSymbol: {
MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName()));
const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx);
@@ -104,10 +97,9 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
#endif
if (MI->isBundle()) {
const MachineBasicBlock *MBB = MI->getParent();
- MachineBasicBlock::const_instr_iterator I = MI;
- ++I;
- while (I != MBB->end() && I->isInsideBundle()) {
- EmitInstruction(I);
+ MachineBasicBlock::const_instr_iterator I = ++MI->getIterator();
+ while (I != MBB->instr_end() && I->isInsideBundle()) {
+ EmitInstruction(&*I);
++I;
}
} else {
@@ -136,8 +128,6 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter();
InstEmitter.encodeInstruction(TmpInst, CodeStream, Fixups,
MF->getSubtarget<MCSubtargetInfo>());
- CodeStream.flush();
-
HexLines.resize(HexLines.size() + 1);
std::string &HexLine = HexLines.back();
raw_string_ostream HexStream(HexLine);
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 21c7da663234..54137177e4c0 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -1,11 +1,10 @@
#include "AMDGPUMachineFunction.h"
#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
using namespace llvm;
-static const char *const ShaderTypeAttribute = "ShaderType";
-
// Pin the vtable to this file.
void AMDGPUMachineFunction::anchor() {}
@@ -13,13 +12,9 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MachineFunctionInfo(),
ShaderType(ShaderType::COMPUTE),
LDSSize(0),
+ ABIArgOffset(0),
ScratchSize(0),
IsKernel(true) {
- Attribute A = MF.getFunction()->getFnAttribute(ShaderTypeAttribute);
- if (A.isStringAttribute()) {
- StringRef Str = A.getValueAsString();
- if (Str.getAsInteger(0, ShaderType))
- llvm_unreachable("Can't parse shader type!");
- }
+ ShaderType = AMDGPU::getShaderType(*MF.getFunction());
}
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index f5e4694e76f6..46fcee874887 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -37,6 +37,11 @@ public:
return ShaderType;
}
+ bool isKernel() const {
+ // FIXME: Assume everything is a kernel until function calls are supported.
+ return true;
+ }
+
unsigned ScratchSize;
bool IsKernel;
};
diff --git a/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp b/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp
new file mode 100644
index 000000000000..554bf1da81f5
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp
@@ -0,0 +1,373 @@
+//===-- AMDGPUOpenCLImageTypeLoweringPass.cpp -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass resolves calls to OpenCL image attribute, image resource ID and
+/// sampler resource ID getter functions.
+///
+/// Image attributes (size and format) are expected to be passed to the kernel
+/// as kernel arguments immediately following the image argument itself,
+/// therefore this pass adds image size and format arguments to the kernel
+/// functions in the module. The kernel functions with image arguments are
+/// re-created using the new signature. The new arguments are added to the
+/// kernel metadata with kernel_arg_type set to "image_size" or "image_format".
+/// Note: this pass may invalidate pointers to functions.
+///
+/// Resource IDs of read-only images, write-only images and samplers are
+/// defined to be their index among the kernel arguments of the same
+/// type and access qualifier.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+using namespace llvm;
+
+namespace {
+
+StringRef GetImageSizeFunc = "llvm.OpenCL.image.get.size";
+StringRef GetImageFormatFunc = "llvm.OpenCL.image.get.format";
+StringRef GetImageResourceIDFunc = "llvm.OpenCL.image.get.resource.id";
+StringRef GetSamplerResourceIDFunc = "llvm.OpenCL.sampler.get.resource.id";
+
+StringRef ImageSizeArgMDType = "__llvm_image_size";
+StringRef ImageFormatArgMDType = "__llvm_image_format";
+
+StringRef KernelsMDNodeName = "opencl.kernels";
+StringRef KernelArgMDNodeNames[] = {
+ "kernel_arg_addr_space",
+ "kernel_arg_access_qual",
+ "kernel_arg_type",
+ "kernel_arg_base_type",
+ "kernel_arg_type_qual"};
+const unsigned NumKernelArgMDNodes = 5;
+
+typedef SmallVector<Metadata *, 8> MDVector;
+struct KernelArgMD {
+ MDVector ArgVector[NumKernelArgMDNodes];
+};
+
+} // end anonymous namespace
+
+static inline bool
+IsImageType(StringRef TypeString) {
+ return TypeString == "image2d_t" || TypeString == "image3d_t";
+}
+
+static inline bool
+IsSamplerType(StringRef TypeString) {
+ return TypeString == "sampler_t";
+}
+
+static Function *
+GetFunctionFromMDNode(MDNode *Node) {
+ if (!Node)
+ return nullptr;
+
+ size_t NumOps = Node->getNumOperands();
+ if (NumOps != NumKernelArgMDNodes + 1)
+ return nullptr;
+
+ auto F = mdconst::dyn_extract<Function>(Node->getOperand(0));
+ if (!F)
+ return nullptr;
+
+ // Sanity checks.
+ size_t ExpectNumArgNodeOps = F->arg_size() + 1;
+ for (size_t i = 0; i < NumKernelArgMDNodes; ++i) {
+ MDNode *ArgNode = dyn_cast_or_null<MDNode>(Node->getOperand(i + 1));
+ if (ArgNode->getNumOperands() != ExpectNumArgNodeOps)
+ return nullptr;
+ if (!ArgNode->getOperand(0))
+ return nullptr;
+
+    // FIXME: It should be possible to do image lowering when some metadata
+    // args are missing or not in the expected order.
+ MDString *StringNode = dyn_cast<MDString>(ArgNode->getOperand(0));
+ if (!StringNode || StringNode->getString() != KernelArgMDNodeNames[i])
+ return nullptr;
+ }
+
+ return F;
+}
+
+static StringRef
+AccessQualFromMD(MDNode *KernelMDNode, unsigned ArgIdx) {
+ MDNode *ArgAQNode = cast<MDNode>(KernelMDNode->getOperand(2));
+ return cast<MDString>(ArgAQNode->getOperand(ArgIdx + 1))->getString();
+}
+
+static StringRef
+ArgTypeFromMD(MDNode *KernelMDNode, unsigned ArgIdx) {
+ MDNode *ArgTypeNode = cast<MDNode>(KernelMDNode->getOperand(3));
+ return cast<MDString>(ArgTypeNode->getOperand(ArgIdx + 1))->getString();
+}
+
+static MDVector
+GetArgMD(MDNode *KernelMDNode, unsigned OpIdx) {
+ MDVector Res;
+ for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) {
+ MDNode *Node = cast<MDNode>(KernelMDNode->getOperand(i + 1));
+ Res.push_back(Node->getOperand(OpIdx));
+ }
+ return Res;
+}
+
+static void
+PushArgMD(KernelArgMD &MD, const MDVector &V) {
+ assert(V.size() == NumKernelArgMDNodes);
+ for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) {
+ MD.ArgVector[i].push_back(V[i]);
+ }
+}
+
+namespace {
+
+class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass {
+ static char ID;
+
+ LLVMContext *Context;
+ Type *Int32Type;
+ Type *ImageSizeType;
+ Type *ImageFormatType;
+ SmallVector<Instruction *, 4> InstsToErase;
+
+ bool replaceImageUses(Argument &ImageArg, uint32_t ResourceID,
+ Argument &ImageSizeArg,
+ Argument &ImageFormatArg) {
+ bool Modified = false;
+
+ for (auto &Use : ImageArg.uses()) {
+ auto Inst = dyn_cast<CallInst>(Use.getUser());
+ if (!Inst) {
+ continue;
+ }
+
+ Function *F = Inst->getCalledFunction();
+ if (!F)
+ continue;
+
+ Value *Replacement = nullptr;
+ StringRef Name = F->getName();
+ if (Name.startswith(GetImageResourceIDFunc)) {
+ Replacement = ConstantInt::get(Int32Type, ResourceID);
+ } else if (Name.startswith(GetImageSizeFunc)) {
+ Replacement = &ImageSizeArg;
+ } else if (Name.startswith(GetImageFormatFunc)) {
+ Replacement = &ImageFormatArg;
+ } else {
+ continue;
+ }
+
+ Inst->replaceAllUsesWith(Replacement);
+ InstsToErase.push_back(Inst);
+ Modified = true;
+ }
+
+ return Modified;
+ }
+
+ bool replaceSamplerUses(Argument &SamplerArg, uint32_t ResourceID) {
+ bool Modified = false;
+
+ for (const auto &Use : SamplerArg.uses()) {
+ auto Inst = dyn_cast<CallInst>(Use.getUser());
+ if (!Inst) {
+ continue;
+ }
+
+ Function *F = Inst->getCalledFunction();
+ if (!F)
+ continue;
+
+ Value *Replacement = nullptr;
+ StringRef Name = F->getName();
+ if (Name == GetSamplerResourceIDFunc) {
+ Replacement = ConstantInt::get(Int32Type, ResourceID);
+ } else {
+ continue;
+ }
+
+ Inst->replaceAllUsesWith(Replacement);
+ InstsToErase.push_back(Inst);
+ Modified = true;
+ }
+
+ return Modified;
+ }
+
+ bool replaceImageAndSamplerUses(Function *F, MDNode *KernelMDNode) {
+ uint32_t NumReadOnlyImageArgs = 0;
+ uint32_t NumWriteOnlyImageArgs = 0;
+ uint32_t NumSamplerArgs = 0;
+
+ bool Modified = false;
+ InstsToErase.clear();
+ for (auto ArgI = F->arg_begin(); ArgI != F->arg_end(); ++ArgI) {
+ Argument &Arg = *ArgI;
+ StringRef Type = ArgTypeFromMD(KernelMDNode, Arg.getArgNo());
+
+ // Handle image types.
+ if (IsImageType(Type)) {
+ StringRef AccessQual = AccessQualFromMD(KernelMDNode, Arg.getArgNo());
+ uint32_t ResourceID;
+ if (AccessQual == "read_only") {
+ ResourceID = NumReadOnlyImageArgs++;
+ } else if (AccessQual == "write_only") {
+ ResourceID = NumWriteOnlyImageArgs++;
+ } else {
+ llvm_unreachable("Wrong image access qualifier.");
+ }
+
+ Argument &SizeArg = *(++ArgI);
+ Argument &FormatArg = *(++ArgI);
+ Modified |= replaceImageUses(Arg, ResourceID, SizeArg, FormatArg);
+
+ // Handle sampler type.
+ } else if (IsSamplerType(Type)) {
+ uint32_t ResourceID = NumSamplerArgs++;
+ Modified |= replaceSamplerUses(Arg, ResourceID);
+ }
+ }
+ for (unsigned i = 0; i < InstsToErase.size(); ++i) {
+ InstsToErase[i]->eraseFromParent();
+ }
+
+ return Modified;
+ }
+
+ std::tuple<Function *, MDNode *>
+ addImplicitArgs(Function *F, MDNode *KernelMDNode) {
+ bool Modified = false;
+
+ FunctionType *FT = F->getFunctionType();
+ SmallVector<Type *, 8> ArgTypes;
+
+ // Metadata operands for new MDNode.
+ KernelArgMD NewArgMDs;
+ PushArgMD(NewArgMDs, GetArgMD(KernelMDNode, 0));
+
+ // Add implicit arguments to the signature.
+ for (unsigned i = 0; i < FT->getNumParams(); ++i) {
+ ArgTypes.push_back(FT->getParamType(i));
+ MDVector ArgMD = GetArgMD(KernelMDNode, i + 1);
+ PushArgMD(NewArgMDs, ArgMD);
+
+ if (!IsImageType(ArgTypeFromMD(KernelMDNode, i)))
+ continue;
+
+ // Add size implicit argument.
+ ArgTypes.push_back(ImageSizeType);
+ ArgMD[2] = ArgMD[3] = MDString::get(*Context, ImageSizeArgMDType);
+ PushArgMD(NewArgMDs, ArgMD);
+
+ // Add format implicit argument.
+ ArgTypes.push_back(ImageFormatType);
+ ArgMD[2] = ArgMD[3] = MDString::get(*Context, ImageFormatArgMDType);
+ PushArgMD(NewArgMDs, ArgMD);
+
+ Modified = true;
+ }
+ if (!Modified) {
+ return std::make_tuple(nullptr, nullptr);
+ }
+
+ // Create function with new signature and clone the old body into it.
+ auto NewFT = FunctionType::get(FT->getReturnType(), ArgTypes, false);
+ auto NewF = Function::Create(NewFT, F->getLinkage(), F->getName());
+ ValueToValueMapTy VMap;
+ auto NewFArgIt = NewF->arg_begin();
+ for (auto &Arg: F->args()) {
+ auto ArgName = Arg.getName();
+ NewFArgIt->setName(ArgName);
+ VMap[&Arg] = &(*NewFArgIt++);
+ if (IsImageType(ArgTypeFromMD(KernelMDNode, Arg.getArgNo()))) {
+ (NewFArgIt++)->setName(Twine("__size_") + ArgName);
+ (NewFArgIt++)->setName(Twine("__format_") + ArgName);
+ }
+ }
+ SmallVector<ReturnInst*, 8> Returns;
+ CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns);
+
+ // Build new MDNode.
+ SmallVector<llvm::Metadata *, 6> KernelMDArgs;
+ KernelMDArgs.push_back(ConstantAsMetadata::get(NewF));
+ for (unsigned i = 0; i < NumKernelArgMDNodes; ++i)
+ KernelMDArgs.push_back(MDNode::get(*Context, NewArgMDs.ArgVector[i]));
+ MDNode *NewMDNode = MDNode::get(*Context, KernelMDArgs);
+
+ return std::make_tuple(NewF, NewMDNode);
+ }
+
+ bool transformKernels(Module &M) {
+ NamedMDNode *KernelsMDNode = M.getNamedMetadata(KernelsMDNodeName);
+ if (!KernelsMDNode)
+ return false;
+
+ bool Modified = false;
+ for (unsigned i = 0; i < KernelsMDNode->getNumOperands(); ++i) {
+ MDNode *KernelMDNode = KernelsMDNode->getOperand(i);
+ Function *F = GetFunctionFromMDNode(KernelMDNode);
+ if (!F)
+ continue;
+
+ Function *NewF;
+ MDNode *NewMDNode;
+ std::tie(NewF, NewMDNode) = addImplicitArgs(F, KernelMDNode);
+ if (NewF) {
+ // Replace old function and metadata with new ones.
+ F->eraseFromParent();
+ M.getFunctionList().push_back(NewF);
+ M.getOrInsertFunction(NewF->getName(), NewF->getFunctionType(),
+ NewF->getAttributes());
+ KernelsMDNode->setOperand(i, NewMDNode);
+
+ F = NewF;
+ KernelMDNode = NewMDNode;
+ Modified = true;
+ }
+
+ Modified |= replaceImageAndSamplerUses(F, KernelMDNode);
+ }
+
+ return Modified;
+ }
+
+ public:
+ AMDGPUOpenCLImageTypeLoweringPass() : ModulePass(ID) {}
+
+ bool runOnModule(Module &M) override {
+ Context = &M.getContext();
+ Int32Type = Type::getInt32Ty(M.getContext());
+ ImageSizeType = ArrayType::get(Int32Type, 3);
+ ImageFormatType = ArrayType::get(Int32Type, 2);
+
+ return transformKernels(M);
+ }
+
+ const char *getPassName() const override {
+ return "AMDGPU OpenCL Image Type Pass";
+ }
+};
+
+char AMDGPUOpenCLImageTypeLoweringPass::ID = 0;
+
+} // end anonymous namespace
+
+ModulePass *llvm::createAMDGPUOpenCLImageTypeLoweringPass() {
+ return new AMDGPUOpenCLImageTypeLoweringPass();
+}
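As a side note on the pass above: read-only images, write-only images, and samplers each receive resource IDs from an independent counter, assigned in argument order. The following standalone sketch (plain C++, not the pass itself; all names are made up for illustration) reproduces just that numbering rule.

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct KernelArg {
  std::string Type;       // e.g. "image2d_t", "sampler_t", "int*"
  std::string AccessQual; // "read_only", "write_only", or ""
};

int main() {
  std::vector<KernelArg> Args = {
      {"image2d_t", "read_only"},
      {"image3d_t", "write_only"},
      {"sampler_t", ""},
      {"image2d_t", "read_only"},
  };

  uint32_t NumRO = 0, NumWO = 0, NumSamplers = 0;
  for (const KernelArg &A : Args) {
    if (A.Type == "image2d_t" || A.Type == "image3d_t") {
      // Read-only and write-only images count separately.
      uint32_t ID = (A.AccessQual == "read_only") ? NumRO++ : NumWO++;
      std::cout << A.Type << " (" << A.AccessQual << ") -> resource ID " << ID << '\n';
    } else if (A.Type == "sampler_t") {
      std::cout << "sampler_t -> resource ID " << NumSamplers++ << '\n';
    }
  }
  // Prints IDs 0, 0, 0, 1: each category counts independently.
  return 0;
}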
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 57b7a73bf56c..87d50d587059 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -54,7 +54,7 @@ bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
- const FunctionType *FTy = F.getFunctionType();
+ FunctionType *FTy = F.getFunctionType();
LocalMemAvailable = ST.getLocalMemorySize();
@@ -63,7 +63,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
// possible these arguments require the entire local memory space, so
// we cannot use local memory in the pass.
for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
- const Type *ParamTy = FTy->getParamType(i);
+ Type *ParamTy = FTy->getParamType(i);
if (ParamTy->isPointerTy() &&
ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
LocalMemAvailable = 0;
@@ -77,7 +77,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
// Check how much local memory is being used by global objects
for (Module::global_iterator I = Mod->global_begin(),
E = Mod->global_end(); I != E; ++I) {
- GlobalVariable *GV = I;
+ GlobalVariable *GV = &*I;
PointerType *GVTy = GV->getType();
if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
continue;
@@ -101,7 +101,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
return false;
}
-static VectorType *arrayTypeToVecType(const Type *ArrayTy) {
+static VectorType *arrayTypeToVecType(Type *ArrayTy) {
return VectorType::get(ArrayTy->getArrayElementType(),
ArrayTy->getArrayNumElements());
}
@@ -276,6 +276,9 @@ static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
}
void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
+ if (!I.isStaticAlloca())
+ return;
+
IRBuilder<> Builder(&I);
// First try to replace the alloca with a vector
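The early-out added above means the pass now only looks at static allocas, i.e. fixed-size allocas in the entry block. The sketch below (assuming the LLVM C++ API of this vintage; it is an illustration, not part of the patch) builds one static and one dynamically sized alloca and prints what AllocaInst::isStaticAlloca() reports for each.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("m", Ctx);
  Type *I32 = Type::getInt32Ty(Ctx);
  FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), {I32}, false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", &M);

  BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F);
  IRBuilder<> B(Entry);

  // Fixed-size alloca in the entry block: static, still considered by the pass.
  AllocaInst *Fixed = B.CreateAlloca(ArrayType::get(I32, 16), nullptr, "buf");

  // Size depends on a function argument: not static, now skipped early.
  AllocaInst *Dynamic = B.CreateAlloca(I32, &*F->arg_begin(), "vla");

  B.CreateRetVoid();
  outs() << Fixed->isStaticAlloca() << " " << Dynamic->isStaticAlloca() << "\n";
  // Expected output: 1 0
  return 0;
}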
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
index cfd800bdc703..0344834328f6 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
@@ -37,10 +37,6 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
assert(!"Unimplemented"); return BitVector();
}
- virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const {
- assert(!"Unimplemented"); return nullptr;
- }
-
virtual unsigned getHWRegIndex(unsigned Reg) const {
assert(!"Unimplemented"); return 0;
}
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 5f32a65c9338..44e0c47877a9 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -16,6 +16,7 @@
#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
#include "R600MachineScheduler.h"
+#include "SIFrameLowering.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
@@ -44,6 +45,8 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
// disable it.
SmallString<256> FullFS("+promote-alloca,+fp64-denormals,");
+ if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
+ FullFS += "+flat-for-global,";
FullFS += FS;
if (GPU == "" && TT.getArch() == Triple::amdgcn)
@@ -67,26 +70,36 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
DumpCode(false), R600ALUInst(false), HasVertexCache(false),
TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false),
FP64Denormals(false), FP32Denormals(false), FastFMAF32(false),
- CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true),
- EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false),
- EnableUnsafeDSOffsetFolding(false),
+ CaymanISA(false), FlatAddressSpace(false), FlatForGlobal(false),
+ EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true),
+ EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false),
WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),
GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0),
IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false),
- FrameLowering(TargetFrameLowering::StackGrowsUp,
- 64 * 16, // Maximum stack alignment (long16)
- 0),
+ FrameLowering(nullptr),
InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) {
initializeSubtargetDependencies(TT, GPU, FS);
+ const unsigned MaxStackAlign = 64 * 16; // Maximum stack alignment (long16)
+
if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
InstrInfo.reset(new R600InstrInfo(*this));
TLInfo.reset(new R600TargetLowering(TM, *this));
+
+ // FIXME: Should have R600 specific FrameLowering
+ FrameLowering.reset(new AMDGPUFrameLowering(
+ TargetFrameLowering::StackGrowsUp,
+ MaxStackAlign,
+ 0));
} else {
InstrInfo.reset(new SIInstrInfo(*this));
TLInfo.reset(new SITargetLowering(TM, *this));
+ FrameLowering.reset(new SIFrameLowering(
+ TargetFrameLowering::StackGrowsUp,
+ MaxStackAlign,
+ 0));
}
}
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 735f01dfa7c5..9c7bb88f8f4a 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -1,4 +1,4 @@
-//=====-- AMDGPUSubtarget.h - Define Subtarget for the AMDIL ---*- C++ -*-====//
+//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====//
//
// The LLVM Compiler Infrastructure
//
@@ -12,17 +12,15 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H
-#define LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
+
#include "AMDGPU.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPUInstrInfo.h"
-#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPUISelLowering.h"
#include "AMDGPUSubtarget.h"
-#include "R600ISelLowering.h"
-#include "AMDKernelCodeT.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Target/TargetSubtargetInfo.h"
@@ -72,6 +70,7 @@ private:
bool FastFMAF32;
bool CaymanISA;
bool FlatAddressSpace;
+ bool FlatForGlobal;
bool EnableIRStructurizer;
bool EnablePromoteAlloca;
bool EnableIfCvt;
@@ -88,10 +87,10 @@ private:
bool CIInsts;
bool FeatureDisable;
int LDSBankCount;
- unsigned IsaVersion;
+ unsigned IsaVersion;
bool EnableHugeScratchBuffer;
- AMDGPUFrameLowering FrameLowering;
+ std::unique_ptr<AMDGPUFrameLowering> FrameLowering;
std::unique_ptr<AMDGPUTargetLowering> TLInfo;
std::unique_ptr<AMDGPUInstrInfo> InstrInfo;
InstrItineraryData InstrItins;
@@ -104,7 +103,7 @@ public:
StringRef GPU, StringRef FS);
const AMDGPUFrameLowering *getFrameLowering() const override {
- return &FrameLowering;
+ return FrameLowering.get();
}
const AMDGPUInstrInfo *getInstrInfo() const override {
return InstrInfo.get();
@@ -161,6 +160,10 @@ public:
return FlatAddressSpace;
}
+ bool useFlatForGlobal() const {
+ return FlatForGlobal;
+ }
+
bool hasBFE() const {
return (getGeneration() >= EVERGREEN);
}
@@ -305,6 +308,9 @@ public:
return isAmdHsaOS() ? 0 : 36;
}
+ unsigned getMaxNumUserSGPRs() const {
+ return 16;
+ }
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 2297b52b423c..22f85b3e663c 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -14,6 +14,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUTargetMachine.h"
+#include "AMDGPUTargetObjectFile.h"
#include "AMDGPU.h"
#include "AMDGPUTargetTransformInfo.h"
#include "R600ISelLowering.h"
@@ -41,6 +42,23 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget);
RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget);
+
+ PassRegistry *PR = PassRegistry::getPassRegistry();
+ initializeSILowerI1CopiesPass(*PR);
+ initializeSIFixSGPRCopiesPass(*PR);
+ initializeSIFoldOperandsPass(*PR);
+ initializeSIFixSGPRLiveRangesPass(*PR);
+ initializeSIFixControlFlowLiveIntervalsPass(*PR);
+ initializeSILoadStoreOptimizerPass(*PR);
+ initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
+ initializeAMDGPUAnnotateUniformValuesPass(*PR);
+}
+
+static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
+ if (TT.getOS() == Triple::AMDHSA)
+ return make_unique<AMDGPUHSATargetObjectFile>();
+
+ return make_unique<AMDGPUTargetObjectFile>();
}
static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
@@ -72,15 +90,13 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OptLevel)
: LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM,
OptLevel),
- TLOF(new TargetLoweringObjectFileELF()), Subtarget(TT, CPU, FS, *this),
+ TLOF(createTLOF(getTargetTriple())), Subtarget(TT, CPU, FS, *this),
IntrinsicInfo() {
setRequiresStructuredCFG(true);
initAsmInfo();
}
-AMDGPUTargetMachine::~AMDGPUTargetMachine() {
- delete TLOF;
-}
+AMDGPUTargetMachine::~AMDGPUTargetMachine() { }
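The destructor becomes empty because TLOF is now owned by a std::unique_ptr (see the header change further down). A small standalone C++ sketch of the same ownership change, with placeholder class names rather than the LLVM ones:

#include <memory>

struct ObjFileLowering { virtual ~ObjFileLowering() = default; };
struct ELFObjFileLowering : ObjFileLowering {};

// Before: raw pointer plus manual delete in the destructor.
class TargetMachineRaw {
  ObjFileLowering *TLOF;
public:
  TargetMachineRaw() : TLOF(new ELFObjFileLowering()) {}
  ~TargetMachineRaw() { delete TLOF; }
  ObjFileLowering *getObjFileLowering() const { return TLOF; }
};

// After: unique_ptr owns the object, so the compiler-generated destructor suffices.
class TargetMachineOwned {
  std::unique_ptr<ObjFileLowering> TLOF;
public:
  TargetMachineOwned() : TLOF(std::make_unique<ELFObjFileLowering>()) {}
  ObjFileLowering *getObjFileLowering() const { return TLOF.get(); }
};

int main() {
  TargetMachineRaw A;
  TargetMachineOwned B;
  return (A.getObjFileLowering() && B.getObjFileLowering()) ? 0 : 1;
}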
//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
@@ -110,7 +126,13 @@ namespace {
class AMDGPUPassConfig : public TargetPassConfig {
public:
AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM)
- : TargetPassConfig(TM, PM) {}
+ : TargetPassConfig(TM, PM) {
+
+ // Exceptions and StackMaps are not supported, so these passes will never do
+ // anything.
+ disablePass(&StackMapLivenessID);
+ disablePass(&FuncletLayoutID);
+ }
AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
return getTM<AMDGPUTargetMachine>();
@@ -126,8 +148,9 @@ public:
void addIRPasses() override;
void addCodeGenPrepare() override;
- virtual bool addPreISel() override;
- virtual bool addInstSelector() override;
+ bool addPreISel() override;
+ bool addInstSelector() override;
+ bool addGCPasses() override;
};
class R600PassConfig : public AMDGPUPassConfig {
@@ -147,6 +170,8 @@ public:
: AMDGPUPassConfig(TM, PM) { }
bool addPreISel() override;
bool addInstSelector() override;
+ void addFastRegAlloc(FunctionPass *RegAllocPass) override;
+ void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
void addPreRegAlloc() override;
void addPostRegAlloc() override;
void addPreSched2() override;
@@ -156,7 +181,7 @@ public:
} // End of anonymous namespace
TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](Function &F) {
+ return TargetIRAnalysis([this](const Function &F) {
return TargetTransformInfo(
AMDGPUTTIImpl(this, F.getParent()->getDataLayout()));
});
@@ -172,6 +197,10 @@ void AMDGPUPassConfig::addIRPasses() {
// functions, then we will generate code for the first function
// without ever running any passes on the second.
addPass(createBarrierNoopPass());
+
+ // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
+ addPass(createAMDGPUOpenCLImageTypeLoweringPass());
+
TargetPassConfig::addIRPasses();
}
@@ -198,6 +227,11 @@ bool AMDGPUPassConfig::addInstSelector() {
return false;
}
+bool AMDGPUPassConfig::addGCPasses() {
+ // Do nothing. GC is not supported.
+ return false;
+}
+
//===----------------------------------------------------------------------===//
// R600 Pass Setup
//===----------------------------------------------------------------------===//
@@ -238,16 +272,23 @@ TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
bool GCNPassConfig::addPreISel() {
AMDGPUPassConfig::addPreISel();
+
+ // FIXME: We need to run a pass to propagate the attributes when calls are
+ // supported.
+ addPass(&AMDGPUAnnotateKernelFeaturesID);
+
addPass(createSinkingPass());
addPass(createSITypeRewriter());
addPass(createSIAnnotateControlFlowPass());
+ addPass(createAMDGPUAnnotateUniformValues());
+
return false;
}
bool GCNPassConfig::addInstSelector() {
AMDGPUPassConfig::addInstSelector();
addPass(createSILowerI1CopiesPass());
- addPass(createSIFixSGPRCopiesPass(*TM));
+ addPass(&SIFixSGPRCopiesID);
addPass(createSIFoldOperandsPass());
return false;
}
@@ -259,7 +300,6 @@ void GCNPassConfig::addPreRegAlloc() {
// earlier passes might recompute live intervals.
// TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass
if (getOptLevel() > CodeGenOpt::None) {
- initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry());
insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
}
@@ -269,16 +309,27 @@ void GCNPassConfig::addPreRegAlloc() {
// This should be run after scheduling, but before register allocation. It
// also needs extra copies to the address operand to be eliminated.
- initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
insertPass(&MachineSchedulerID, &RegisterCoalescerID);
}
addPass(createSIShrinkInstructionsPass(), false);
- addPass(createSIFixSGPRLiveRangesPass(), false);
+}
+
+void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
+ addPass(&SIFixSGPRLiveRangesID);
+ TargetPassConfig::addFastRegAlloc(RegAllocPass);
+}
+
+void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
+ // We want to run this after LiveVariables is computed to avoid computing them
+ // twice.
+ // FIXME: We shouldn't disable the verifier here. r249087 introduced a failure
+ // that needs to be fixed.
+ insertPass(&LiveVariablesID, &SIFixSGPRLiveRangesID, /*VerifyAfter=*/false);
+ TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}
void GCNPassConfig::addPostRegAlloc() {
- addPass(createSIPrepareScratchRegs(), false);
addPass(createSIShrinkInstructionsPass(), false);
}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 14792e347a7a..236e3f824030 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -32,7 +32,7 @@ class AMDGPUTargetMachine : public LLVMTargetMachine {
private:
protected:
- TargetLoweringObjectFile *TLOF;
+ std::unique_ptr<TargetLoweringObjectFile> TLOF;
AMDGPUSubtarget Subtarget;
AMDGPUIntrinsicInfo IntrinsicInfo;
@@ -52,7 +52,7 @@ public:
TargetIRAnalysis getTargetIRAnalysis() override;
TargetLoweringObjectFile *getObjFileLowering() const override {
- return TLOF;
+ return TLOF.get();
}
};
diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
new file mode 100644
index 000000000000..e050f21091ba
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
@@ -0,0 +1,87 @@
+//===-- AMDGPUTargetObjectFile.cpp - AMDGPU Object Files ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetObjectFile.h"
+#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/ELF.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Generic Object File
+//===----------------------------------------------------------------------===//
+
+MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
+ SectionKind Kind,
+ Mangler &Mang,
+ const TargetMachine &TM) const {
+ if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GV))
+ return TextSection;
+
+ return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang, TM);
+}
+
+//===----------------------------------------------------------------------===//
+// HSA Object File
+//===----------------------------------------------------------------------===//
+
+
+void AMDGPUHSATargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM){
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+
+ TextSection = AMDGPU::getHSATextSection(Ctx);
+
+ DataGlobalAgentSection = AMDGPU::getHSADataGlobalAgentSection(Ctx);
+ DataGlobalProgramSection = AMDGPU::getHSADataGlobalProgramSection(Ctx);
+
+ RodataReadonlyAgentSection = AMDGPU::getHSARodataReadonlyAgentSection(Ctx);
+}
+
+bool AMDGPUHSATargetObjectFile::isAgentAllocationSection(
+ const char *SectionName) const {
+ return cast<MCSectionELF>(DataGlobalAgentSection)
+ ->getSectionName()
+ .equals(SectionName);
+}
+
+bool AMDGPUHSATargetObjectFile::isAgentAllocation(const GlobalValue *GV) const {
+ // Read-only segments can only have agent allocation.
+ return AMDGPU::isReadOnlySegment(GV) ||
+ (AMDGPU::isGlobalSegment(GV) && GV->hasSection() &&
+ isAgentAllocationSection(GV->getSection()));
+}
+
+bool AMDGPUHSATargetObjectFile::isProgramAllocation(
+ const GlobalValue *GV) const {
+ // The default for global segments is program allocation.
+ return AMDGPU::isGlobalSegment(GV) && !isAgentAllocation(GV);
+}
+
+MCSection *AMDGPUHSATargetObjectFile::SelectSectionForGlobal(
+ const GlobalValue *GV, SectionKind Kind,
+ Mangler &Mang,
+ const TargetMachine &TM) const {
+ if (Kind.isText() && !GV->hasComdat())
+ return getTextSection();
+
+ if (AMDGPU::isGlobalSegment(GV)) {
+ if (isAgentAllocation(GV))
+ return DataGlobalAgentSection;
+
+ if (isProgramAllocation(GV))
+ return DataGlobalProgramSection;
+ }
+
+ return AMDGPUTargetObjectFile::SelectSectionForGlobal(GV, Kind, Mang, TM);
+}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
new file mode 100644
index 000000000000..921341ebb897
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
@@ -0,0 +1,51 @@
+//===-- AMDGPUTargetObjectFile.h - AMDGPU Object Info ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares the AMDGPU-specific subclass of
+/// TargetLoweringObjectFile.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class AMDGPUTargetObjectFile : public TargetLoweringObjectFileELF {
+ public:
+ MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+ Mangler &Mang,
+ const TargetMachine &TM) const override;
+};
+
+class AMDGPUHSATargetObjectFile final : public AMDGPUTargetObjectFile {
+private:
+ MCSection *DataGlobalAgentSection;
+ MCSection *DataGlobalProgramSection;
+ MCSection *RodataReadonlyAgentSection;
+
+ bool isAgentAllocationSection(const char *SectionName) const;
+ bool isAgentAllocation(const GlobalValue *GV) const;
+ bool isProgramAllocation(const GlobalValue *GV) const;
+
+public:
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
+ MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+ Mangler &Mang,
+ const TargetMachine &TM) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 6dacc742b129..54a003d6a9cf 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -74,9 +74,109 @@ unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) {
return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}
-unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool) { return 32; }
+unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) {
+ return Vector ? 0 : 32;
+}
unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
// Semi-arbitrary large amount.
return 64;
}
+
+unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) {
+ // XXX - For some reason this isn't called for switch.
+ switch (Opcode) {
+ case Instruction::Br:
+ case Instruction::Ret:
+ return 10;
+ default:
+ return BaseT::getCFInstrCost(Opcode);
+ }
+}
+
+int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+ unsigned Index) {
+ switch (Opcode) {
+ case Instruction::ExtractElement:
+ // Dynamic indexing isn't free and is best avoided.
+ return Index == ~0u ? 2 : 0;
+ default:
+ return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+ }
+}
+
+static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
+ const IntrinsicInst *I) {
+ switch (I->getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::not_intrinsic:
+ // This means we have an intrinsic that isn't defined in
+ // IntrinsicsAMDGPU.td
+ break;
+
+ case Intrinsic::amdgcn_interp_p1:
+ case Intrinsic::amdgcn_interp_p2:
+ case Intrinsic::amdgcn_mbcnt_hi:
+ case Intrinsic::amdgcn_mbcnt_lo:
+ case Intrinsic::r600_read_tidig_x:
+ case Intrinsic::r600_read_tidig_y:
+ case Intrinsic::r600_read_tidig_z:
+ return true;
+ }
+
+ StringRef Name = I->getCalledFunction()->getName();
+ switch (TII->lookupName((const char *)Name.bytes_begin(), Name.size())) {
+ default:
+ return false;
+ case AMDGPUIntrinsic::SI_tid:
+ case AMDGPUIntrinsic::SI_fs_interp:
+ return true;
+ }
+}
+
+static bool isArgPassedInSGPR(const Argument *A) {
+ const Function *F = A->getParent();
+ unsigned ShaderType = AMDGPU::getShaderType(*F);
+
+ // Arguments to compute shaders are never a source of divergence.
+ if (ShaderType == ShaderType::COMPUTE)
+ return true;
+
+ // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
+ if (F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::InReg) ||
+ F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::ByVal))
+ return true;
+
+ // Everything else is in VGPRs.
+ return false;
+}
+
+///
+/// \returns true if the result of the value could potentially be
+/// different across workitems in a wavefront.
+bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
+
+ if (const Argument *A = dyn_cast<Argument>(V))
+ return !isArgPassedInSGPR(A);
+
+ // Loads from the private address space are divergent, because threads
+ // can execute the load instruction with the same inputs and get different
+ // results.
+ //
+ // All other loads are not divergent, because if threads issue loads with the
+ // same arguments, they will always get the same result.
+ if (const LoadInst *Load = dyn_cast<LoadInst>(V))
+ return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
+
+ if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
+ const TargetMachine &TM = getTLI()->getTargetMachine();
+ return isIntrinsicSourceOfDivergence(TM.getIntrinsicInfo(), Intrinsic);
+ }
+
+ // Assume all function calls are a source of divergence.
+ if (isa<CallInst>(V) || isa<InvokeInst>(V))
+ return true;
+
+ return false;
+}
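For illustration only, the toy classifier below (plain C++, not the LLVM implementation) restates the divergence rules encoded above: SGPR-resident arguments and non-private loads are uniform, while private-address-space loads, VGPR arguments, and calls are treated as potentially divergent.

#include <iostream>

enum class ValueKind { ComputeShaderArg, InRegArg, PlainArg, PrivateLoad,
                       GlobalLoad, Call, Other };

bool isSourceOfDivergence(ValueKind K) {
  switch (K) {
  case ValueKind::ComputeShaderArg: // compute arguments live in SGPRs
  case ValueKind::InRegArg:         // inreg/byval graphics inputs are SGPRs
  case ValueKind::GlobalLoad:       // same address -> same result in every lane
  case ValueKind::Other:
    return false;
  case ValueKind::PlainArg:         // non-compute arguments arrive in VGPRs
  case ValueKind::PrivateLoad:      // per-lane scratch can differ per thread
  case ValueKind::Call:             // conservatively assume divergence
    return true;
  }
  return false;
}

int main() {
  std::cout << isSourceOfDivergence(ValueKind::PrivateLoad) << " "
            << isSourceOfDivergence(ValueKind::GlobalLoad) << "\n"; // prints: 1 0
  return 0;
}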
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index dee0a69d1e68..976afb03443b 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -60,6 +60,11 @@ public:
unsigned getNumberOfRegisters(bool Vector);
unsigned getRegisterBitWidth(bool Vector);
unsigned getMaxInterleaveFactor(unsigned VF);
+
+ unsigned getCFInstrCost(unsigned Opcode);
+
+ int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
+ bool isSourceOfDivergence(const Value *V) const;
};
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index d918ac3a5b3b..917efd149e00 100644
--- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -185,7 +185,7 @@ protected:
MachinePostDominatorTree *PDT;
MachineLoopInfo *MLI;
const R600InstrInfo *TII;
- const AMDGPURegisterInfo *TRI;
+ const R600RegisterInfo *TRI;
// PRINT FUNCTIONS
/// Print the ordered Blocks.
@@ -881,7 +881,7 @@ bool AMDGPUCFGStructurizer::run() {
} //while, "one iteration" over the function.
MachineBasicBlock *EntryMBB =
- GraphTraits<MachineFunction *>::nodes_begin(FuncRep);
+ &*GraphTraits<MachineFunction *>::nodes_begin(FuncRep);
if (EntryMBB->succ_size() == 0) {
Finish = true;
DEBUG(
@@ -904,7 +904,7 @@ bool AMDGPUCFGStructurizer::run() {
} while (!Finish && MakeProgress);
// Misc wrap up to maintain the consistency of the Function representation.
- wrapup(GraphTraits<MachineFunction *>::nodes_begin(FuncRep));
+ wrapup(&*GraphTraits<MachineFunction *>::nodes_begin(FuncRep));
// Detach retired Block, release memory.
for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end();
@@ -1164,7 +1164,7 @@ int AMDGPUCFGStructurizer::loopcontPatternMatch(MachineLoop *LoopRep,
for (SmallVectorImpl<MachineBasicBlock *>::iterator It = ContMBB.begin(),
E = ContMBB.end(); It != E; ++It) {
- (*It)->removeSuccessor(LoopHeader);
+ (*It)->removeSuccessor(LoopHeader, true);
}
numLoopcontPatternMatch += NumCont;
@@ -1353,7 +1353,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
// If MigrateTrue is true, then TrueBB is the block being "branched into"
// and if MigrateFalse is true, then FalseBB is the block being
// "branched into"
- //
+ //
// Here is the pseudo code for how I think the optimization should work:
// 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head.
// 2. Insert MOV GPR0, 1 before the branch instruction in branch_from.
@@ -1372,7 +1372,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
// the late machine optimization passes, however if we implement
// bool TargetRegisterInfo::requiresRegisterScavenging(
// const MachineFunction &MF)
- // and have it return true, liveness will be tracked correctly
+ // and have it return true, liveness will be tracked correctly
// by generic optimization passes. We will also need to make sure that
// all of our target-specific passes that run after regalloc and before
// the CFGStructurizer track liveness and we will need to modify this pass
@@ -1487,7 +1487,7 @@ void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB,
);
DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end());
- DstMBB->removeSuccessor(SrcMBB);
+ DstMBB->removeSuccessor(SrcMBB, true);
cloneSuccessorList(DstMBB, SrcMBB);
removeSuccessor(SrcMBB);
@@ -1537,9 +1537,9 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
if (TrueMBB) {
MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end());
- MBB->removeSuccessor(TrueMBB);
+ MBB->removeSuccessor(TrueMBB, true);
if (LandMBB && TrueMBB->succ_size()!=0)
- TrueMBB->removeSuccessor(LandMBB);
+ TrueMBB->removeSuccessor(LandMBB, true);
retireBlock(TrueMBB);
MLI->removeBlock(TrueMBB);
}
@@ -1548,9 +1548,9 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
insertInstrBefore(I, AMDGPU::ELSE);
MBB->splice(I, FalseMBB, FalseMBB->begin(),
FalseMBB->end());
- MBB->removeSuccessor(FalseMBB);
+ MBB->removeSuccessor(FalseMBB, true);
if (LandMBB && FalseMBB->succ_size() != 0)
- FalseMBB->removeSuccessor(LandMBB);
+ FalseMBB->removeSuccessor(LandMBB, true);
retireBlock(FalseMBB);
MLI->removeBlock(FalseMBB);
}
@@ -1570,8 +1570,7 @@ void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc());
insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc());
- DstBlk->addSuccessor(LandMBB);
- DstBlk->removeSuccessor(DstBlk);
+ DstBlk->replaceSuccessor(DstBlk, LandMBB);
}
@@ -1592,7 +1591,7 @@ void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
//now branchInst can be erased safely
BranchMI->eraseFromParent();
//now take care of successors, retire blocks
- ExitingMBB->removeSuccessor(LandMBB);
+ ExitingMBB->removeSuccessor(LandMBB, true);
}
void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
@@ -1666,8 +1665,7 @@ AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB);
//srcBlk, oldBlk, newBlk
- PredMBB->removeSuccessor(MBB);
- PredMBB->addSuccessor(CloneMBB);
+ PredMBB->replaceSuccessor(MBB, CloneMBB);
// add all successor to cloneBlk
cloneSuccessorList(CloneMBB, MBB);
@@ -1695,10 +1693,7 @@ void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
);
SpliceEnd = SrcMBB->end();
} else {
- DEBUG(
- dbgs() << "migrateInstruction see branch instr\n" ;
- BranchMI->dump();
- );
+ DEBUG(dbgs() << "migrateInstruction see branch instr: " << *BranchMI);
SpliceEnd = BranchMI;
}
DEBUG(
@@ -1711,7 +1706,7 @@ void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
DEBUG(
dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size()
- << "srcSize = " << SrcMBB->size() << "\n";
+ << "srcSize = " << SrcMBB->size() << '\n';
);
}
@@ -1743,7 +1738,7 @@ void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) {
// test_fc_do_while_or.c need to fix the upstream on this to remove the loop.
while ((BranchMI = getLoopendBlockBranchInstr(MBB))
&& isUncondBranch(BranchMI)) {
- DEBUG(dbgs() << "Removing uncond branch instr"; BranchMI->dump(););
+ DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI);
BranchMI->eraseFromParent();
}
}
@@ -1759,10 +1754,10 @@ void AMDGPUCFGStructurizer::removeRedundantConditionalBranch(
MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
assert(BranchMI && isCondBranch(BranchMI));
- DEBUG(dbgs() << "Removing unneeded cond branch instr"; BranchMI->dump(););
+ DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI);
BranchMI->eraseFromParent();
SHOWNEWBLK(MBB1, "Removing redundant successor");
- MBB->removeSuccessor(MBB1);
+ MBB->removeSuccessor(MBB1, true);
}
void AMDGPUCFGStructurizer::addDummyExitBlock(
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 2018983bc306..d9f753f40133 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -28,7 +28,9 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/ELF.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
@@ -83,6 +85,7 @@ public:
unsigned RegNo;
int Modifiers;
const MCRegisterInfo *TRI;
+ const MCSubtargetInfo *STI;
bool IsForcedVOP3;
};
@@ -102,7 +105,7 @@ public:
}
void addRegOperands(MCInst &Inst, unsigned N) const {
- Inst.addOperand(MCOperand::createReg(getReg()));
+ Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), *Reg.STI)));
}
void addRegOrImmOperands(MCInst &Inst, unsigned N) const {
@@ -215,6 +218,10 @@ public:
(isReg() && isRegClass(AMDGPU::SReg_64RegClassID));
}
+ bool isSCSrc64() const {
+ return (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)) || isInlineImm();
+ }
+
bool isVCSrc32() const {
return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID));
}
@@ -251,7 +258,22 @@ public:
return EndLoc;
}
- void print(raw_ostream &OS) const override { }
+ void print(raw_ostream &OS) const override {
+ switch (Kind) {
+ case Register:
+ OS << "<register " << getReg() << " mods: " << Reg.Modifiers << '>';
+ break;
+ case Immediate:
+ OS << getImm();
+ break;
+ case Token:
+ OS << '\'' << getToken() << '\'';
+ break;
+ case Expression:
+ OS << "<expr " << *Expr << '>';
+ break;
+ }
+ }
static std::unique_ptr<AMDGPUOperand> CreateImm(int64_t Val, SMLoc Loc,
enum ImmTy Type = ImmTyNone,
@@ -278,10 +300,12 @@ public:
static std::unique_ptr<AMDGPUOperand> CreateReg(unsigned RegNo, SMLoc S,
SMLoc E,
const MCRegisterInfo *TRI,
+ const MCSubtargetInfo *STI,
bool ForceVOP3) {
auto Op = llvm::make_unique<AMDGPUOperand>(Register);
Op->Reg.RegNo = RegNo;
Op->Reg.TRI = TRI;
+ Op->Reg.STI = STI;
Op->Reg.Modifiers = -1;
Op->Reg.IsForcedVOP3 = ForceVOP3;
Op->StartLoc = S;
@@ -301,14 +325,32 @@ public:
bool isDSOffset01() const;
bool isSWaitCnt() const;
bool isMubufOffset() const;
+ bool isSMRDOffset() const;
+ bool isSMRDLiteralOffset() const;
};
class AMDGPUAsmParser : public MCTargetAsmParser {
- MCSubtargetInfo &STI;
const MCInstrInfo &MII;
MCAsmParser &Parser;
unsigned ForcedEncodingSize;
+
+ bool isSI() const {
+ return AMDGPU::isSI(getSTI());
+ }
+
+ bool isCI() const {
+ return AMDGPU::isCI(getSTI());
+ }
+
+ bool isVI() const {
+ return AMDGPU::isVI(getSTI());
+ }
+
+ bool hasSGPR102_SGPR103() const {
+ return !isVI();
+ }
+
/// @name Auto-generated Match Functions
/// {
@@ -323,20 +365,34 @@ private:
bool ParseDirectiveHSACodeObjectISA();
bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header);
bool ParseDirectiveAMDKernelCodeT();
+ bool ParseSectionDirectiveHSAText();
+ bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) const;
+ bool ParseDirectiveAMDGPUHsaKernel();
+ bool ParseDirectiveAMDGPUHsaModuleGlobal();
+ bool ParseDirectiveAMDGPUHsaProgramGlobal();
+ bool ParseSectionDirectiveHSADataGlobalAgent();
+ bool ParseSectionDirectiveHSADataGlobalProgram();
+ bool ParseSectionDirectiveHSARodataReadonlyAgent();
public:
- AMDGPUAsmParser(MCSubtargetInfo &STI, MCAsmParser &_Parser,
+public:
+ enum AMDGPUMatchResultTy {
+ Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY
+ };
+
+ AMDGPUAsmParser(const MCSubtargetInfo &STI, MCAsmParser &_Parser,
const MCInstrInfo &MII,
const MCTargetOptions &Options)
- : MCTargetAsmParser(), STI(STI), MII(MII), Parser(_Parser),
- ForcedEncodingSize(0){
+ : MCTargetAsmParser(Options, STI), MII(MII), Parser(_Parser),
+ ForcedEncodingSize(0) {
+ MCAsmParserExtension::Initialize(Parser);
- if (STI.getFeatureBits().none()) {
+ if (getSTI().getFeatureBits().none()) {
// Set default features.
- STI.ToggleFeature("SOUTHERN_ISLANDS");
+ copySTI().ToggleFeature("SOUTHERN_ISLANDS");
}
- setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+ setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
}
AMDGPUTargetStreamer &getTargetStreamer() {
@@ -420,10 +476,10 @@ struct OptionalOperand {
}
-static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) {
+static int getRegClass(bool IsVgpr, unsigned RegWidth) {
if (IsVgpr) {
switch (RegWidth) {
- default: llvm_unreachable("Unknown register width");
+ default: return -1;
case 1: return AMDGPU::VGPR_32RegClassID;
case 2: return AMDGPU::VReg_64RegClassID;
case 3: return AMDGPU::VReg_96RegClassID;
@@ -434,7 +490,7 @@ static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) {
}
switch (RegWidth) {
- default: llvm_unreachable("Unknown register width");
+ default: return -1;
case 1: return AMDGPU::SGPR_32RegClassID;
case 2: return AMDGPU::SGPR_64RegClassID;
case 4: return AMDGPU::SReg_128RegClassID;
@@ -443,16 +499,16 @@ static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) {
}
}
-static unsigned getRegForName(const StringRef &RegName) {
+static unsigned getRegForName(StringRef RegName) {
return StringSwitch<unsigned>(RegName)
.Case("exec", AMDGPU::EXEC)
.Case("vcc", AMDGPU::VCC)
- .Case("flat_scr", AMDGPU::FLAT_SCR)
+ .Case("flat_scratch", AMDGPU::FLAT_SCR)
.Case("m0", AMDGPU::M0)
.Case("scc", AMDGPU::SCC)
- .Case("flat_scr_lo", AMDGPU::FLAT_SCR_LO)
- .Case("flat_scr_hi", AMDGPU::FLAT_SCR_HI)
+ .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
+ .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
.Case("vcc_lo", AMDGPU::VCC_LO)
.Case("vcc_hi", AMDGPU::VCC_HI)
.Case("exec_lo", AMDGPU::EXEC_LO)
@@ -464,12 +520,14 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &End
const AsmToken Tok = Parser.getTok();
StartLoc = Tok.getLoc();
EndLoc = Tok.getEndLoc();
- const StringRef &RegName = Tok.getString();
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+
+ StringRef RegName = Tok.getString();
RegNo = getRegForName(RegName);
if (RegNo) {
Parser.Lex();
- return false;
+ return !subtargetHasRegister(*TRI, RegNo);
}
// Match vgprs and sgprs
@@ -514,16 +572,24 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &End
RegIndexInClass = RegLo;
} else {
// SGPR registers are aligned. Max alignment is 4 dwords.
- RegIndexInClass = RegLo / std::min(RegWidth, 4u);
+ unsigned Size = std::min(RegWidth, 4u);
+ if (RegLo % Size != 0)
+ return true;
+
+ RegIndexInClass = RegLo / Size;
}
}
- const MCRegisterInfo *TRC = getContext().getRegisterInfo();
- unsigned RC = getRegClass(IsVgpr, RegWidth);
- if (RegIndexInClass > TRC->getRegClass(RC).getNumRegs())
+ int RCID = getRegClass(IsVgpr, RegWidth);
+ if (RCID == -1)
return true;
- RegNo = TRC->getRegClass(RC).getRegister(RegIndexInClass);
- return false;
+
+ const MCRegisterClass RC = TRI->getRegClass(RCID);
+ if (RegIndexInClass >= RC.getNumRegs())
+ return true;
+
+ RegNo = RC.getRegister(RegIndexInClass);
+ return !subtargetHasRegister(*TRI, RegNo);
}
unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
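The alignment handling added above rejects SGPR ranges whose low register is not aligned to min(width, 4) dwords before computing the index into the register class. A standalone sketch of that computation (illustrative helper name, not part of the parser):

#include <algorithm>
#include <cstdio>

// Returns -1 for an unaligned range, otherwise the index into the
// width-specific register class.
int sgprIndexInClass(unsigned RegLo, unsigned RegWidth) {
  unsigned Size = std::min(RegWidth, 4u);
  if (RegLo % Size != 0)
    return -1;
  return static_cast<int>(RegLo / Size);
}

int main() {
  std::printf("s[4:7] -> %d\n", sgprIndexInClass(4, 4)); // aligned, index 1
  std::printf("s[2:5] -> %d\n", sgprIndexInClass(2, 4)); // unaligned, rejected
  std::printf("s[6:7] -> %d\n", sgprIndexInClass(6, 2)); // pairs align to 2, index 3
  return 0;
}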
@@ -534,6 +600,11 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
(getForcedEncodingSize() == 64 && !(TSFlags & SIInstrFlags::VOP3)))
return Match_InvalidOperand;
+ if ((TSFlags & SIInstrFlags::VOP3) &&
+ (TSFlags & SIInstrFlags::VOPAsmPrefer32Bit) &&
+ getForcedEncodingSize() != 64)
+ return Match_PreferE32;
+
return Match_Success;
}
@@ -549,7 +620,7 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
default: break;
case Match_Success:
Inst.setLoc(IDLoc);
- Out.EmitInstruction(Inst, STI);
+ Out.EmitInstruction(Inst, getSTI());
return false;
case Match_MissingFeature:
return Error(IDLoc, "instruction not supported on this GPU");
@@ -592,6 +663,9 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
}
return Error(ErrorLoc, "invalid operand for instruction");
}
+ case Match_PreferE32:
+ return Error(IDLoc, "internal error: instruction without _e64 suffix "
+ "should be encoded as e32");
}
llvm_unreachable("Implement any new match types added!");
}
@@ -640,7 +714,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
// If this directive has no arguments, then use the ISA version for the
// targeted GPU.
if (getLexer().is(AsmToken::EndOfStatement)) {
- AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(STI.getFeatureBits());
+ AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits());
getTargetStreamer().EmitDirectiveHSACodeObjectISA(Isa.Major, Isa.Minor,
Isa.Stepping,
"AMD", "AMDGPU");
@@ -852,7 +926,7 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() {
amd_kernel_code_t Header;
- AMDGPU::initDefaultAMDKernelCodeT(Header, STI.getFeatureBits());
+ AMDGPU::initDefaultAMDKernelCodeT(Header, getSTI().getFeatureBits());
while (true) {
@@ -882,6 +956,64 @@ bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() {
return false;
}
+bool AMDGPUAsmParser::ParseSectionDirectiveHSAText() {
+ getParser().getStreamer().SwitchSection(
+ AMDGPU::getHSATextSection(getContext()));
+ return false;
+}
+
+bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() {
+ if (getLexer().isNot(AsmToken::Identifier))
+ return TokError("expected symbol name");
+
+ StringRef KernelName = Parser.getTok().getString();
+
+ getTargetStreamer().EmitAMDGPUSymbolType(KernelName,
+ ELF::STT_AMDGPU_HSA_KERNEL);
+ Lex();
+ return false;
+}
+
+bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaModuleGlobal() {
+ if (getLexer().isNot(AsmToken::Identifier))
+ return TokError("expected symbol name");
+
+ StringRef GlobalName = Parser.getTok().getIdentifier();
+
+ getTargetStreamer().EmitAMDGPUHsaModuleScopeGlobal(GlobalName);
+ Lex();
+ return false;
+}
+
+bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaProgramGlobal() {
+ if (getLexer().isNot(AsmToken::Identifier))
+ return TokError("expected symbol name");
+
+ StringRef GlobalName = Parser.getTok().getIdentifier();
+
+ getTargetStreamer().EmitAMDGPUHsaProgramScopeGlobal(GlobalName);
+ Lex();
+ return false;
+}
+
+bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalAgent() {
+ getParser().getStreamer().SwitchSection(
+ AMDGPU::getHSADataGlobalAgentSection(getContext()));
+ return false;
+}
+
+bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalProgram() {
+ getParser().getStreamer().SwitchSection(
+ AMDGPU::getHSADataGlobalProgramSection(getContext()));
+ return false;
+}
+
+bool AMDGPUAsmParser::ParseSectionDirectiveHSARodataReadonlyAgent() {
+ getParser().getStreamer().SwitchSection(
+ AMDGPU::getHSARodataReadonlyAgentSection(getContext()));
+ return false;
+}
+
bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getString();
@@ -894,6 +1026,55 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
if (IDVal == ".amd_kernel_code_t")
return ParseDirectiveAMDKernelCodeT();
+ if (IDVal == ".hsatext" || IDVal == ".text")
+ return ParseSectionDirectiveHSAText();
+
+ if (IDVal == ".amdgpu_hsa_kernel")
+ return ParseDirectiveAMDGPUHsaKernel();
+
+ if (IDVal == ".amdgpu_hsa_module_global")
+ return ParseDirectiveAMDGPUHsaModuleGlobal();
+
+ if (IDVal == ".amdgpu_hsa_program_global")
+ return ParseDirectiveAMDGPUHsaProgramGlobal();
+
+ if (IDVal == ".hsadata_global_agent")
+ return ParseSectionDirectiveHSADataGlobalAgent();
+
+ if (IDVal == ".hsadata_global_program")
+ return ParseSectionDirectiveHSADataGlobalProgram();
+
+ if (IDVal == ".hsarodata_readonly_agent")
+ return ParseSectionDirectiveHSARodataReadonlyAgent();
+
+ return true;
+}
+
+bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
+ unsigned RegNo) const {
+ if (isCI())
+ return true;
+
+ if (isSI()) {
+ // No flat_scr
+ switch (RegNo) {
+ case AMDGPU::FLAT_SCR:
+ case AMDGPU::FLAT_SCR_LO:
+ case AMDGPU::FLAT_SCR_HI:
+ return false;
+ default:
+ return true;
+ }
+ }
+
+ // VI only has 102 SGPRs, so make sure we aren't trying to use the 2 more that
+ // SI/CI have.
+ for (MCRegAliasIterator R(AMDGPU::SGPR102_SGPR103, &MRI, true);
+ R.isValid(); ++R) {
+ if (*R == RegNo)
+ return false;
+ }
+
return true;
}
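A compact restatement of the subtarget filtering above, as a standalone sketch (the enums are placeholders, not the generated MC register enums): CI accepts everything, SI lacks the flat_scratch registers, and VI lacks the top two SGPRs.

#include <cstdio>

enum class Gen { SI, CI, VI };
enum class Reg { FLAT_SCR, FLAT_SCR_LO, FLAT_SCR_HI, SGPR102, SGPR103, SGPR0 };

bool subtargetHasRegister(Gen G, Reg R) {
  if (G == Gen::CI)
    return true;
  if (G == Gen::SI)
    return R != Reg::FLAT_SCR && R != Reg::FLAT_SCR_LO && R != Reg::FLAT_SCR_HI;
  // VI: everything except the two SGPRs that only SI/CI expose.
  return R != Reg::SGPR102 && R != Reg::SGPR103;
}

int main() {
  std::printf("SI flat_scratch: %d\n", subtargetHasRegister(Gen::SI, Reg::FLAT_SCR)); // 0
  std::printf("VI s102:         %d\n", subtargetHasRegister(Gen::VI, Reg::SGPR102));  // 0
  std::printf("CI s102:         %d\n", subtargetHasRegister(Gen::CI, Reg::SGPR102));  // 1
  return 0;
}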
@@ -943,13 +1124,11 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
int64_t IntVal;
if (getParser().parseAbsoluteExpression(IntVal))
return MatchOperand_ParseFail;
- APInt IntVal32(32, IntVal);
- if (IntVal32.getSExtValue() != IntVal) {
+ if (!isInt<32>(IntVal) && !isUInt<32>(IntVal)) {
Error(S, "invalid immediate: only 32-bit values are legal");
return MatchOperand_ParseFail;
}
- IntVal = IntVal32.getSExtValue();
if (Negate)
IntVal *= -1;
Operands.push_back(AMDGPUOperand::CreateImm(IntVal, S));
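The new check accepts a parsed 64-bit value only if it fits in 32 bits as either a signed or an unsigned quantity, instead of round-tripping it through a 32-bit APInt. A standalone equivalent for reference (not the parser code itself):

#include <cstdint>
#include <cstdio>

bool fitsIn32(int64_t V) {
  bool SignedOK   = V >= INT32_MIN && V <= INT32_MAX;
  bool UnsignedOK = V >= 0 && V <= int64_t(UINT32_MAX);
  return SignedOK || UnsignedOK;
}

int main() {
  for (int64_t V : {int64_t(-1), int64_t(0x7FFFFFFF), int64_t(0xFFFFFFFF),
                    int64_t(0x100000000)})
    std::printf("%lld -> %s\n", (long long)V, fitsIn32(V) ? "ok" : "rejected");
  // 0x100000000 is the only value rejected here.
  return 0;
}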
@@ -1002,7 +1181,7 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
Operands.push_back(AMDGPUOperand::CreateReg(
- RegNo, S, E, getContext().getRegisterInfo(),
+ RegNo, S, E, getContext().getRegisterInfo(), &getSTI(),
isForcedVOP3()));
if (HasModifiers || Modifiers) {
@@ -1571,6 +1750,23 @@ AMDGPUAsmParser::parseR128(OperandVector &Operands) {
}
//===----------------------------------------------------------------------===//
+// smrd
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUOperand::isSMRDOffset() const {
+
+ // FIXME: Support 20-bit offsets on VI. We need to pass subtarget
+ // information here.
+ return isImm() && isUInt<8>(getImm());
+}
+
+bool AMDGPUOperand::isSMRDLiteralOffset() const {
+ // 32-bit literals are only supported on CI and we only want to use them
+ // when the offset is > 8-bits.
+ return isImm() && !isUInt<8>(getImm()) && isUInt<32>(getImm());
+}
+
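The two predicates above split SMRD offsets into the 8-bit immediate form and the CI-only 32-bit literal form used when the value does not fit in 8 bits. A standalone sketch of the same classification (illustrative function names):

#include <cstdint>
#include <cstdio>

bool isSMRDImmOffset(uint64_t Imm)     { return Imm <= 0xFFu; }
bool isSMRDLiteralOffset(uint64_t Imm) { return Imm > 0xFFu && Imm <= 0xFFFFFFFFu; }

int main() {
  for (uint64_t Imm : {0x10ull, 0xFFull, 0x100ull, 0x12345ull})
    std::printf("0x%llx: imm=%d literal=%d\n", (unsigned long long)Imm,
                isSMRDImmOffset(Imm), isSMRDLiteralOffset(Imm));
  // 0x10 and 0xFF use the immediate form; 0x100 and 0x12345 need the literal.
  return 0;
}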
+//===----------------------------------------------------------------------===//
// vop3
//===----------------------------------------------------------------------===//
@@ -1653,8 +1849,12 @@ AMDGPUAsmParser::parseVOP3OptionalOps(OperandVector &Operands) {
}
void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
- ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1);
- unsigned i = 2;
+
+ unsigned i = 1;
+ const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+ if (Desc.getNumDefs() > 0) {
+ ((AMDGPUOperand &)*Operands[i++]).addRegOperands(Inst, 1);
+ }
std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx;
diff --git a/lib/Target/AMDGPU/CIInstructions.td b/lib/Target/AMDGPU/CIInstructions.td
index 2f5fdbe92078..88a090d3df35 100644
--- a/lib/Target/AMDGPU/CIInstructions.td
+++ b/lib/Target/AMDGPU/CIInstructions.td
@@ -8,6 +8,22 @@
//===----------------------------------------------------------------------===//
// Instruction definitions for CI and newer.
//===----------------------------------------------------------------------===//
+// Remaining instructions:
+// S_CBRANCH_CDBGUSER
+// S_CBRANCH_CDBGSYS
+// S_CBRANCH_CDBGSYS_OR_USER
+// S_CBRANCH_CDBGSYS_AND_USER
+// DS_NOP
+// DS_GWS_SEMA_RELEASE_ALL
+// DS_WRAP_RTN_B32
+// DS_CNDXCHG32_RTN_B64
+// DS_WRITE_B96
+// DS_WRITE_B128
+// DS_CONDXCHG32_RTN_B128
+// DS_READ_B96
+// DS_READ_B128
+// BUFFER_LOAD_DWORDX3
+// BUFFER_STORE_DWORDX3
def isCIVI : Predicate <
@@ -23,6 +39,7 @@ def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">;
let SubtargetPredicate = isCIVI in {
+let SchedRW = [WriteDoubleAdd] in {
defm V_TRUNC_F64 : VOP1Inst <vop1<0x17>, "v_trunc_f64",
VOP_F64_F64, ftrunc
>;
@@ -35,82 +52,218 @@ defm V_FLOOR_F64 : VOP1Inst <vop1<0x1A>, "v_floor_f64",
defm V_RNDNE_F64 : VOP1Inst <vop1<0x19>, "v_rndne_f64",
VOP_F64_F64, frint
>;
+} // End SchedRW = [WriteDoubleAdd]
+
+let SchedRW = [WriteQuarterRate32] in {
defm V_LOG_LEGACY_F32 : VOP1Inst <vop1<0x45, 0x4c>, "v_log_legacy_f32",
VOP_F32_F32
>;
defm V_EXP_LEGACY_F32 : VOP1Inst <vop1<0x46, 0x4b>, "v_exp_legacy_f32",
VOP_F32_F32
>;
+} // End SchedRW = [WriteQuarterRate32]
+
+//===----------------------------------------------------------------------===//
+// VOP3 Instructions
+//===----------------------------------------------------------------------===//
+
+defm V_QSAD_PK_U16_U8 : VOP3Inst <vop3<0x173>, "v_qsad_pk_u16_u8",
+ VOP_I32_I32_I32
+>;
+defm V_MQSAD_U16_U8 : VOP3Inst <vop3<0x172>, "v_mqsad_u16_u8",
+ VOP_I32_I32_I32
+>;
+defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8",
+ VOP_I32_I32_I32
+>;
+
+let isCommutable = 1 in {
+defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32",
+ VOP_I64_I32_I32_I64
+>;
+
+// XXX - Does this set VCC?
+defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32",
+ VOP_I64_I32_I32_I64
+>;
+} // End isCommutable = 1
+
+
+//===----------------------------------------------------------------------===//
+// DS Instructions
+//===----------------------------------------------------------------------===//
+defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">;
+
+// DS_CONDXCHG32_RTN_B64
+// DS_CONDXCHG32_RTN_B128
+
+//===----------------------------------------------------------------------===//
+// SMRD Instructions
+//===----------------------------------------------------------------------===//
+
+defm S_DCACHE_INV_VOL : SMRD_Inval <smrd<0x1d, 0x22>,
+ "s_dcache_inv_vol", int_amdgcn_s_dcache_inv_vol>;
+
+//===----------------------------------------------------------------------===//
+// MUBUF Instructions
+//===----------------------------------------------------------------------===//
+
+defm BUFFER_WBINVL1_VOL : MUBUF_Invalidate <mubuf<0x70, 0x3f>,
+ "buffer_wbinvl1_vol", int_amdgcn_buffer_wbinvl1_vol
+>;
//===----------------------------------------------------------------------===//
// Flat Instructions
//===----------------------------------------------------------------------===//
-def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x8, "flat_load_ubyte", VGPR_32>;
-def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x9, "flat_load_sbyte", VGPR_32>;
-def FLAT_LOAD_USHORT : FLAT_Load_Helper <0xa, "flat_load_ushort", VGPR_32>;
-def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0xb, "flat_load_sshort", VGPR_32>;
-def FLAT_LOAD_DWORD : FLAT_Load_Helper <0xc, "flat_load_dword", VGPR_32>;
-def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0xd, "flat_load_dwordx2", VReg_64>;
-def FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0xe, "flat_load_dwordx4", VReg_128>;
-def FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0xf, "flat_load_dwordx3", VReg_96>;
-def FLAT_STORE_BYTE : FLAT_Store_Helper <0x18, "flat_store_byte", VGPR_32>;
-def FLAT_STORE_SHORT : FLAT_Store_Helper <0x1a, "flat_store_short", VGPR_32>;
-def FLAT_STORE_DWORD : FLAT_Store_Helper <0x1c, "flat_store_dword", VGPR_32>;
-def FLAT_STORE_DWORDX2 : FLAT_Store_Helper <
- 0x1d, "flat_store_dwordx2", VReg_64
+defm FLAT_LOAD_UBYTE : FLAT_Load_Helper <
+ flat<0x8, 0x10>, "flat_load_ubyte", VGPR_32
+>;
+defm FLAT_LOAD_SBYTE : FLAT_Load_Helper <
+ flat<0x9, 0x11>, "flat_load_sbyte", VGPR_32
+>;
+defm FLAT_LOAD_USHORT : FLAT_Load_Helper <
+ flat<0xa, 0x12>, "flat_load_ushort", VGPR_32
+>;
+defm FLAT_LOAD_SSHORT : FLAT_Load_Helper <
+ flat<0xb, 0x13>, "flat_load_sshort", VGPR_32>
+;
+defm FLAT_LOAD_DWORD : FLAT_Load_Helper <
+ flat<0xc, 0x14>, "flat_load_dword", VGPR_32
+>;
+defm FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <
+ flat<0xd, 0x15>, "flat_load_dwordx2", VReg_64
+>;
+defm FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <
+ flat<0xe, 0x17>, "flat_load_dwordx4", VReg_128
+>;
+defm FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <
+ flat<0xf, 0x16>, "flat_load_dwordx3", VReg_96
+>;
+defm FLAT_STORE_BYTE : FLAT_Store_Helper <
+ flat<0x18>, "flat_store_byte", VGPR_32
+>;
+defm FLAT_STORE_SHORT : FLAT_Store_Helper <
+ flat <0x1a>, "flat_store_short", VGPR_32
+>;
+defm FLAT_STORE_DWORD : FLAT_Store_Helper <
+ flat<0x1c>, "flat_store_dword", VGPR_32
+>;
+defm FLAT_STORE_DWORDX2 : FLAT_Store_Helper <
+ flat<0x1d>, "flat_store_dwordx2", VReg_64
+>;
+defm FLAT_STORE_DWORDX4 : FLAT_Store_Helper <
+ flat<0x1e, 0x1f>, "flat_store_dwordx4", VReg_128
>;
-def FLAT_STORE_DWORDX4 : FLAT_Store_Helper <
- 0x1e, "flat_store_dwordx4", VReg_128
+defm FLAT_STORE_DWORDX3 : FLAT_Store_Helper <
+ flat<0x1f, 0x1e>, "flat_store_dwordx3", VReg_96
>;
-def FLAT_STORE_DWORDX3 : FLAT_Store_Helper <
- 0x1f, "flat_store_dwordx3", VReg_96
+defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC <
+ flat<0x30, 0x40>, "flat_atomic_swap", VGPR_32
>;
-defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC <0x30, "flat_atomic_swap", VGPR_32>;
defm FLAT_ATOMIC_CMPSWAP : FLAT_ATOMIC <
- 0x31, "flat_atomic_cmpswap", VGPR_32, VReg_64
->;
-defm FLAT_ATOMIC_ADD : FLAT_ATOMIC <0x32, "flat_atomic_add", VGPR_32>;
-defm FLAT_ATOMIC_SUB : FLAT_ATOMIC <0x33, "flat_atomic_sub", VGPR_32>;
-defm FLAT_ATOMIC_RSUB : FLAT_ATOMIC <0x34, "flat_atomic_rsub", VGPR_32>;
-defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC <0x35, "flat_atomic_smin", VGPR_32>;
-defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC <0x36, "flat_atomic_umin", VGPR_32>;
-defm FLAT_ATOMIC_SMAX : FLAT_ATOMIC <0x37, "flat_atomic_smax", VGPR_32>;
-defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC <0x38, "flat_atomic_umax", VGPR_32>;
-defm FLAT_ATOMIC_AND : FLAT_ATOMIC <0x39, "flat_atomic_and", VGPR_32>;
-defm FLAT_ATOMIC_OR : FLAT_ATOMIC <0x3a, "flat_atomic_or", VGPR_32>;
-defm FLAT_ATOMIC_XOR : FLAT_ATOMIC <0x3b, "flat_atomic_xor", VGPR_32>;
-defm FLAT_ATOMIC_INC : FLAT_ATOMIC <0x3c, "flat_atomic_inc", VGPR_32>;
-defm FLAT_ATOMIC_DEC : FLAT_ATOMIC <0x3d, "flat_atomic_dec", VGPR_32>;
-defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC <
- 0x3e, "flat_atomic_fcmpswap", VGPR_32, VReg_64
+ flat<0x31, 0x41>, "flat_atomic_cmpswap", VGPR_32, VReg_64
+>;
+defm FLAT_ATOMIC_ADD : FLAT_ATOMIC <
+ flat<0x32, 0x42>, "flat_atomic_add", VGPR_32
+>;
+defm FLAT_ATOMIC_SUB : FLAT_ATOMIC <
+ flat<0x33, 0x43>, "flat_atomic_sub", VGPR_32
+>;
+defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC <
+ flat<0x35, 0x44>, "flat_atomic_smin", VGPR_32
+>;
+defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC <
+ flat<0x36, 0x45>, "flat_atomic_umin", VGPR_32
+>;
+defm FLAT_ATOMIC_SMAX : FLAT_ATOMIC <
+ flat<0x37, 0x46>, "flat_atomic_smax", VGPR_32
+>;
+defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC <
+ flat<0x38, 0x47>, "flat_atomic_umax", VGPR_32
+>;
+defm FLAT_ATOMIC_AND : FLAT_ATOMIC <
+ flat<0x39, 0x48>, "flat_atomic_and", VGPR_32
+>;
+defm FLAT_ATOMIC_OR : FLAT_ATOMIC <
+ flat<0x3a, 0x49>, "flat_atomic_or", VGPR_32
+>;
+defm FLAT_ATOMIC_XOR : FLAT_ATOMIC <
+ flat<0x3b, 0x4a>, "flat_atomic_xor", VGPR_32
+>;
+defm FLAT_ATOMIC_INC : FLAT_ATOMIC <
+ flat<0x3c, 0x4b>, "flat_atomic_inc", VGPR_32
+>;
+defm FLAT_ATOMIC_DEC : FLAT_ATOMIC <
+ flat<0x3d, 0x4c>, "flat_atomic_dec", VGPR_32
+>;
+defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC <
+ flat<0x50, 0x60>, "flat_atomic_swap_x2", VReg_64
>;
-defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC <0x3f, "flat_atomic_fmin", VGPR_32>;
-defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC <0x40, "flat_atomic_fmax", VGPR_32>;
-defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC <0x50, "flat_atomic_swap_x2", VReg_64>;
defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_ATOMIC <
- 0x51, "flat_atomic_cmpswap_x2", VReg_64, VReg_128
->;
-defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC <0x52, "flat_atomic_add_x2", VReg_64>;
-defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC <0x53, "flat_atomic_sub_x2", VReg_64>;
-defm FLAT_ATOMIC_RSUB_X2 : FLAT_ATOMIC <0x54, "flat_atomic_rsub_x2", VReg_64>;
-defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC <0x55, "flat_atomic_smin_x2", VReg_64>;
-defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC <0x56, "flat_atomic_umin_x2", VReg_64>;
-defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC <0x57, "flat_atomic_smax_x2", VReg_64>;
-defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC <0x58, "flat_atomic_umax_x2", VReg_64>;
-defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC <0x59, "flat_atomic_and_x2", VReg_64>;
-defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC <0x5a, "flat_atomic_or_x2", VReg_64>;
-defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC <0x5b, "flat_atomic_xor_x2", VReg_64>;
-defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC <0x5c, "flat_atomic_inc_x2", VReg_64>;
-defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC <0x5d, "flat_atomic_dec_x2", VReg_64>;
-defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC <
- 0x5e, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128
+ flat<0x51, 0x61>, "flat_atomic_cmpswap_x2", VReg_64, VReg_128
+>;
+defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC <
+ flat<0x52, 0x62>, "flat_atomic_add_x2", VReg_64
+>;
+defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC <
+ flat<0x53, 0x63>, "flat_atomic_sub_x2", VReg_64
+>;
+defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC <
+ flat<0x55, 0x64>, "flat_atomic_smin_x2", VReg_64
+>;
+defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC <
+ flat<0x56, 0x65>, "flat_atomic_umin_x2", VReg_64
+>;
+defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC <
+ flat<0x57, 0x66>, "flat_atomic_smax_x2", VReg_64
+>;
+defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC <
+ flat<0x58, 0x67>, "flat_atomic_umax_x2", VReg_64
+>;
+defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC <
+ flat<0x59, 0x68>, "flat_atomic_and_x2", VReg_64
+>;
+defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC <
+ flat<0x5a, 0x69>, "flat_atomic_or_x2", VReg_64
+>;
+defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC <
+ flat<0x5b, 0x6a>, "flat_atomic_xor_x2", VReg_64
+>;
+defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC <
+ flat<0x5c, 0x6b>, "flat_atomic_inc_x2", VReg_64
+>;
+defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC <
+ flat<0x5d, 0x6c>, "flat_atomic_dec_x2", VReg_64
>;
-defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC <0x5f, "flat_atomic_fmin_x2", VReg_64>;
-defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC <0x60, "flat_atomic_fmax_x2", VReg_64>;
} // End SubtargetPredicate = isCIVI
+// CI Only flat instructions
+
+let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst in {
+
+defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC <
+ flat<0x3e>, "flat_atomic_fcmpswap", VGPR_32, VReg_64
+>;
+defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC <
+ flat<0x3f>, "flat_atomic_fmin", VGPR_32
+>;
+defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC <
+ flat<0x40>, "flat_atomic_fmax", VGPR_32
+>;
+defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC <
+ flat<0x5e>, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128
+>;
+defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC <
+ flat<0x5f>, "flat_atomic_fmin_x2", VReg_64
+>;
+defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC <
+ flat<0x60>, "flat_atomic_fmax_x2", VReg_64
+>;
+
+} // End let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst
+
//===----------------------------------------------------------------------===//
// Flat Patterns
//===----------------------------------------------------------------------===//
@@ -147,3 +300,80 @@ def : FLATStore_Pattern <FLAT_STORE_DWORDX4, v4i32, flat_store>;
} // End HasFlatAddressSpace predicate
+let Predicates = [isCI] in {
+
+// Convert (x - floor(x)) to fract(x)
+def : Pat <
+ (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
+ (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
+ (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+// Convert (x + (-floor(x))) to fract(x)
+def : Pat <
+ (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
+ (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
+ (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+} // End Predicates = [isCI]
+
+
+//===----------------------------------------------------------------------===//
+// Patterns to generate flat for global
+//===----------------------------------------------------------------------===//
+
+def useFlatForGlobal : Predicate <
+ "Subtarget->useFlatForGlobal() || "
+ "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">;
+
+let Predicates = [useFlatForGlobal] in {
+
+// 1. Offset as 20bit DWORD immediate
+def : Pat <
+ (SIload_constant v4i32:$sbase, IMM20bit:$offset),
+ (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
+>;
+
+// Patterns for global loads with no offset
+class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
+ (vt (node i64:$addr)),
+ (inst $addr, 0, 0, 0)
+>;
+
+def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_global, i32>;
+def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_global, i32>;
+def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_global, i32>;
+def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_global, i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORD, global_load, i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX2, global_load, v2i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX4, global_load, v4i32>;
+
+class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
+ (node vt:$data, i64:$addr),
+ (inst $data, $addr, 0, 0, 0)
+>;
+
+def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_global, i32>;
+def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_global, i32>;
+def : FlatStorePat <FLAT_STORE_DWORD, global_store, i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX2, global_store, v2i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX4, global_store, v4i32>;
+
+class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
+ (vt (node i64:$addr, vt:$data)),
+ (inst $addr, $data, 0, 0)
+>;
+
+def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
+
+} // End Predicates = [useFlatForGlobal]
diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt
index 9460bf6b9338..30bb0e0adde8 100644
--- a/lib/Target/AMDGPU/CMakeLists.txt
+++ b/lib/Target/AMDGPU/CMakeLists.txt
@@ -15,12 +15,17 @@ add_public_tablegen_target(AMDGPUCommonTableGen)
add_llvm_target(AMDGPUCodeGen
AMDILCFGStructurizer.cpp
AMDGPUAlwaysInlinePass.cpp
+ AMDGPUAnnotateKernelFeatures.cpp
+ AMDGPUAnnotateUniformValues.cpp
AMDGPUAsmPrinter.cpp
+ AMDGPUDiagnosticInfoUnsupported.cpp
AMDGPUFrameLowering.cpp
+ AMDGPUTargetObjectFile.cpp
AMDGPUIntrinsicInfo.cpp
AMDGPUISelDAGToDAG.cpp
AMDGPUMCInstLower.cpp
AMDGPUMachineFunction.cpp
+ AMDGPUOpenCLImageTypeLoweringPass.cpp
AMDGPUSubtarget.cpp
AMDGPUTargetMachine.cpp
AMDGPUTargetTransformInfo.cpp
@@ -45,6 +50,7 @@ add_llvm_target(AMDGPUCodeGen
SIFixSGPRCopies.cpp
SIFixSGPRLiveRanges.cpp
SIFoldOperands.cpp
+ SIFrameLowering.cpp
SIInsertWaits.cpp
SIInstrInfo.cpp
SIISelLowering.cpp
@@ -52,7 +58,6 @@ add_llvm_target(AMDGPUCodeGen
SILowerControlFlow.cpp
SILowerI1Copies.cpp
SIMachineFunctionInfo.cpp
- SIPrepareScratchRegs.cpp
SIRegisterInfo.cpp
SIShrinkInstructions.cpp
SITypeRewriter.cpp
diff --git a/lib/Target/AMDGPU/CaymanInstructions.td b/lib/Target/AMDGPU/CaymanInstructions.td
index ba4df82a6d37..a6c3785c815b 100644
--- a/lib/Target/AMDGPU/CaymanInstructions.td
+++ b/lib/Target/AMDGPU/CaymanInstructions.td
@@ -82,6 +82,10 @@ def RAT_STORE_DWORD32 : RAT_STORE_DWORD <R600_TReg32_X, i32, 0x1>;
def RAT_STORE_DWORD64 : RAT_STORE_DWORD <R600_Reg64, v2i32, 0x3>;
def RAT_STORE_DWORD128 : RAT_STORE_DWORD <R600_Reg128, v4i32, 0xf>;
+def RAT_STORE_TYPED_cm: CF_MEM_RAT_STORE_TYPED<0> {
+ let eop = 0; // This bit is not used on Cayman.
+}
+
class VTX_READ_cm <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
: VTX_WORD0_cm, VTX_READ<name, buffer_id, outs, pattern> {
diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td
index 7adcd46fe196..779a14e95d22 100644
--- a/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -40,6 +40,15 @@ class CF_MEM_RAT <bits<6> rat_inst, bits<4> rat_id, dag ins, string name,
: EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins,
"MEM_RAT "#name, pattern>;
+class CF_MEM_RAT_STORE_TYPED<bits<1> has_eop>
+ : CF_MEM_RAT <0x1, ?, (ins R600_Reg128:$rw_gpr, R600_Reg128:$index_gpr,
+ i32imm:$rat_id, InstFlag:$eop),
+ "STORE_TYPED RAT($rat_id) $rw_gpr, $index_gpr"
+ #!if(has_eop, ", $eop", ""),
+ [(int_r600_rat_store_typed R600_Reg128:$rw_gpr,
+ R600_Reg128:$index_gpr,
+ (i32 imm:$rat_id))]>;
+
def RAT_MSKOR : CF_MEM_RAT <0x11, 0,
(ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr),
"MSKOR $rw_gpr.XW, $index_gpr",
@@ -105,6 +114,8 @@ def RAT_WRITE_CACHELESS_128_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0xf,
[(global_store v4i32:$rw_gpr, i32:$index_gpr)]
>;
+def RAT_STORE_TYPED_eg: CF_MEM_RAT_STORE_TYPED<1>;
+
} // End usesCustomInserter = 1
class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
index e811d5cff221..a187de88f639 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -16,6 +16,7 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -283,8 +284,13 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, raw_ostream &O) {
O << "4.0";
else if (Imm == DoubleToBits(-4.0))
O << "-4.0";
- else
- llvm_unreachable("64-bit literal constants not supported");
+ else {
+ assert(isUInt<32>(Imm));
+
+ // In rare situations, we will have a 32-bit literal in a 64-bit
+ // operand. This is technically allowed for the encoding of s_mov_b64.
+ O << formatHex(static_cast<uint64_t>(Imm));
+ }
}
void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
@@ -592,11 +598,11 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
} else {
unsigned Stream = (SImm16 >> 8) & 0x3;
if (Op == 1)
- O << "cut";
+ O << "cut";
else if (Op == 2)
- O << "emit";
+ O << "emit";
else if (Op == 3)
- O << "emit-cut";
+ O << "emit-cut";
O << " stream " << Stream;
}
O << "), [m0] ";
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
index 14fb511e9232..90541d86132d 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
@@ -13,9 +13,7 @@
#ifndef LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H
#define LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H
-#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCInstPrinter.h"
-#include "llvm/Support/raw_ostream.h"
namespace llvm {
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index 4434d9b119c6..60e8c8f3d303 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -99,14 +99,22 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
case AMDGPU::fixup_si_rodata: {
uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset());
- *Dst = Value;
- break;
- }
-
- case AMDGPU::fixup_si_end_of_text: {
- uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset());
- // The value points to the last instruction in the text section, so we
- // need to add 4 bytes to get to the start of the constants.
+ // We emit constant data at the end of the text section and generate its
+ // address using the following code sequence:
+ // s_getpc_b64 s[0:1]
+ // s_add_u32 s0, s0, $symbol
+ // s_addc_u32 s1, s1, 0
+ //
+ // s_getpc_b64 returns the address of the s_add_u32 instruction and then
+ // the fixup replaces $symbol with a literal constant, which is a
+ // pc-relative offset from the encoding of the $symbol operand to the
+ // constant data.
+ //
+ // What we want here is an offset from the start of the s_add_u32
+ // instruction to the constant data, but since the encoding of $symbol
+ // starts 4 bytes after the start of the add instruction, we end up
+ // with an offset that is 4 bytes too small. This requires us to
+ // add 4 to the fixup value before applying it.
*Dst = Value + 4;
break;
}
@@ -136,8 +144,7 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo(
const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = {
// name offset bits flags
{ "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_si_rodata", 0, 32, 0 },
- { "fixup_si_end_of_text", 0, 32, MCFixupKindInfo::FKF_IsPCRel }
+ { "fixup_si_rodata", 0, 32, MCFixupKindInfo::FKF_IsPCRel }
};
if (Kind < FirstTargetFixupKind)
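Aside on the hunk above: the "+ 4" applied to the rodata fixup exists only because the pc-relative value is measured from the $symbol operand, which is encoded 4 bytes after the start of the s_add_u32 instruction, while the address computation needs the distance from the start of s_add_u32 itself. A minimal C++ sketch of that arithmetic, under those assumptions and with a hypothetical helper name (not part of the patch):

  #include <cstdint>

  // Illustrative only: the resolved fixup value is the distance from the
  // $symbol literal to the constant data; the code sequence needs the
  // distance from the start of s_add_u32, which begins 4 bytes earlier,
  // so the applied value must be 4 bytes larger.
  static uint32_t adjustRodataFixup(uint32_t PCRelToSymbolOperand) {
    return PCRelToSymbolOperand + 4;
  }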
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
new file mode 100644
index 000000000000..9ff9fe794d2b
--- /dev/null
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
@@ -0,0 +1,26 @@
+//===-------- AMDGPUELFStreamer.cpp - ELF Object Output -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUELFStreamer.h"
+#include "Utils/AMDGPUBaseInfo.h"
+
+using namespace llvm;
+
+void AMDGPUELFStreamer::InitSections(bool NoExecStack) {
+ // Start with the .hsatext section by default.
+ SwitchSection(AMDGPU::getHSATextSection(getContext()));
+}
+
+MCELFStreamer *llvm::createAMDGPUELFStreamer(MCContext &Context,
+ MCAsmBackend &MAB,
+ raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter,
+ bool RelaxAll) {
+ return new AMDGPUELFStreamer(Context, MAB, OS, Emitter);
+}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
new file mode 100644
index 000000000000..488d7e74d741
--- /dev/null
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
@@ -0,0 +1,40 @@
+//===-------- AMDGPUELFStreamer.h - ELF Object Output ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a custom MCELFStreamer which allows us to insert some hooks before
+// emitting data into an actual object file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUELFSTREAMER_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUELFSTREAMER_H
+
+#include "llvm/MC/MCELFStreamer.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCSubtargetInfo;
+
+class AMDGPUELFStreamer : public MCELFStreamer {
+public:
+ AMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter)
+ : MCELFStreamer(Context, MAB, OS, Emitter) { }
+
+ virtual void InitSections(bool NoExecStack) override;
+};
+
+MCELFStreamer *createAMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB,
+ raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter, bool RelaxAll);
+} // namespace llvm.
+
+#endif
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h
index 01021d67ffd9..59a9178082f6 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h
@@ -21,9 +21,6 @@ enum Fixups {
/// fixup for global addresses with constant initializers
fixup_si_rodata,
- /// fixup for offset from instruction to end of text section
- fixup_si_end_of_text,
-
// Marker
LastTargetFixupKind,
NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 028a86dfc7ad..68b1d1ae83cc 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -22,13 +22,6 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() {
InlineAsmEnd = ";#ASMEND";
//===--- Data Emission Directives -------------------------------------===//
- ZeroDirective = ".zero";
- AsciiDirective = ".ascii\t";
- AscizDirective = ".asciz\t";
- Data8bitsDirective = ".byte\t";
- Data16bitsDirective = ".short\t";
- Data32bitsDirective = ".long\t";
- Data64bitsDirective = ".quad\t";
SunStyleELFSectionSwitchSyntax = true;
UsesELFSectionDirectiveForBSS = true;
@@ -41,3 +34,10 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() {
//===--- Dwarf Emission Directives -----------------------------------===//
SupportsDebugInformation = true;
}
+
+bool AMDGPUMCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const {
+ return SectionName == ".hsatext" || SectionName == ".hsadata_global_agent" ||
+ SectionName == ".hsadata_global_program" ||
+ SectionName == ".hsarodata_readonly_agent" ||
+ MCAsmInfo::shouldOmitSectionDirective(SectionName);
+}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h
index a5bac51e356f..a546961705d7 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h
@@ -21,12 +21,13 @@ class Triple;
// If you need to create another MCAsmInfo class, which inherits from MCAsmInfo,
// you will need to make sure your new class sets PrivateGlobalPrefix to
-// a prefix that won't appeary in a fuction name. The default value
+// a prefix that won't appear in a function name. The default value
// for PrivateGlobalPrefix is 'L', so it will consider any function starting
// with 'L' as a local symbol.
class AMDGPUMCAsmInfo : public MCAsmInfoELF {
public:
explicit AMDGPUMCAsmInfo(const Triple &TT);
+ bool shouldOmitSectionDirective(StringRef SectionName) const override;
};
} // namespace llvm
#endif
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index c709741f3777..f70409470276 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUMCTargetDesc.h"
+#include "AMDGPUELFStreamer.h"
#include "AMDGPUMCAsmInfo.h"
#include "AMDGPUTargetStreamer.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
@@ -85,6 +86,15 @@ static MCTargetStreamer * createAMDGPUObjectTargetStreamer(
return new AMDGPUTargetELFStreamer(S);
}
+static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
+ MCAsmBackend &MAB, raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter, bool RelaxAll) {
+ if (T.getOS() == Triple::AMDHSA)
+ return createAMDGPUELFStreamer(Context, MAB, OS, Emitter, RelaxAll);
+
+ return createELFStreamer(Context, MAB, OS, Emitter, RelaxAll);
+}
+
extern "C" void LLVMInitializeAMDGPUTargetMC() {
for (Target *T : {&TheAMDGPUTarget, &TheGCNTarget}) {
RegisterMCAsmInfo<AMDGPUMCAsmInfo> X(*T);
@@ -95,6 +105,7 @@ extern "C" void LLVMInitializeAMDGPUTargetMC() {
TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo);
TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter);
TargetRegistry::RegisterMCAsmBackend(*T, createAMDGPUAsmBackend);
+ TargetRegistry::RegisterELFStreamer(*T, createMCStreamer);
}
// R600 specific registration
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 09e6cb1f1ffc..b91134d2ee9b 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -13,6 +13,7 @@
#include "AMDGPUTargetStreamer.h"
#include "SIDefines.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFStreamer.h"
@@ -220,6 +221,26 @@ AMDGPUTargetAsmStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) {
}
+void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
+ unsigned Type) {
+ switch (Type) {
+ default: llvm_unreachable("Invalid AMDGPU symbol type");
+ case ELF::STT_AMDGPU_HSA_KERNEL:
+ OS << "\t.amdgpu_hsa_kernel " << SymbolName << '\n' ;
+ break;
+ }
+}
+
+void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaModuleScopeGlobal(
+ StringRef GlobalName) {
+ OS << "\t.amdgpu_hsa_module_global " << GlobalName << '\n';
+}
+
+void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaProgramScopeGlobal(
+ StringRef GlobalName) {
+ OS << "\t.amdgpu_hsa_program_global " << GlobalName << '\n';
+}
+
//===----------------------------------------------------------------------===//
// AMDGPUTargetELFStreamer
//===----------------------------------------------------------------------===//
@@ -291,7 +312,35 @@ AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) {
MCStreamer &OS = getStreamer();
OS.PushSection();
- OS.SwitchSection(OS.getContext().getObjectFileInfo()->getTextSection());
+ // The MCObjectFileInfo that is available to the assembler is a generic
+ // implementation and not AMDGPUHSATargetObjectFile, so we can't use
+ // MCObjectFileInfo::getTextSection() here for fetching the HSATextSection.
+ OS.SwitchSection(AMDGPU::getHSATextSection(OS.getContext()));
OS.EmitBytes(StringRef((const char*)&Header, sizeof(Header)));
OS.PopSection();
}
+
+void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
+ unsigned Type) {
+ MCSymbolELF *Symbol = cast<MCSymbolELF>(
+ getStreamer().getContext().getOrCreateSymbol(SymbolName));
+ Symbol->setType(ELF::STT_AMDGPU_HSA_KERNEL);
+}
+
+void AMDGPUTargetELFStreamer::EmitAMDGPUHsaModuleScopeGlobal(
+ StringRef GlobalName) {
+
+ MCSymbolELF *Symbol = cast<MCSymbolELF>(
+ getStreamer().getContext().getOrCreateSymbol(GlobalName));
+ Symbol->setType(ELF::STT_OBJECT);
+ Symbol->setBinding(ELF::STB_LOCAL);
+}
+
+void AMDGPUTargetELFStreamer::EmitAMDGPUHsaProgramScopeGlobal(
+ StringRef GlobalName) {
+
+ MCSymbolELF *Symbol = cast<MCSymbolELF>(
+ getStreamer().getContext().getOrCreateSymbol(GlobalName));
+ Symbol->setType(ELF::STT_OBJECT);
+ Symbol->setBinding(ELF::STB_GLOBAL);
+}
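The pair of hunks above adds matching hooks to the assembly and ELF target streamers: module-scope HSA globals become STB_LOCAL symbols and program-scope HSA globals become STB_GLOBAL symbols. A rough sketch of how a caller might dispatch on linkage, assuming the EmitAMDGPUHsa* entry points introduced by this patch (the wrapper function and include paths below are hypothetical, not taken from the commit):

  #include "MCTargetDesc/AMDGPUTargetStreamer.h"  // assumes in-tree layout
  #include "llvm/IR/GlobalValue.h"

  // Hypothetical caller: choose the HSA scope directive from the value's
  // linkage. Only the EmitAMDGPUHsa* hooks come from the patch above.
  static void emitHSAGlobalScope(llvm::AMDGPUTargetStreamer &TS,
                                 const llvm::GlobalValue &GV) {
    if (GV.hasLocalLinkage())
      TS.EmitAMDGPUHsaModuleScopeGlobal(GV.getName());   // -> STB_LOCAL
    else
      TS.EmitAMDGPUHsaProgramScopeGlobal(GV.getName());  // -> STB_GLOBAL
  }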
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index d37677c6b863..83bb728f541c 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -7,6 +7,9 @@
//
//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
+#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
+
#include "AMDKernelCodeT.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
@@ -27,6 +30,12 @@ public:
StringRef ArchName) = 0;
virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) = 0;
+
+ virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) = 0;
+
+ virtual void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) = 0;
+
+ virtual void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) = 0;
};
class AMDGPUTargetAsmStreamer : public AMDGPUTargetStreamer {
@@ -41,6 +50,12 @@ public:
StringRef ArchName) override;
void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override;
+
+ void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
+
+ void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override;
+
+ void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override;
};
class AMDGPUTargetELFStreamer : public AMDGPUTargetStreamer {
@@ -72,6 +87,12 @@ public:
void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override;
+ void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
+
+ void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override;
+
+ void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override;
};
}
+#endif
diff --git a/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt b/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
index 8306a051ff98..c823ee7e0080 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
@@ -2,6 +2,7 @@
add_llvm_library(LLVMAMDGPUDesc
AMDGPUAsmBackend.cpp
AMDGPUELFObjectWriter.cpp
+ AMDGPUELFStreamer.cpp
AMDGPUMCCodeEmitter.cpp
AMDGPUMCTargetDesc.cpp
AMDGPUMCAsmInfo.cpp
diff --git a/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt b/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt
index 4217bb362975..aa9a02198d04 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt
+++ b/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = AMDGPUDesc
parent = AMDGPU
-required_libraries = MC AMDGPUAsmPrinter AMDGPUInfo Support
+required_libraries = MC AMDGPUAsmPrinter AMDGPUInfo AMDGPUUtils Support
add_to_library_groups = AMDGPU
diff --git a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
index e683498d52a5..3c1142dd664b 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -37,7 +37,6 @@ class R600MCCodeEmitter : public AMDGPUMCCodeEmitter {
const MCRegisterInfo &MRI;
public:
-
R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri)
: MCII(mcii), MRI(mri) { }
@@ -50,8 +49,8 @@ public:
uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
-private:
+private:
void EmitByte(unsigned int byte, raw_ostream &OS) const;
void Emit(uint32_t value, raw_ostream &OS) const;
@@ -59,7 +58,6 @@ private:
unsigned getHWRegChan(unsigned reg) const;
unsigned getHWReg(unsigned regNo) const;
-
};
} // End anonymous namespace
@@ -83,7 +81,7 @@ enum FCInstr {
MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
- MCContext &Ctx) {
+ MCContext &Ctx) {
return new R600MCCodeEmitter(MCII, MRI);
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 65a0eeba2b16..9eb3dadbc5e2 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -36,7 +36,6 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter {
void operator=(const SIMCCodeEmitter &) = delete;
const MCInstrInfo &MCII;
const MCRegisterInfo &MRI;
- MCContext &Ctx;
/// \brief Can this operand also contain immediate values?
bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const;
@@ -47,7 +46,7 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter {
public:
SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
MCContext &ctx)
- : MCII(mcii), MRI(mri), Ctx(ctx) { }
+ : MCII(mcii), MRI(mri) { }
~SIMCCodeEmitter() override {}
@@ -250,17 +249,7 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
if (MO.isExpr()) {
const MCSymbolRefExpr *Expr = cast<MCSymbolRefExpr>(MO.getExpr());
- MCFixupKind Kind;
- const MCSymbol *Sym =
- Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
-
- if (&Expr->getSymbol() == Sym) {
- // Add the offset to the beginning of the constant values.
- Kind = (MCFixupKind)AMDGPU::fixup_si_end_of_text;
- } else {
- // This is used for constant data stored in .rodata.
- Kind = (MCFixupKind)AMDGPU::fixup_si_rodata;
- }
+ MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_rodata;
Fixups.push_back(MCFixup::create(4, Expr, Kind, MI.getLoc()));
}
diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td
index d9a0723bedc9..a1584a224cbd 100644
--- a/lib/Target/AMDGPU/Processors.td
+++ b/lib/Target/AMDGPU/Processors.td
@@ -142,3 +142,7 @@ def : ProcessorModel<"carrizo", SIQuarterSpeedModel,
def : ProcessorModel<"fiji", SIQuarterSpeedModel,
[FeatureVolcanicIslands, FeatureISAVersion8_0_1]
>;
+
+def : ProcessorModel<"stoney", SIQuarterSpeedModel,
+ [FeatureVolcanicIslands, FeatureISAVersion8_0_1]
+>;
diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index c8f37f61fc16..bd80bb211b4f 100644
--- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -405,8 +405,8 @@ private:
if (MO.isReg() && MO.isInternalRead())
MO.setIsInternalRead(false);
}
- getLiteral(BI, Literals);
- ClauseContent.push_back(BI);
+ getLiteral(&*BI, Literals);
+ ClauseContent.push_back(&*BI);
}
I = BI;
DeleteMI->eraseFromParent();
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 4e4d554f0ee7..124a9c6e0f56 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -190,6 +190,10 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM,
setSchedulingPreference(Sched::Source);
}
+static inline bool isEOP(MachineBasicBlock::iterator I) {
+ return std::next(I)->getOpcode() == AMDGPU::RETURN;
+}
+
MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
MachineInstr * MI, MachineBasicBlock * BB) const {
MachineFunction * MF = BB->getParent();
@@ -276,12 +280,18 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
- unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
-
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
.addOperand(MI->getOperand(0))
.addOperand(MI->getOperand(1))
- .addImm(EOP); // Set End of program bit
+ .addImm(isEOP(I)); // Set End of program bit
+ break;
+ }
+ case AMDGPU::RAT_STORE_TYPED_eg: {
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
+ .addOperand(MI->getOperand(0))
+ .addOperand(MI->getOperand(1))
+ .addOperand(MI->getOperand(2))
+ .addImm(isEOP(I)); // Set End of program bit
break;
}
@@ -539,7 +549,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
}
}
}
- bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
+ bool EOP = isEOP(I);
if (!EOP && !isLastInstructionOfItsType)
return BB;
unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
@@ -946,6 +956,8 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDValue Arg = Op.getOperand(0);
SDLoc DL(Op);
+
+ // TODO: Should this propagate fast-math-flags?
SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
DAG.getNode(ISD::FADD, DL, VT,
DAG.getNode(ISD::FMUL, DL, VT, Arg,
@@ -1936,6 +1948,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
Arg->getOperand(0).getOperand(Element));
}
}
+ break;
}
case ISD::SELECT_CC: {
diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp
index 855fa9fe45b2..8b6eea17130b 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -922,7 +922,7 @@ bool
R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
unsigned NumCyles,
unsigned ExtraPredCycles,
- const BranchProbability &Probability) const{
+ BranchProbability Probability) const{
return true;
}
@@ -933,14 +933,14 @@ R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
MachineBasicBlock &FMBB,
unsigned NumFCycles,
unsigned ExtraFCycles,
- const BranchProbability &Probability) const {
+ BranchProbability Probability) const {
return true;
}
bool
R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
unsigned NumCyles,
- const BranchProbability &Probability)
+ BranchProbability Probability)
const {
return true;
}
diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h
index dee4c2b9ae31..e7251c31107b 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/lib/Target/AMDGPU/R600InstrInfo.h
@@ -174,18 +174,18 @@ namespace llvm {
bool
isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
- const BranchProbability &Probability) const override;
+ BranchProbability Probability) const override;
bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
unsigned ExtraPredCycles,
- const BranchProbability &Probability) const override ;
+ BranchProbability Probability) const override ;
bool
isProfitableToIfCvt(MachineBasicBlock &TMBB,
unsigned NumTCycles, unsigned ExtraTCycles,
MachineBasicBlock &FMBB,
unsigned NumFCycles, unsigned ExtraFCycles,
- const BranchProbability &Probability) const override;
+ BranchProbability Probability) const override;
bool DefinesPredicate(MachineInstr *MI,
std::vector<MachineOperand> &Pred) const override;
diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td
index 7beed092b3f7..33ef6a4e19ea 100644
--- a/lib/Target/AMDGPU/R600Instructions.td
+++ b/lib/Target/AMDGPU/R600Instructions.td
@@ -1655,7 +1655,7 @@ def : InsertVerticalPat <R600_INSERT_ELT_V4, v4f32, f32>;
// ISel Patterns
//===----------------------------------------------------------------------===//
-// CND*_INT Pattterns for f32 True / False values
+// CND*_INT Patterns for f32 True / False values
class CND_INT_f32 <InstR600 cnd, CondCode cc> : Pat <
(selectcc i32:$src0, 0, f32:$src1, f32:$src2, cc),
diff --git a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index 0c06ccc736d0..5efb3b9fc20e 100644
--- a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -318,7 +318,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
MRI = &(Fn.getRegInfo());
for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
MBB != MBBe; ++MBB) {
- MachineBasicBlock *MB = MBB;
+ MachineBasicBlock *MB = &*MBB;
PreviousRegSeq.clear();
PreviousRegSeqByReg.clear();
PreviousRegSeqByUndefCount.clear();
diff --git a/lib/Target/AMDGPU/R600Packetizer.cpp b/lib/Target/AMDGPU/R600Packetizer.cpp
index deee5bc39974..21269613a305 100644
--- a/lib/Target/AMDGPU/R600Packetizer.cpp
+++ b/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -81,11 +81,11 @@ private:
int LastDstChan = -1;
do {
bool isTrans = false;
- int BISlot = getSlot(BI);
+ int BISlot = getSlot(&*BI);
if (LastDstChan >= BISlot)
isTrans = true;
LastDstChan = BISlot;
- if (TII->isPredicated(BI))
+ if (TII->isPredicated(&*BI))
continue;
int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write);
if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0)
@@ -95,7 +95,7 @@ private:
continue;
}
unsigned Dst = BI->getOperand(DstIdx).getReg();
- if (isTrans || TII->isTransOnly(BI)) {
+ if (isTrans || TII->isTransOnly(&*BI)) {
Result[Dst] = AMDGPU::PS;
continue;
}
@@ -149,7 +149,7 @@ private:
public:
// Ctor.
R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI)
- : VLIWPacketizerList(MF, MLI, true),
+ : VLIWPacketizerList(MF, MLI, nullptr),
TII(static_cast<const R600InstrInfo *>(
MF.getSubtarget().getInstrInfo())),
TRI(TII->getRegisterInfo()) {
@@ -162,14 +162,14 @@ public:
}
// ignorePseudoInstruction - Ignore bundling of pseudo instructions.
- bool ignorePseudoInstruction(MachineInstr *MI,
- MachineBasicBlock *MBB) override {
+ bool ignorePseudoInstruction(const MachineInstr *MI,
+ const MachineBasicBlock *MBB) override {
return false;
}
// isSoloInstruction - return true if instruction MI can not be packetized
// with any other instruction, which means that MI itself is a packet.
- bool isSoloInstruction(MachineInstr *MI) override {
+ bool isSoloInstruction(const MachineInstr *MI) override {
if (TII->isVector(*MI))
return true;
if (!TII->isALUInstr(MI->getOpcode()))
@@ -375,7 +375,7 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
// instruction stream until we find the nearest boundary.
MachineBasicBlock::iterator I = RegionEnd;
for(;I != MBB->begin(); --I, --RemainingCount) {
- if (TII->isSchedulingBoundary(std::prev(I), MBB, Fn))
+ if (TII->isSchedulingBoundary(&*std::prev(I), &*MBB, Fn))
break;
}
I = MBB->begin();
@@ -392,7 +392,7 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
continue;
}
- Packetizer.PacketizeMIs(MBB, I, RegionEnd);
+ Packetizer.PacketizeMIs(&*MBB, &*I, RegionEnd);
RegionEnd = I;
}
}
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.h b/lib/Target/AMDGPU/R600RegisterInfo.h
index 9713e600a721..4f8a129ce4a6 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.h
+++ b/lib/Target/AMDGPU/R600RegisterInfo.h
@@ -35,7 +35,7 @@ struct R600RegisterInfo : public AMDGPURegisterInfo {
/// \brief get the register class of the specified type to use in the
/// CFGStructurizer
- const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override;
+ const TargetRegisterClass *getCFGStructurizerRegClass(MVT VT) const;
const RegClassWeight &
getRegClassWeight(const TargetRegisterClass *RC) const override;
diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index ccfbf1bf19ed..fa4d24a2f25a 100644
--- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -312,11 +312,10 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
if (std::find(Latches.begin(), Latches.end(), *PI) == Latches.end())
Preds.push_back(*PI);
}
- BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", nullptr, DT,
- LI, false);
+ BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false);
}
- CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt());
+ CallInst::Create(EndCf, popSaved(), "", &*BB->getFirstInsertionPt());
}
/// \brief Annotate the control flow with intrinsics so the backend can
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index 4c3263911c40..7f79dd34f3ba 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -37,7 +37,8 @@ enum {
MIMG = 1 << 18,
FLAT = 1 << 19,
WQM = 1 << 20,
- VGPRSpill = 1 << 21
+ VGPRSpill = 1 << 21,
+ VOPAsmPrefer32Bit = 1 << 22
};
}
diff --git a/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp b/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp
index 5fe8d19426dd..636750dcfba2 100644
--- a/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp
+++ b/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp
@@ -16,15 +16,9 @@
#include "AMDGPU.h"
#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
using namespace llvm;
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 23502b45905c..96e37c566240 100644
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -82,22 +82,10 @@ using namespace llvm;
namespace {
class SIFixSGPRCopies : public MachineFunctionPass {
-
-private:
+public:
static char ID;
- const TargetRegisterClass *inferRegClassFromUses(const SIRegisterInfo *TRI,
- const MachineRegisterInfo &MRI,
- unsigned Reg,
- unsigned SubReg) const;
- const TargetRegisterClass *inferRegClassFromDef(const SIRegisterInfo *TRI,
- const MachineRegisterInfo &MRI,
- unsigned Reg,
- unsigned SubReg) const;
- bool isVGPRToSGPRCopy(const MachineInstr &Copy, const SIRegisterInfo *TRI,
- const MachineRegisterInfo &MRI) const;
-public:
- SIFixSGPRCopies(TargetMachine &tm) : MachineFunctionPass(ID) { }
+ SIFixSGPRCopies() : MachineFunctionPass(ID) { }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -105,14 +93,23 @@ public:
return "SI Fix SGPR copies";
}
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
};
} // End anonymous namespace
+INITIALIZE_PASS(SIFixSGPRCopies, DEBUG_TYPE,
+ "SI Fix SGPR copies", false, false)
+
char SIFixSGPRCopies::ID = 0;
-FunctionPass *llvm::createSIFixSGPRCopiesPass(TargetMachine &tm) {
- return new SIFixSGPRCopies(tm);
+char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID;
+
+FunctionPass *llvm::createSIFixSGPRCopiesPass() {
+ return new SIFixSGPRCopies();
}
static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) {
@@ -128,77 +125,115 @@ static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) {
return false;
}
-/// This functions walks the use list of Reg until it finds an Instruction
-/// that isn't a COPY returns the register class of that instruction.
-/// \return The register defined by the first non-COPY instruction.
-const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromUses(
- const SIRegisterInfo *TRI,
- const MachineRegisterInfo &MRI,
- unsigned Reg,
- unsigned SubReg) const {
-
- const TargetRegisterClass *RC
- = TargetRegisterInfo::isVirtualRegister(Reg) ?
- MRI.getRegClass(Reg) :
- TRI->getPhysRegClass(Reg);
-
- RC = TRI->getSubRegClass(RC, SubReg);
- for (MachineRegisterInfo::use_instr_iterator
- I = MRI.use_instr_begin(Reg), E = MRI.use_instr_end(); I != E; ++I) {
- switch (I->getOpcode()) {
- case AMDGPU::COPY:
- RC = TRI->getCommonSubClass(RC, inferRegClassFromUses(TRI, MRI,
- I->getOperand(0).getReg(),
- I->getOperand(0).getSubReg()));
- break;
- }
- }
+static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
+getCopyRegClasses(const MachineInstr &Copy,
+ const SIRegisterInfo &TRI,
+ const MachineRegisterInfo &MRI) {
+ unsigned DstReg = Copy.getOperand(0).getReg();
+ unsigned SrcReg = Copy.getOperand(1).getReg();
+
+ const TargetRegisterClass *SrcRC =
+ TargetRegisterInfo::isVirtualRegister(SrcReg) ?
+ MRI.getRegClass(SrcReg) :
+ TRI.getPhysRegClass(SrcReg);
- return RC;
+ // We don't really care about the subregister here.
+ // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg());
+
+ const TargetRegisterClass *DstRC =
+ TargetRegisterInfo::isVirtualRegister(DstReg) ?
+ MRI.getRegClass(DstReg) :
+ TRI.getPhysRegClass(DstReg);
+
+ return std::make_pair(SrcRC, DstRC);
}
-const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromDef(
- const SIRegisterInfo *TRI,
- const MachineRegisterInfo &MRI,
- unsigned Reg,
- unsigned SubReg) const {
- if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
- const TargetRegisterClass *RC = TRI->getPhysRegClass(Reg);
- return TRI->getSubRegClass(RC, SubReg);
- }
- MachineInstr *Def = MRI.getVRegDef(Reg);
- if (Def->getOpcode() != AMDGPU::COPY) {
- return TRI->getSubRegClass(MRI.getRegClass(Reg), SubReg);
- }
+static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
+ const TargetRegisterClass *DstRC,
+ const SIRegisterInfo &TRI) {
+ return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC);
+}
- return inferRegClassFromDef(TRI, MRI, Def->getOperand(1).getReg(),
- Def->getOperand(1).getSubReg());
+static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
+ const TargetRegisterClass *DstRC,
+ const SIRegisterInfo &TRI) {
+ return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC);
}
-bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy,
- const SIRegisterInfo *TRI,
- const MachineRegisterInfo &MRI) const {
+// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
+//
+// SGPRx = ...
+// SGPRy = REG_SEQUENCE SGPRx, sub0 ...
+// VGPRz = COPY SGPRy
+//
+// ==>
+//
+// VGPRx = COPY SGPRx
+// VGPRz = REG_SEQUENCE VGPRx, sub0
+//
+// This exposes immediate folding opportunities when materializing 64-bit
+// immediates.
+static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
+ const SIRegisterInfo *TRI,
+ const SIInstrInfo *TII,
+ MachineRegisterInfo &MRI) {
+ assert(MI.isRegSequence());
+
+ unsigned DstReg = MI.getOperand(0).getReg();
+ if (!TRI->isSGPRClass(MRI.getRegClass(DstReg)))
+ return false;
- unsigned DstReg = Copy.getOperand(0).getReg();
- unsigned SrcReg = Copy.getOperand(1).getReg();
- unsigned SrcSubReg = Copy.getOperand(1).getSubReg();
+ if (!MRI.hasOneUse(DstReg))
+ return false;
- if (!TargetRegisterInfo::isVirtualRegister(DstReg)) {
- // If the destination register is a physical register there isn't really
- // much we can do to fix this.
+ MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg);
+ if (!CopyUse.isCopy())
return false;
- }
- const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
+ const TargetRegisterClass *SrcRC, *DstRC;
+ std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);
- const TargetRegisterClass *SrcRC;
+ if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
+ return false;
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
- MRI.getRegClass(SrcReg) == &AMDGPU::VReg_1RegClass)
+ // TODO: Could have multiple extracts?
+ unsigned SubReg = CopyUse.getOperand(1).getSubReg();
+ if (SubReg != AMDGPU::NoSubRegister)
return false;
- SrcRC = TRI->getSubRegClass(MRI.getRegClass(SrcReg), SrcSubReg);
- return TRI->isSGPRClass(DstRC) && TRI->hasVGPRs(SrcRC);
+ MRI.setRegClass(DstReg, DstRC);
+
+ // SGPRx = ...
+ // SGPRy = REG_SEQUENCE SGPRx, sub0 ...
+ // VGPRz = COPY SGPRy
+
+ // =>
+ // VGPRx = COPY SGPRx
+ // VGPRz = REG_SEQUENCE VGPRx, sub0
+
+ MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
+
+ for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
+ unsigned SrcReg = MI.getOperand(I).getReg();
+ unsigned SrcSubReg = MI.getOperand(I).getSubReg();
+
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+ assert(TRI->isSGPRClass(SrcRC) &&
+ "Expected SGPR REG_SEQUENCE to only have SGPR inputs");
+
+ SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg);
+ const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);
+
+ unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC);
+
+ BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), TmpReg)
+ .addOperand(MI.getOperand(I));
+
+ MI.getOperand(I).setReg(TmpReg);
+ }
+
+ CopyUse.eraseFromParent();
+ return true;
}
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
@@ -207,40 +242,38 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
const SIInstrInfo *TII =
static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+ SmallVector<MachineInstr *, 16> Worklist;
+
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
MachineBasicBlock &MBB = *BI;
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
- I != E; ++I) {
+ I != E; ++I) {
MachineInstr &MI = *I;
- if (MI.getOpcode() == AMDGPU::COPY && isVGPRToSGPRCopy(MI, TRI, MRI)) {
- DEBUG(dbgs() << "Fixing VGPR -> SGPR copy:\n");
- DEBUG(MI.print(dbgs()));
- TII->moveToVALU(MI);
-
- }
switch (MI.getOpcode()) {
- default: continue;
- case AMDGPU::PHI: {
- DEBUG(dbgs() << "Fixing PHI: " << MI);
-
- for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
- const MachineOperand &Op = MI.getOperand(i);
- unsigned Reg = Op.getReg();
- const TargetRegisterClass *RC
- = inferRegClassFromDef(TRI, MRI, Reg, Op.getSubReg());
+ default:
+ continue;
+ case AMDGPU::COPY: {
+ // If the destination register is a physical register there isn't really
+ // much we can do to fix this.
+ if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()))
+ continue;
- MRI.constrainRegClass(Op.getReg(), RC);
- }
- unsigned Reg = MI.getOperand(0).getReg();
- const TargetRegisterClass *RC = inferRegClassFromUses(TRI, MRI, Reg,
- MI.getOperand(0).getSubReg());
- if (TRI->getCommonSubClass(RC, &AMDGPU::VGPR_32RegClass)) {
- MRI.constrainRegClass(Reg, &AMDGPU::VGPR_32RegClass);
+ const TargetRegisterClass *SrcRC, *DstRC;
+ std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI);
+ if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
+ DEBUG(dbgs() << "Fixing VGPR -> SGPR copy: " << MI);
+ TII->moveToVALU(MI);
}
+ break;
+ }
+ case AMDGPU::PHI: {
+ DEBUG(dbgs() << "Fixing PHI: " << MI);
+ unsigned Reg = MI.getOperand(0).getReg();
if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
break;
@@ -310,8 +343,10 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
}
case AMDGPU::REG_SEQUENCE: {
if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) ||
- !hasVGPROperands(MI, TRI))
+ !hasVGPROperands(MI, TRI)) {
+ foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI);
continue;
+ }
DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
diff --git a/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp b/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp
index 0c54446b0fb1..8bda283f0fca 100644
--- a/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp
@@ -7,9 +7,8 @@
//
//===----------------------------------------------------------------------===//
//
-/// \file
-/// SALU instructions ignore control flow, so we need to modify the live ranges
-/// of the registers they define in some cases.
+/// \file SALU instructions ignore the execution mask, so we need to modify the
+/// live ranges of the registers they define in some cases.
///
/// The main case we need to handle is when a def is used in one side of a
/// branch and not another. For example:
@@ -42,13 +41,15 @@
/// ENDIF
/// %use
///
-/// Adding this use will make the def live thoughout the IF branch, which is
+/// Adding this use will make the def live throughout the IF branch, which is
/// what we want.
#include "AMDGPU.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachinePostDominators.h"
@@ -79,9 +80,13 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<LiveIntervals>();
+ AU.addRequired<LiveVariables>();
+ AU.addPreserved<LiveVariables>();
+
AU.addRequired<MachinePostDominatorTree>();
+ AU.addPreserved<MachinePostDominatorTree>();
AU.setPreservesCFG();
+
MachineFunctionPass::getAnalysisUsage(AU);
}
};
@@ -90,7 +95,7 @@ public:
INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE,
"SI Fix SGPR Live Ranges", false, false)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(LiveVariables)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE,
"SI Fix SGPR Live Ranges", false, false)
@@ -108,40 +113,48 @@ bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) {
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
MF.getSubtarget().getRegisterInfo());
- LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
- MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>();
- std::vector<std::pair<unsigned, LiveRange *>> SGPRLiveRanges;
+ bool MadeChange = false;
+
+ MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>();
+ SmallVector<unsigned, 16> SGPRLiveRanges;
+
+ LiveVariables *LV = &getAnalysis<LiveVariables>();
+ MachineBasicBlock *Entry = &MF.front();
- // First pass, collect all live intervals for SGPRs
- for (const MachineBasicBlock &MBB : MF) {
- for (const MachineInstr &MI : MBB) {
+ // Use a depth first order so that in SSA, we encounter all defs before
+ // uses. Once the defs of the block have been found, attempt to insert
+ // SGPR_USE instructions in successor blocks if required.
+ for (MachineBasicBlock *MBB : depth_first(Entry)) {
+ for (const MachineInstr &MI : *MBB) {
for (const MachineOperand &MO : MI.defs()) {
- if (MO.isImplicit())
- continue;
+ // We should never see a live out def of a physical register, so we also
+ // do not need to worry about implicit_defs().
unsigned Def = MO.getReg();
if (TargetRegisterInfo::isVirtualRegister(Def)) {
- if (TRI->isSGPRClass(MRI.getRegClass(Def)))
- SGPRLiveRanges.push_back(
- std::make_pair(Def, &LIS->getInterval(Def)));
- } else if (TRI->isSGPRClass(TRI->getPhysRegClass(Def))) {
- SGPRLiveRanges.push_back(
- std::make_pair(Def, &LIS->getRegUnit(Def)));
+ if (TRI->isSGPRClass(MRI.getRegClass(Def))) {
+ // Only consider defs that are live outs. We don't care about def /
+ // use within the same block.
+
+ // LiveVariables does not consider registers that are only used in a
+ // phi in a successor block as live out, unlike LiveIntervals.
+ //
+ // This is OK because SIFixSGPRCopies replaced any SGPR phis with
+ // VGPRs.
+ if (LV->isLiveOut(Def, *MBB))
+ SGPRLiveRanges.push_back(Def);
+ }
}
}
}
- }
- // Second pass fix the intervals
- for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
- BI != BE; ++BI) {
- MachineBasicBlock &MBB = *BI;
- if (MBB.succ_size() < 2)
+ if (MBB->succ_size() < 2)
continue;
- // We have structured control flow, so number of succesors should be two.
- assert(MBB.succ_size() == 2);
- MachineBasicBlock *SuccA = *MBB.succ_begin();
- MachineBasicBlock *SuccB = *(++MBB.succ_begin());
+ // We have structured control flow, so the number of successors should be
+ // two.
+ assert(MBB->succ_size() == 2);
+ MachineBasicBlock *SuccA = *MBB->succ_begin();
+ MachineBasicBlock *SuccB = *(++MBB->succ_begin());
MachineBasicBlock *NCD = PDT->findNearestCommonDominator(SuccA, SuccB);
if (!NCD)
@@ -156,37 +169,51 @@ bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) {
NCD = PDT->findNearestCommonDominator(*NCD->succ_begin(),
*(++NCD->succ_begin()));
}
- assert(SuccA && SuccB);
- for (std::pair<unsigned, LiveRange*> RegLR : SGPRLiveRanges) {
- unsigned Reg = RegLR.first;
- LiveRange *LR = RegLR.second;
-
- // FIXME: We could be smarter here. If the register is Live-In to
- // one block, but the other doesn't have any SGPR defs, then there
- // won't be a conflict. Also, if the branch decision is based on
- // a value in an SGPR, then there will be no conflict.
- bool LiveInToA = LIS->isLiveInToMBB(*LR, SuccA);
- bool LiveInToB = LIS->isLiveInToMBB(*LR, SuccB);
-
- if ((!LiveInToA && !LiveInToB) ||
- (LiveInToA && LiveInToB))
+
+ for (unsigned Reg : SGPRLiveRanges) {
+ // FIXME: We could be smarter here. If the register is Live-In to one
+ // block, but the other doesn't have any SGPR defs, then there won't be a
+ // conflict. Also, if the branch condition is uniform then there will be
+ // no conflict.
+ bool LiveInToA = LV->isLiveIn(Reg, *SuccA);
+ bool LiveInToB = LV->isLiveIn(Reg, *SuccB);
+
+ if (!LiveInToA && !LiveInToB) {
+ DEBUG(dbgs() << PrintReg(Reg, TRI, 0)
+ << " is live into neither successor\n");
continue;
+ }
+
+ if (LiveInToA && LiveInToB) {
+ DEBUG(dbgs() << PrintReg(Reg, TRI, 0)
+ << " is live into both successors\n");
+ continue;
+ }
// This interval is live in to one successor, but not the other, so
// we need to update its range so it is live in to both.
- DEBUG(dbgs() << "Possible SGPR conflict detected " << " in " << *LR <<
- " BB#" << SuccA->getNumber() << ", BB#" <<
- SuccB->getNumber() <<
- " with NCD = " << NCD->getNumber() << '\n');
+ DEBUG(dbgs() << "Possible SGPR conflict detected for "
+ << PrintReg(Reg, TRI, 0)
+ << " BB#" << SuccA->getNumber()
+ << ", BB#" << SuccB->getNumber()
+ << " with NCD = BB#" << NCD->getNumber() << '\n');
+
+ assert(TargetRegisterInfo::isVirtualRegister(Reg) &&
+ "Not expecting to extend live range of physreg");
// FIXME: Need to figure out how to update LiveRange here so this pass
// will be able to preserve LiveInterval analysis.
- BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(),
- TII->get(AMDGPU::SGPR_USE))
- .addReg(Reg, RegState::Implicit);
- DEBUG(NCD->getFirstNonPHI()->dump());
+ MachineInstr *NCDSGPRUse =
+ BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(),
+ TII->get(AMDGPU::SGPR_USE))
+ .addReg(Reg, RegState::Implicit);
+
+ MadeChange = true;
+ LV->HandleVirtRegUse(Reg, NCD, NCDSGPRUse);
+
+ DEBUG(NCDSGPRUse->dump());
}
}
- return false;
+ return MadeChange;
}
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index c2887255cc11..02a39307e74e 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -45,6 +45,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -164,8 +165,8 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
// Operand is not legal, so try to commute the instruction to
// see if this makes it possible to fold.
- unsigned CommuteIdx0;
- unsigned CommuteIdx1;
+ unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
+ unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1);
if (CanCommute) {
@@ -175,7 +176,16 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
OpNo = CommuteIdx0;
}
- if (!CanCommute || !TII->commuteInstruction(MI))
+ // One of operands might be an Imm operand, and OpNo may refer to it after
+ // the call of commuteInstruction() below. Such situations are avoided
+ // here explicitly as OpNo must be a register operand to be a candidate
+ // for memory folding.
+ if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
+ !MI->getOperand(CommuteIdx1).isReg()))
+ return false;
+
+ if (!CanCommute ||
+ !TII->commuteInstruction(MI, false, CommuteIdx0, CommuteIdx1))
return false;
if (!TII->isOperandLegal(MI, OpNo, OpToFold))
@@ -186,6 +196,110 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
return true;
}
+static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
+ unsigned UseOpIdx,
+ std::vector<FoldCandidate> &FoldList,
+ SmallVectorImpl<MachineInstr *> &CopiesToReplace,
+ const SIInstrInfo *TII, const SIRegisterInfo &TRI,
+ MachineRegisterInfo &MRI) {
+ const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
+
+ // FIXME: Fold operands with subregs.
+ if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) ||
+ UseOp.isImplicit())) {
+ return;
+ }
+
+ bool FoldingImm = OpToFold.isImm();
+ APInt Imm;
+
+ if (FoldingImm) {
+ unsigned UseReg = UseOp.getReg();
+ const TargetRegisterClass *UseRC
+ = TargetRegisterInfo::isVirtualRegister(UseReg) ?
+ MRI.getRegClass(UseReg) :
+ TRI.getPhysRegClass(UseReg);
+
+ Imm = APInt(64, OpToFold.getImm());
+
+ const MCInstrDesc &FoldDesc = TII->get(OpToFold.getParent()->getOpcode());
+ const TargetRegisterClass *FoldRC =
+ TRI.getRegClass(FoldDesc.OpInfo[0].RegClass);
+
+ // Split 64-bit constants into 32-bits for folding.
+ if (FoldRC->getSize() == 8 && UseOp.getSubReg()) {
+ if (UseRC->getSize() != 8)
+ return;
+
+ if (UseOp.getSubReg() == AMDGPU::sub0) {
+ Imm = Imm.getLoBits(32);
+ } else {
+ assert(UseOp.getSubReg() == AMDGPU::sub1);
+ Imm = Imm.getHiBits(32);
+ }
+ }
+
+ // In order to fold immediates into copies, we need to change the
+ // copy to a MOV.
+ if (UseMI->getOpcode() == AMDGPU::COPY) {
+ unsigned DestReg = UseMI->getOperand(0).getReg();
+ const TargetRegisterClass *DestRC
+ = TargetRegisterInfo::isVirtualRegister(DestReg) ?
+ MRI.getRegClass(DestReg) :
+ TRI.getPhysRegClass(DestReg);
+
+ unsigned MovOp = TII->getMovOpcode(DestRC);
+ if (MovOp == AMDGPU::COPY)
+ return;
+
+ UseMI->setDesc(TII->get(MovOp));
+ CopiesToReplace.push_back(UseMI);
+ }
+ }
+
+ // Special case for REG_SEQUENCE: We can't fold literals into
+ // REG_SEQUENCE instructions, so we have to fold them into the
+ // uses of REG_SEQUENCE.
+ if (UseMI->getOpcode() == AMDGPU::REG_SEQUENCE) {
+ unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
+ unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
+
+ for (MachineRegisterInfo::use_iterator
+ RSUse = MRI.use_begin(RegSeqDstReg),
+ RSE = MRI.use_end(); RSUse != RSE; ++RSUse) {
+
+ MachineInstr *RSUseMI = RSUse->getParent();
+ if (RSUse->getSubReg() != RegSeqDstSubReg)
+ continue;
+
+ foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
+ CopiesToReplace, TII, TRI, MRI);
+ }
+ return;
+ }
+
+ const MCInstrDesc &UseDesc = UseMI->getDesc();
+
+ // Don't fold into target independent nodes. Target independent opcodes
+ // don't have defined register classes.
+ if (UseDesc.isVariadic() ||
+ UseDesc.OpInfo[UseOpIdx].RegClass == -1)
+ return;
+
+ if (FoldingImm) {
+ MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
+ tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
+ return;
+ }
+
+ tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
+
+ // FIXME: We could try to change the instruction from 64-bit to 32-bit
+ // to enable more folding opportunities. The shrink operands pass
+ // already does this.
+ return;
+}
+
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
const SIInstrInfo *TII =
@@ -226,88 +340,36 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
OpToFold.getSubReg()))
continue;
+
+ // We need to mutate the operands of new mov instructions to add implicit
+ // uses of EXEC, but adding them invalidates the use_iterator, so defer
+ // this.
+ SmallVector<MachineInstr *, 4> CopiesToReplace;
+
std::vector<FoldCandidate> FoldList;
for (MachineRegisterInfo::use_iterator
Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();
Use != E; ++Use) {
MachineInstr *UseMI = Use->getParent();
- const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo());
- // FIXME: Fold operands with subregs.
- if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) ||
- UseOp.isImplicit())) {
- continue;
- }
-
- APInt Imm;
-
- if (FoldingImm) {
- unsigned UseReg = UseOp.getReg();
- const TargetRegisterClass *UseRC
- = TargetRegisterInfo::isVirtualRegister(UseReg) ?
- MRI.getRegClass(UseReg) :
- TRI.getPhysRegClass(UseReg);
-
- Imm = APInt(64, OpToFold.getImm());
-
- // Split 64-bit constants into 32-bits for folding.
- if (UseOp.getSubReg()) {
- if (UseRC->getSize() != 8)
- continue;
-
- if (UseOp.getSubReg() == AMDGPU::sub0) {
- Imm = Imm.getLoBits(32);
- } else {
- assert(UseOp.getSubReg() == AMDGPU::sub1);
- Imm = Imm.getHiBits(32);
- }
- }
-
- // In order to fold immediates into copies, we need to change the
- // copy to a MOV.
- if (UseMI->getOpcode() == AMDGPU::COPY) {
- unsigned DestReg = UseMI->getOperand(0).getReg();
- const TargetRegisterClass *DestRC
- = TargetRegisterInfo::isVirtualRegister(DestReg) ?
- MRI.getRegClass(DestReg) :
- TRI.getPhysRegClass(DestReg);
-
- unsigned MovOp = TII->getMovOpcode(DestRC);
- if (MovOp == AMDGPU::COPY)
- continue;
-
- UseMI->setDesc(TII->get(MovOp));
- }
- }
-
- const MCInstrDesc &UseDesc = UseMI->getDesc();
-
- // Don't fold into target independent nodes. Target independent opcodes
- // don't have defined register classes.
- if (UseDesc.isVariadic() ||
- UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1)
- continue;
-
- if (FoldingImm) {
- MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
- tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &ImmOp, TII);
- continue;
- }
-
- tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &OpToFold, TII);
-
- // FIXME: We could try to change the instruction from 64-bit to 32-bit
- // to enable more folding opportunites. The shrink operands pass
- // already does this.
+ foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList,
+ CopiesToReplace, TII, TRI, MRI);
}
+ // Make sure we add EXEC uses to any new v_mov instructions created.
+ for (MachineInstr *Copy : CopiesToReplace)
+ Copy->addImplicitDefUseOperands(MF);
+
for (FoldCandidate &Fold : FoldList) {
if (updateOperand(Fold, TRI)) {
// Clear kill flags.
if (!Fold.isImm()) {
assert(Fold.OpToFold && Fold.OpToFold->isReg());
- Fold.OpToFold->setIsKill(false);
+ // FIXME: Probably shouldn't bother trying to fold if not an
+ // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
+ // copies.
+ MRI.clearKillFlags(Fold.OpToFold->getReg());
}
DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
Fold.UseOpNo << " of " << *Fold.UseMI << '\n');
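The SIFoldOperands hunk above factors the per-use logic into a standalone foldOperand() helper and defers mutating newly converted COPY/MOV instructions (via CopiesToReplace) until after the use_iterator walk, because adding implicit operands while iterating over MRI uses would invalidate the iterator. A minimal standalone sketch of that defer-then-mutate pattern follows; the types and names are illustrative only and not part of the patch.

#include <vector>

struct Instr { bool needsImplicitOps = false; };

// Collect the instructions that need patching while iterating, then apply the
// changes once the traversal is finished so no live iterator is invalidated.
void rewriteUses(std::vector<Instr*> &uses) {
  std::vector<Instr*> toPatch;              // deferred work list
  for (Instr *use : uses)                   // analogous to the use_iterator walk
    if (use->needsImplicitOps)
      toPatch.push_back(use);               // record, but do not mutate yet
  for (Instr *use : toPatch)                // safe: the iteration above is done
    use->needsImplicitOps = false;          // stands in for addImplicitDefUseOperands()
}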
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
new file mode 100644
index 000000000000..6b3c81c3af74
--- /dev/null
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -0,0 +1,243 @@
+//===----------------------- SIFrameLowering.cpp --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+#include "SIFrameLowering.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+
+using namespace llvm;
+
+
+static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo,
+ const MachineFrameInfo *FrameInfo) {
+ if (!FuncInfo->hasSpilledSGPRs())
+ return false;
+
+ if (FuncInfo->hasSpilledVGPRs())
+ return false;
+
+ for (int I = FrameInfo->getObjectIndexBegin(),
+ E = FrameInfo->getObjectIndexEnd(); I != E; ++I) {
+ if (!FrameInfo->isSpillSlotObjectIndex(I))
+ return false;
+ }
+
+ return true;
+}
+
+static ArrayRef<MCPhysReg> getAllSGPR128() {
+ return makeArrayRef(AMDGPU::SReg_128RegClass.begin(),
+ AMDGPU::SReg_128RegClass.getNumRegs());
+}
+
+static ArrayRef<MCPhysReg> getAllSGPRs() {
+ return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
+ AMDGPU::SGPR_32RegClass.getNumRegs());
+}
+
+void SIFrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ if (!MF.getFrameInfo()->hasStackObjects())
+ return;
+
+ assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ // If we only have SGPR spills, we won't actually be using scratch memory
+ // since these spill to VGPRs.
+ //
+ // FIXME: We should be cleaning up these unused SGPR spill frame indices
+ // somewhere.
+ if (hasOnlySGPRSpills(MFI, MF.getFrameInfo()))
+ return;
+
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+
+ // We need to insert initialization of the scratch resource descriptor.
+ unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
+ assert(ScratchRsrcReg != AMDGPU::NoRegister);
+
+ unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
+ assert(ScratchWaveOffsetReg != AMDGPU::NoRegister);
+
+ unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+
+ unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
+ if (ST.isAmdHsaOS()) {
+ PreloadedPrivateBufferReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+ }
+
+ // If we reserved the original input registers, we don't need to copy to the
+ // reserved registers.
+ if (ScratchRsrcReg == PreloadedPrivateBufferReg) {
+ // We should always reserve these 5 registers at the same time.
+ assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg &&
+ "scratch wave offset and private segment buffer inconsistent");
+ return;
+ }
+
+
+ // We added live-ins during argument lowering, but since they were not used
+ // they were deleted. We're adding the uses now, so add them back.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
+ MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
+
+ if (ST.isAmdHsaOS()) {
+ MRI.addLiveIn(PreloadedPrivateBufferReg);
+ MBB.addLiveIn(PreloadedPrivateBufferReg);
+ }
+
+ // We reserved the last registers for this. Shift it down to the end of those
+ // which were actually used.
+ //
+ // FIXME: It might be safer to use a pseudoregister before replacement.
+
+ // FIXME: We should be able to eliminate unused input registers; the only
+ // ones we cannot eliminate are the resources required for scratch access. For now we
+ // skip over user SGPRs and may leave unused holes.
+
+ // We find the resource first because it has an alignment requirement.
+ if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
+ // Skip the last 2 elements because the last one is reserved for VCC, and
+ // this is the 2nd to last element already.
+ for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) {
+ // Pick the first unallocated one. Make sure we don't clobber the other
+ // reserved input we needed.
+ if (!MRI.isPhysRegUsed(Reg)) {
+ assert(MRI.isAllocatable(Reg));
+ MRI.replaceRegWith(ScratchRsrcReg, Reg);
+ ScratchRsrcReg = Reg;
+ MFI->setScratchRSrcReg(ScratchRsrcReg);
+ break;
+ }
+ }
+ }
+
+ if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ // Skip the last 2 elements because the last one is reserved for VCC, and
+ // this is the 2nd to last element already.
+ unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+ for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) {
+ // Pick the first unallocated SGPR. Be careful not to pick an alias of the
+ // scratch descriptor, since we haven't added its uses yet.
+ if (!MRI.isPhysRegUsed(Reg)) {
+ assert(MRI.isAllocatable(Reg) &&
+ !TRI->isSubRegisterEq(ScratchRsrcReg, Reg));
+
+ MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
+ ScratchWaveOffsetReg = Reg;
+ MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+ break;
+ }
+ }
+ }
+
+
+ assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));
+
+ const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
+ MachineBasicBlock::iterator I = MBB.begin();
+ DebugLoc DL;
+
+ if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
+ // Make sure we emit the copy for the offset first. We may have chosen to copy
+ // the buffer resource into a register that aliases the input offset register.
+ BuildMI(MBB, I, DL, SMovB32, ScratchWaveOffsetReg)
+ .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
+ }
+
+ if (ST.isAmdHsaOS()) {
+ // Insert copies from argument register.
+ assert(
+ !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) &&
+ !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg));
+
+ unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
+ unsigned Rsrc23 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3);
+
+ unsigned Lo = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub0_sub1);
+ unsigned Hi = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub2_sub3);
+
+ const MCInstrDesc &SMovB64 = TII->get(AMDGPU::S_MOV_B64);
+
+ BuildMI(MBB, I, DL, SMovB64, Rsrc01)
+ .addReg(Lo, RegState::Kill);
+ BuildMI(MBB, I, DL, SMovB64, Rsrc23)
+ .addReg(Hi, RegState::Kill);
+ } else {
+ unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+ unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+ unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
+ unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
+
+ // Use relocations to get the pointer, and setup the other bits manually.
+ uint64_t Rsrc23 = TII->getScratchRsrcWords23();
+ BuildMI(MBB, I, DL, SMovB32, Rsrc0)
+ .addExternalSymbol("SCRATCH_RSRC_DWORD0")
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ BuildMI(MBB, I, DL, SMovB32, Rsrc1)
+ .addExternalSymbol("SCRATCH_RSRC_DWORD1")
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ BuildMI(MBB, I, DL, SMovB32, Rsrc2)
+ .addImm(Rsrc23 & 0xffffffff)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ BuildMI(MBB, I, DL, SMovB32, Rsrc3)
+ .addImm(Rsrc23 >> 32)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ }
+
+ // Make the register selected live throughout the function.
+ for (MachineBasicBlock &OtherBB : MF) {
+ if (&OtherBB == &MBB)
+ continue;
+
+ OtherBB.addLiveIn(ScratchRsrcReg);
+ OtherBB.addLiveIn(ScratchWaveOffsetReg);
+ }
+}
+
+void SIFrameLowering::processFunctionBeforeFrameFinalized(
+ MachineFunction &MF,
+ RegScavenger *RS) const {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ if (!MFI->hasStackObjects())
+ return;
+
+ bool MayNeedScavengingEmergencySlot = MFI->hasStackObjects();
+
+ assert((RS || !MayNeedScavengingEmergencySlot) &&
+ "RegScavenger required if spilling");
+
+ if (MayNeedScavengingEmergencySlot) {
+ int ScavengeFI = MFI->CreateSpillStackObject(
+ AMDGPU::SGPR_32RegClass.getSize(),
+ AMDGPU::SGPR_32RegClass.getAlignment());
+ RS->addScavengingFrameIndex(ScavengeFI);
+ }
+}
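The non-HSA prologue in the new SIFrameLowering.cpp above materializes the scratch resource descriptor by loading SCRATCH_RSRC_DWORD0/1 through external-symbol relocations and writing words 2 and 3 from a 64-bit constant split into its low and high halves. A small self-contained sketch of that 64-bit split, mirroring the Rsrc23 arithmetic above:

#include <cstdint>
#include <utility>

// Returns the two 32-bit words written into Rsrc2 and Rsrc3: the low word of
// the 64-bit descriptor constant first, then the high word.
std::pair<uint32_t, uint32_t> splitRsrcWords23(uint64_t Rsrc23) {
  uint32_t Lo = static_cast<uint32_t>(Rsrc23 & 0xffffffffu);
  uint32_t Hi = static_cast<uint32_t>(Rsrc23 >> 32);
  return {Lo, Hi};
}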
diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h
new file mode 100644
index 000000000000..a9152fd8b2aa
--- /dev/null
+++ b/lib/Target/AMDGPU/SIFrameLowering.h
@@ -0,0 +1,34 @@
+//===--------------------- SIFrameLowering.h --------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H
+
+#include "AMDGPUFrameLowering.h"
+
+namespace llvm {
+
+class SIFrameLowering final : public AMDGPUFrameLowering {
+public:
+ SIFrameLowering(StackDirection D, unsigned StackAl, int LAO,
+ unsigned TransAl = 1) :
+ AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
+ ~SIFrameLowering() override {}
+
+ void emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const override;
+
+ void processFunctionBeforeFrameFinalized(
+ MachineFunction &MF,
+ RegScavenger *RS = nullptr) const override;
+};
+
+}
+
+#endif
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index c2db9ff537e9..0e043cb47da7 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -20,6 +20,7 @@
#include "SIISelLowering.h"
#include "AMDGPU.h"
+#include "AMDGPUDiagnosticInfoUnsupported.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
@@ -51,6 +52,9 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
+ addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
+ addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
+
addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
@@ -103,6 +107,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
setOperationAction(ISD::BSWAP, MVT::i32, Legal);
+ setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
@@ -155,13 +160,30 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
for (MVT VT : MVT::fp_valuetypes())
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
+
setTruncStoreAction(MVT::i64, MVT::i32, Expand);
setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
+
+ setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
+
+ setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
+ setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
+
setOperationAction(ISD::LOAD, MVT::i1, Custom);
+ setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
+
+ setOperationAction(ISD::STORE, MVT::v2i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
+
+ setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
+
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
@@ -173,9 +195,14 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
setOperationAction(ISD::SELECT, MVT::i1, Promote);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
+
+
+ setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
+
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
- for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}) {
+ for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch(Op) {
case ISD::LOAD:
@@ -186,6 +213,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
case ISD::INSERT_VECTOR_ELT:
case ISD::INSERT_SUBVECTOR:
case ISD::EXTRACT_SUBVECTOR:
+ case ISD::SCALAR_TO_VECTOR:
break;
case ISD::CONCAT_VECTORS:
setOperationAction(Op, VT, Custom);
@@ -197,6 +225,22 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
}
}
+ // Most operations are naturally 32-bit vector operations. We only support
+ // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
+ for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
+ setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
+ }
+
if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
@@ -261,6 +305,41 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
}
+bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
+ // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
+ // additionally can do r + r + i with addr64. 32-bit has more addressing
+ // mode options. Depending on the resource constant, it can also do
+ // (i64 r0) + (i32 r1) * (i14 i).
+ //
+ // Private arrays end up using a scratch buffer most of the time, so also
+ // assume those use MUBUF instructions. Scratch loads / stores are currently
+ // implemented as mubuf instructions with offen bit set, so slightly
+ // different than the normal addr64.
+ if (!isUInt<12>(AM.BaseOffs))
+ return false;
+
+ // FIXME: Since we can split immediate into soffset and immediate offset,
+ // would it make sense to allow any immediate?
+
+ switch (AM.Scale) {
+ case 0: // r + i or just i, depending on HasBaseReg.
+ return true;
+ case 1:
+ return true; // We have r + r or r + i.
+ case 2:
+ if (AM.HasBaseReg) {
+ // Reject 2 * r + r.
+ return false;
+ }
+
+ // Allow 2 * r as r + r
+ // Or 2 * r + i is allowed as r + r + i.
+ return true;
+ default: // Don't allow n * r
+ return false;
+ }
+}
+
bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS) const {
@@ -269,7 +348,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
return false;
switch (AS) {
- case AMDGPUAS::GLOBAL_ADDRESS:
+ case AMDGPUAS::GLOBAL_ADDRESS: {
if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
// Assume the we will use FLAT for all global memory accesses
// on VI.
@@ -282,51 +361,51 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
// because it has never been validated.
return isLegalFlatAddressingMode(AM);
}
- // fall-through
- case AMDGPUAS::PRIVATE_ADDRESS:
- case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions?
- case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: {
- // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
- // additionally can do r + r + i with addr64. 32-bit has more addressing
- // mode options. Depending on the resource constant, it can also do
- // (i64 r0) + (i32 r1) * (i14 i).
- //
- // SMRD instructions have an 8-bit, dword offset.
- //
- // Assume nonunifom access, since the address space isn't enough to know
- // what instruction we will use, and since we don't know if this is a load
- // or store and scalar stores are only available on VI.
- //
- // We also know if we are doing an extload, we can't do a scalar load.
- //
- // Private arrays end up using a scratch buffer most of the time, so also
- // assume those use MUBUF instructions. Scratch loads / stores are currently
- // implemented as mubuf instructions with offen bit set, so slightly
- // different than the normal addr64.
- if (!isUInt<12>(AM.BaseOffs))
- return false;
- // FIXME: Since we can split immediate into soffset and immediate offset,
- // would it make sense to allow any immediate?
+ return isLegalMUBUFAddressingMode(AM);
+ }
+ case AMDGPUAS::CONSTANT_ADDRESS: {
+ // If the offset isn't a multiple of 4, it probably isn't going to be
+ // correctly aligned.
+ if (AM.BaseOffs % 4 != 0)
+ return isLegalMUBUFAddressingMode(AM);
+
+ // There are no SMRD extloads, so if we have to do a small type access we
+ // will use a MUBUF load.
+ // FIXME?: We also need to do this if unaligned, but we don't know the
+ // alignment here.
+ if (DL.getTypeStoreSize(Ty) < 4)
+ return isLegalMUBUFAddressingMode(AM);
+
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ // SMRD instructions have an 8-bit, dword offset on SI.
+ if (!isUInt<8>(AM.BaseOffs / 4))
+ return false;
+ } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
+ // On CI+, this can also be a 32-bit literal constant offset. If it fits
+ // in 8-bits, it can use a smaller encoding.
+ if (!isUInt<32>(AM.BaseOffs / 4))
+ return false;
+ } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ // On VI, these use the SMEM format and the offset is 20-bit in bytes.
+ if (!isUInt<20>(AM.BaseOffs))
+ return false;
+ } else
+ llvm_unreachable("unhandled generation");
- switch (AM.Scale) {
- case 0: // r + i or just i, depending on HasBaseReg.
+ if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
return true;
- case 1:
- return true; // We have r + r or r + i.
- case 2:
- if (AM.HasBaseReg) {
- // Reject 2 * r + r.
- return false;
- }
- // Allow 2 * r as r + r
- // Or 2 * r + i is allowed as r + r + i.
+ if (AM.Scale == 1 && AM.HasBaseReg)
return true;
- default: // Don't allow n * r
- return false;
- }
+
+ return false;
}
+
+ case AMDGPUAS::PRIVATE_ADDRESS:
+ case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
+ return isLegalMUBUFAddressingMode(AM);
+
case AMDGPUAS::LOCAL_ADDRESS:
case AMDGPUAS::REGION_ADDRESS: {
// Basic, single offset DS instructions allow a 16-bit unsigned immediate
@@ -374,7 +453,10 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
// ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
// aligned, 8 byte access in a single operation using ds_read2/write2_b32
// with adjacent offsets.
- return Align % 4 == 0;
+ bool AlignedBy4 = (Align % 4 == 0);
+ if (IsFast)
+ *IsFast = AlignedBy4;
+ return AlignedBy4;
}
// Smaller than dword value must be aligned.
@@ -411,6 +493,32 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
return MVT::Other;
}
+static bool isFlatGlobalAddrSpace(unsigned AS) {
+ return AS == AMDGPUAS::GLOBAL_ADDRESS ||
+ AS == AMDGPUAS::FLAT_ADDRESS ||
+ AS == AMDGPUAS::CONSTANT_ADDRESS;
+}
+
+bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
+ unsigned DestAS) const {
+ return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
+}
+
+
+bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
+ const MemSDNode *MemNode = cast<MemSDNode>(N);
+ const Value *Ptr = MemNode->getMemOperand()->getValue();
+
+ // UndefValue means this is a load of a kernel input. These are uniform.
+ // Sometimes LDS instructions have constant pointers
+ if (isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || isa<Constant>(Ptr) ||
+ isa<GlobalValue>(Ptr))
+ return true;
+
+ const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
+ return I && I->getMetadata("amdgpu.uniform");
+}
+
TargetLoweringBase::LegalizeTypeAction
SITargetLowering::getPreferredVectorAction(EVT VT) const {
if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
@@ -426,12 +534,6 @@ bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
return TII->isInlineConstant(Imm);
}
-static EVT toIntegerVT(EVT VT) {
- if (VT.isVector())
- return VT.changeVectorElementTypeToInteger();
- return MVT::getIntegerVT(VT.getSizeInBits());
-}
-
SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
SDLoc SL, SDValue Chain,
unsigned Offset, bool Signed) const {
@@ -439,7 +541,7 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
MachineFunction &MF = DAG.getMachineFunction();
const SIRegisterInfo *TRI =
static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
- unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
+ unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
Type *Ty = VT.getTypeForEVT(*DAG.getContext());
@@ -455,30 +557,10 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
unsigned Align = DL.getABITypeAlignment(Ty);
- if (VT != MemVT && VT.isFloatingPoint()) {
- // Do an integer load and convert.
- // FIXME: This is mostly because load legalization after type legalization
- // doesn't handle FP extloads.
- assert(VT.getScalarType() == MVT::f32 &&
- MemVT.getScalarType() == MVT::f16);
-
- EVT IVT = toIntegerVT(VT);
- EVT MemIVT = toIntegerVT(MemVT);
- SDValue Load = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD,
- IVT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemIVT,
- false, // isVolatile
- true, // isNonTemporal
- true, // isInvariant
- Align); // Alignment
- SDValue Ops[] = {
- DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load),
- Load.getValue(1)
- };
-
- return DAG.getMergeValues(Ops, SL);
- }
-
ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+ if (MemVT.isFloatingPoint())
+ ExtTy = ISD::EXTLOAD;
+
return DAG.getLoad(ISD::UNINDEXED, ExtTy,
VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT,
false, // isVolatile
@@ -497,8 +579,16 @@ SDValue SITargetLowering::LowerFormalArguments(
MachineFunction &MF = DAG.getMachineFunction();
FunctionType *FType = MF.getFunction()->getFunctionType();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+
+ if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) {
+ const Function *Fn = MF.getFunction();
+ DiagnosticInfoUnsupported NoGraphicsHSA(*Fn, "non-compute shaders with HSA");
+ DAG.getContext()->diagnose(NoGraphicsHSA);
+ return SDValue();
+ }
- assert(CallConv == CallingConv::C);
+ // FIXME: We currently assume all calling conventions are kernels.
SmallVector<ISD::InputArg, 16> Splits;
BitVector Skipped(Ins.size());
@@ -513,7 +603,7 @@ SDValue SITargetLowering::LowerFormalArguments(
assert((PSInputNum <= 15) && "Too many PS inputs!");
if (!Arg.Used) {
- // We can savely skip PS inputs
+ // We can safely skip PS inputs
Skipped.set(i);
++PSInputNum;
continue;
@@ -530,7 +620,7 @@ SDValue SITargetLowering::LowerFormalArguments(
// We REALLY want the ORIGINAL number of vertex elements here, e.g. a
// three or five element vertex only needs three or five registers,
- // NOT four or eigth.
+ // NOT four or eight.
Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
unsigned NumElements = ParamType->getVectorNumElements();
@@ -556,41 +646,30 @@ SDValue SITargetLowering::LowerFormalArguments(
CCInfo.AllocateReg(AMDGPU::VGPR1);
}
- // The pointer to the list of arguments is stored in SGPR0, SGPR1
- // The pointer to the scratch buffer is stored in SGPR2, SGPR3
- if (Info->getShaderType() == ShaderType::COMPUTE) {
- if (Subtarget->isAmdHsaOS())
- Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers.
- else
- Info->NumUserSGPRs = 4;
-
- unsigned InputPtrReg =
- TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
- unsigned InputPtrRegLo =
- TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0);
- unsigned InputPtrRegHi =
- TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1);
-
- unsigned ScratchPtrReg =
- TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
- unsigned ScratchPtrRegLo =
- TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0);
- unsigned ScratchPtrRegHi =
- TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1);
-
- CCInfo.AllocateReg(InputPtrRegLo);
- CCInfo.AllocateReg(InputPtrRegHi);
- CCInfo.AllocateReg(ScratchPtrRegLo);
- CCInfo.AllocateReg(ScratchPtrRegHi);
- MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
- MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass);
- }
-
if (Info->getShaderType() == ShaderType::COMPUTE) {
getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
Splits);
}
+ // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
+ if (Info->hasPrivateSegmentBuffer()) {
+ unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
+ MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
+ CCInfo.AllocateReg(PrivateSegmentBufferReg);
+ }
+
+ if (Info->hasDispatchPtr()) {
+ unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
+ MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass);
+ CCInfo.AllocateReg(DispatchPtrReg);
+ }
+
+ if (Info->hasKernargSegmentPtr()) {
+ unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
+ MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
+ CCInfo.AllocateReg(InputPtrReg);
+ }
+
AnalyzeFormalArguments(CCInfo, Splits);
SmallVector<SDValue, 16> Chains;
@@ -617,7 +696,7 @@ SDValue SITargetLowering::LowerFormalArguments(
Offset, Ins[i].Flags.isSExt());
Chains.push_back(Arg.getValue(1));
- const PointerType *ParamTy =
+ auto *ParamTy =
dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
@@ -678,10 +757,113 @@ SDValue SITargetLowering::LowerFormalArguments(
InVals.push_back(Val);
}
- if (Info->getShaderType() != ShaderType::COMPUTE) {
- unsigned ScratchIdx = CCInfo.getFirstUnallocated(ArrayRef<MCPhysReg>(
- AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs()));
- Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);
+ // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
+ // these from the dispatch pointer.
+
+ // Start adding system SGPRs.
+ if (Info->hasWorkGroupIDX()) {
+ unsigned Reg = Info->addWorkGroupIDX();
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ } else
+ llvm_unreachable("work group id x is always enabled");
+
+ if (Info->hasWorkGroupIDY()) {
+ unsigned Reg = Info->addWorkGroupIDY();
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info->hasWorkGroupIDZ()) {
+ unsigned Reg = Info->addWorkGroupIDZ();
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info->hasWorkGroupInfo()) {
+ unsigned Reg = Info->addWorkGroupInfo();
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info->hasPrivateSegmentWaveByteOffset()) {
+ // Scratch wave offset passed in system SGPR.
+ unsigned PrivateSegmentWaveByteOffsetReg
+ = Info->addPrivateSegmentWaveByteOffset();
+
+ MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
+ CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
+ }
+
+ // Now that we've figured out where the scratch register inputs are, see if
+ // we should reserve the arguments and use them directly.
+
+ bool HasStackObjects = MF.getFrameInfo()->hasStackObjects();
+
+ if (ST.isAmdHsaOS()) {
+ // TODO: Assume we will spill without optimizations.
+ if (HasStackObjects) {
+ // If we have stack objects, we unquestionably need the private buffer
+ // resource. For the HSA ABI, this will be the first 4 user SGPR
+ // inputs. We can reserve those and use them directly.
+
+ unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+ Info->setScratchRSrcReg(PrivateSegmentBufferReg);
+
+ unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
+ } else {
+ unsigned ReservedBufferReg
+ = TRI->reservedPrivateSegmentBufferReg(MF);
+ unsigned ReservedOffsetReg
+ = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
+
+ // We tentatively reserve the last registers (skipping the last two
+ // which may contain VCC). After register allocation, we'll replace
+ // these with the ones immediately after those which were really
+ // allocated. In the prologue copies will be inserted from the argument
+ // to these reserved registers.
+ Info->setScratchRSrcReg(ReservedBufferReg);
+ Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+ }
+ } else {
+ unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF);
+
+ // Without HSA, relocations are used for the scratch pointer and the
+ // buffer resource setup is always inserted in the prologue. Scratch wave
+ // offset is still in an input SGPR.
+ Info->setScratchRSrcReg(ReservedBufferReg);
+
+ if (HasStackObjects) {
+ unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+ } else {
+ unsigned ReservedOffsetReg
+ = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
+ Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+ }
+ }
+
+ if (Info->hasWorkItemIDX()) {
+ unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ } else
+ llvm_unreachable("workitem id x should always be enabled");
+
+ if (Info->hasWorkItemIDY()) {
+ unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info->hasWorkItemIDZ()) {
+ unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
}
if (Chains.empty())
@@ -693,27 +875,11 @@ SDValue SITargetLowering::LowerFormalArguments(
MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
MachineInstr * MI, MachineBasicBlock * BB) const {
- MachineBasicBlock::iterator I = *MI;
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
-
switch (MI->getOpcode()) {
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
case AMDGPU::BRANCH:
return BB;
- case AMDGPU::SI_RegisterStorePseudo: {
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- MachineInstrBuilder MIB =
- BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore),
- Reg);
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i)
- MIB.addOperand(MI->getOperand(i));
-
- MI->eraseFromParent();
- break;
- }
}
return BB;
}
@@ -944,20 +1110,8 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
const GlobalValue *GV = GSD->getGlobal();
MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace());
- SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT);
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
-
- SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr,
- DAG.getConstant(0, DL, MVT::i32));
- SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr,
- DAG.getConstant(1, DL, MVT::i32));
-
- SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue),
- PtrLo, GA);
- SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue),
- PtrHi, DAG.getConstant(0, DL, MVT::i32),
- SDValue(Lo.getNode(), 1));
- return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
+ return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT, GA);
}
SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL,
@@ -977,6 +1131,18 @@ SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL,
// a glue result.
}
+SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
+ SDValue Op,
+ MVT VT,
+ unsigned Offset) const {
+ SDLoc SL(Op);
+ SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL,
+ DAG.getEntryNode(), Offset, false);
+ // The local size values will have the hi 16-bits as zero.
+ return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
+ DAG.getValueType(VT));
+}
+
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
@@ -988,7 +1154,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDLoc DL(Op);
unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ // TODO: Should this propagate fast-math-flags?
+
switch (IntrinsicID) {
+ case Intrinsic::amdgcn_dispatch_ptr:
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass,
+ TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT);
+
case Intrinsic::r600_read_ngroups_x:
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::NGROUPS_X, false);
@@ -1008,37 +1180,36 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
case Intrinsic::r600_read_local_size_x:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::LOCAL_SIZE_X, false);
+ return lowerImplicitZextParam(DAG, Op, MVT::i16,
+ SI::KernelInputOffsets::LOCAL_SIZE_X);
case Intrinsic::r600_read_local_size_y:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::LOCAL_SIZE_Y, false);
+ return lowerImplicitZextParam(DAG, Op, MVT::i16,
+ SI::KernelInputOffsets::LOCAL_SIZE_Y);
case Intrinsic::r600_read_local_size_z:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::LOCAL_SIZE_Z, false);
-
+ return lowerImplicitZextParam(DAG, Op, MVT::i16,
+ SI::KernelInputOffsets::LOCAL_SIZE_Z);
case Intrinsic::AMDGPU_read_workdim:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- getImplicitParameterOffset(MFI, GRID_DIM), false);
-
+ // Really only 2 bits.
+ return lowerImplicitZextParam(DAG, Op, MVT::i8,
+ getImplicitParameterOffset(MFI, GRID_DIM));
case Intrinsic::r600_read_tgid_x:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT);
+ TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT);
case Intrinsic::r600_read_tgid_y:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT);
+ TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT);
case Intrinsic::r600_read_tgid_z:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT);
+ TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
case Intrinsic::r600_read_tidig_x:
return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT);
+ TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT);
case Intrinsic::r600_read_tidig_y:
return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT);
+ TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT);
case Intrinsic::r600_read_tidig_z:
return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT);
+ TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
case AMDGPUIntrinsic::SI_load_const: {
SDValue Ops[] = {
Op.getOperand(1),
@@ -1077,6 +1248,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
DAG.getConstant(2, DL, MVT::i32), // P0
Op.getOperand(1), Op.getOperand(2), Glue);
}
+ case AMDGPUIntrinsic::SI_packf16:
+ if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef())
+ return DAG.getUNDEF(MVT::i32);
+ return Op;
case AMDGPUIntrinsic::SI_fs_interp: {
SDValue IJ = Op.getOperand(4);
SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
@@ -1092,6 +1267,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J,
Op.getOperand(1), Op.getOperand(2), Glue);
}
+ case Intrinsic::amdgcn_interp_p1: {
+ SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
+ SDValue Glue = M0.getValue(1);
+ return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3), Glue);
+ }
+ case Intrinsic::amdgcn_interp_p2: {
+ SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
+ SDValue Glue = SDValue(M0.getNode(), 1);
+ return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
+ Glue);
+ }
default:
return AMDGPUTargetLowering::LowerOperation(Op, DAG);
}
@@ -1152,16 +1340,29 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
"Custom lowering for non-i32 vectors hasn't been implemented.");
unsigned NumElements = Op.getValueType().getVectorNumElements();
assert(NumElements != 2 && "v2 loads are supported for all address spaces.");
+
switch (Load->getAddressSpace()) {
default: break;
+ case AMDGPUAS::CONSTANT_ADDRESS:
+ if (isMemOpUniform(Load))
+ break;
+ // Non-uniform loads will be selected to MUBUF instructions, so they
+ // have the same legalization requirements as global and private
+ // loads.
+ //
+ // Fall-through
case AMDGPUAS::GLOBAL_ADDRESS:
case AMDGPUAS::PRIVATE_ADDRESS:
+ if (NumElements >= 8)
+ return SplitVectorLoad(Op, DAG);
+
// v4 loads are supported for private and global memory.
if (NumElements <= 4)
break;
// fall-through
case AMDGPUAS::LOCAL_ADDRESS:
- return ScalarizeVectorLoad(Op, DAG);
+ // If properly aligned, splitting may let us use ds_read_b64.
+ return SplitVectorLoad(Op, DAG);
}
}
@@ -1236,8 +1437,10 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
if (Unsafe) {
// Turn into multiply by the reciprocal.
// x / y -> x * (1.0 / y)
+ SDNodeFlags Flags;
+ Flags.setUnsafeAlgebra(true);
SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
- return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip);
+ return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags);
}
return SDValue();
@@ -1274,6 +1477,8 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
+ // TODO: Should this propagate fast-math-flags?
+
r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
@@ -1379,7 +1584,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return Ret;
if (VT.isVector() && VT.getVectorNumElements() >= 8)
- return ScalarizeVectorStore(Op, DAG);
+ return SplitVectorStore(Op, DAG);
if (VT == MVT::i1)
return DAG.getTruncStore(Store->getChain(), DL,
@@ -1393,6 +1598,7 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue Arg = Op.getOperand(0);
+ // TODO: Should this propagate fast-math-flags?
SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
DAG.getNode(ISD::FMUL, DL, VT, Arg,
DAG.getConstantFP(0.5/M_PI, DL,
@@ -2125,9 +2331,14 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
- TII->legalizeOperands(MI);
- if (TII->isMIMG(MI->getOpcode())) {
+ if (TII->isVOP3(MI->getOpcode())) {
+ // Make sure constant bus requirements are respected.
+ TII->legalizeOperandsVOP3(MRI, MI);
+ return;
+ }
+
+ if (TII->isMIMG(*MI)) {
unsigned VReg = MI->getOperand(0).getReg();
unsigned Writemask = MI->getOperand(1).getImm();
unsigned BitsSet = 0;
@@ -2169,53 +2380,38 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
SDLoc DL,
SDValue Ptr) const {
const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
-#if 1
- // XXX - Workaround for moveToVALU not handling different register class
- // inserts for REG_SEQUENCE.
-
- // Build the half of the subregister with the constants.
- const SDValue Ops0[] = {
- DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
- buildSMovImm32(DAG, DL, 0),
- DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
- buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
- DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
- };
-
- SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
- MVT::v2i32, Ops0), 0);
-
- // Combine the constants and the pointer.
- const SDValue Ops1[] = {
- DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
- Ptr,
- DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
- SubRegHi,
- DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
- };
+ static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+
+ // Build the half of the subregister with the constants before building the
+ // full 128-bit register. If we are building multiple resource descriptors,
+ // this will allow CSEing of the 2-component register.
+ const SDValue Ops0[] = {
+ DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
+ buildSMovImm32(DAG, DL, 0),
+ DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
+ buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
+ DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
+ };
- return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
-#else
- const SDValue Ops[] = {
- DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32),
- Ptr,
- DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32),
- buildSMovImm32(DAG, DL, 0),
- DAG.getTargetConstant(AMDGPU::sub2, MVT::i32),
- buildSMovImm32(DAG, DL, TII->getDefaultRsrcFormat() >> 32),
- DAG.getTargetConstant(AMDGPU::sub3, MVT::i32)
- };
+ SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+ MVT::v2i32, Ops0), 0);
- return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
+ // Combine the constants and the pointer.
+ const SDValue Ops1[] = {
+ DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
+ Ptr,
+ DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
+ SubRegHi,
+ DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
+ };
-#endif
+ return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
}
/// \brief Return a resource descriptor with the 'Add TID' bit enabled
-/// The TID (Thread ID) is multipled by the stride value (bits [61:48]
-/// of the resource descriptor) to create an offset, which is added to the
-/// resource ponter.
+/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
+/// of the resource descriptor) to create an offset, which is added to
+/// the resource pointer.
MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG,
SDLoc DL,
SDValue Ptr,
@@ -2248,15 +2444,6 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG,
return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
}
-MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG,
- SDLoc DL,
- SDValue Ptr) const {
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
-
- return buildRSRC(DAG, DL, Ptr, 0, TII->getScratchRsrcWords23());
-}
-
SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
const TargetRegisterClass *RC,
unsigned Reg, EVT VT) const {
@@ -2274,13 +2461,41 @@ std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const {
- if (Constraint == "r") {
- switch(VT.SimpleTy) {
- default: llvm_unreachable("Unhandled type for 'r' inline asm constraint");
- case MVT::i64:
- return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
- case MVT::i32:
+
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 's':
+ case 'r':
+ switch (VT.getSizeInBits()) {
+ default:
+ return std::make_pair(0U, nullptr);
+ case 32:
return std::make_pair(0U, &AMDGPU::SGPR_32RegClass);
+ case 64:
+ return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
+ case 128:
+ return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
+ case 256:
+ return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
+ }
+
+ case 'v':
+ switch (VT.getSizeInBits()) {
+ default:
+ return std::make_pair(0U, nullptr);
+ case 32:
+ return std::make_pair(0U, &AMDGPU::VGPR_32RegClass);
+ case 64:
+ return std::make_pair(0U, &AMDGPU::VReg_64RegClass);
+ case 96:
+ return std::make_pair(0U, &AMDGPU::VReg_96RegClass);
+ case 128:
+ return std::make_pair(0U, &AMDGPU::VReg_128RegClass);
+ case 256:
+ return std::make_pair(0U, &AMDGPU::VReg_256RegClass);
+ case 512:
+ return std::make_pair(0U, &AMDGPU::VReg_512RegClass);
+ }
}
}
@@ -2301,3 +2516,16 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
}
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
+
+SITargetLowering::ConstraintType
+SITargetLowering::getConstraintType(StringRef Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ default: break;
+ case 's':
+ case 'v':
+ return C_RegisterClass;
+ }
+ }
+ return TargetLowering::getConstraintType(Constraint);
+}
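The constraint changes above classify 's' and 'v' as register-class constraints and select an SGPR or VGPR class from the operand's bit width. A hypothetical user-level illustration follows, assuming Clang's GCC-style extended inline asm on an amdgcn target; the mnemonic and the choice of constraints here are assumptions for illustration, not taken from this patch.

// With the patch, "v" binds a 32-bit operand to VGPR_32 and "s" would bind it
// to SGPR_32; the compiler substitutes concrete register names for %0 / %1.
int copy_through_vgpr(int x) {
  int y;
  __asm__ volatile("v_mov_b32 %0, %1" : "=v"(y) : "v"(x));
  return y;
}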
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index d84c32ec0092..e2f8cb19d6be 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -28,6 +28,9 @@ class SITargetLowering : public AMDGPUTargetLowering {
SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const override;
+ SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
+ MVT VT, unsigned Offset) const;
+
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
@@ -57,6 +60,7 @@ class SITargetLowering : public AMDGPUTargetLowering {
SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
bool isLegalFlatAddressingMode(const AddrMode &AM) const;
+ bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
public:
SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI);
@@ -76,6 +80,9 @@ public:
bool MemcpyStrSrc,
MachineFunction &MF) const override;
+ bool isMemOpUniform(const SDNode *N) const;
+ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
+
TargetLoweringBase::LegalizeTypeAction
getPreferredVectorAction(EVT VT) const override;
@@ -112,13 +119,10 @@ public:
SDValue Ptr,
uint32_t RsrcDword1,
uint64_t RsrcDword2And3) const;
- MachineSDNode *buildScratchRSRC(SelectionDAG &DAG,
- SDLoc DL,
- SDValue Ptr) const;
-
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
+ ConstraintType getConstraintType(StringRef Constraint) const override;
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const;
};
diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp
index 90a37f174682..821aada526c7 100644
--- a/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -91,7 +91,8 @@ private:
bool isOpRelevant(MachineOperand &Op);
/// \brief Get register interval an operand affects.
- RegInterval getRegInterval(MachineOperand &Op);
+ RegInterval getRegInterval(const TargetRegisterClass *RC,
+ const MachineOperand &Reg) const;
/// \brief Handle instructions async components
void pushInstruction(MachineBasicBlock &MBB,
@@ -121,9 +122,13 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
const char *getPassName() const override {
- return "SI insert wait instructions";
+ return "SI insert wait instructions";
}
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
};
} // End anonymous namespace
@@ -138,9 +143,8 @@ FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) {
}
Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
-
- uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags;
- Counters Result;
+ uint64_t TSFlags = MI.getDesc().TSFlags;
+ Counters Result = { { 0, 0, 0 } };
Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
@@ -151,15 +155,22 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
// LGKM may use larger values
if (TSFlags & SIInstrFlags::LGKM_CNT) {
- if (TII->isSMRD(MI.getOpcode())) {
-
- MachineOperand &Op = MI.getOperand(0);
- assert(Op.isReg() && "First LGKM operand must be a register!");
-
- unsigned Reg = Op.getReg();
- unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize();
- Result.Named.LGKM = Size > 4 ? 2 : 1;
-
+ if (TII->isSMRD(MI)) {
+
+ if (MI.getNumOperands() != 0) {
+ assert(MI.getOperand(0).isReg() &&
+ "First LGKM operand must be a register!");
+
+ // XXX - What if this is a write into a super register?
+ const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
+ unsigned Size = RC->getSize();
+ Result.Named.LGKM = Size > 4 ? 2 : 1;
+ } else {
+ // s_dcache_inv etc. do not have a destination register. Assume we
+ // want a wait on these.
+ // XXX - What is the right value?
+ Result.Named.LGKM = 1;
+ }
} else {
// DS
Result.Named.LGKM = 1;
@@ -173,9 +184,8 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
}
bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
-
// Constants are always irrelevant
- if (!Op.isReg())
+ if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
return false;
// Defines are always relevant
@@ -196,7 +206,7 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
// operand comes before the value operand and it may have
// multiple data operands.
- if (TII->isDS(MI.getOpcode())) {
+ if (TII->isDS(MI)) {
MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data);
if (Data && Op.isIdenticalTo(*Data))
return true;
@@ -224,18 +234,13 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
return false;
}
-RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) {
-
- if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
- return std::make_pair(0, 0);
-
- unsigned Reg = Op.getReg();
- unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize();
-
+RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
+ const MachineOperand &Reg) const {
+ unsigned Size = RC->getSize();
assert(Size >= 4);
RegInterval Result;
- Result.first = TRI->getEncodingValue(Reg);
+ Result.first = TRI->getEncodingValue(Reg.getReg());
Result.second = Result.first + Size / 4;
return Result;
@@ -246,10 +251,13 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
// Get the hardware counter increments and sum them up
Counters Increment = getHwCounts(*I);
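+ // Record the just-issued counter values only for the counter types this
+ // instruction actually increments; untouched counters stay zero so later
+ // waits are not generated for them.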
+ Counters Limit = ZeroCounts;
unsigned Sum = 0;
for (unsigned i = 0; i < 3; ++i) {
LastIssued.Array[i] += Increment.Array[i];
+ if (Increment.Array[i])
+ Limit.Array[i] = LastIssued.Array[i];
Sum += Increment.Array[i];
}
@@ -261,7 +269,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >=
AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- // Any occurence of consecutive VMEM or SMEM instructions forms a VMEM
+ // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
// or SMEM clause, respectively.
//
// The temporary workaround is to break the clauses with S_NOP.
@@ -270,7 +278,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
// and destination registers don't overlap, e.g. this is illegal:
// r0 = load r2
// r2 = load r0
- if ((LastOpcodeType == SMEM && TII->isSMRD(I->getOpcode())) ||
+ if ((LastOpcodeType == SMEM && TII->isSMRD(*I)) ||
(LastOpcodeType == VMEM && Increment.Named.VM)) {
// Insert a NOP to break the clause.
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
@@ -278,7 +286,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
LastInstWritesM0 = false;
}
- if (TII->isSMRD(I->getOpcode()))
+ if (TII->isSMRD(*I))
LastOpcodeType = SMEM;
else if (Increment.Named.VM)
LastOpcodeType = VMEM;
@@ -290,21 +298,21 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
}
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
-
MachineOperand &Op = I->getOperand(i);
if (!isOpRelevant(Op))
continue;
- RegInterval Interval = getRegInterval(Op);
+ const TargetRegisterClass *RC = TII->getOpRegClass(*I, i);
+ RegInterval Interval = getRegInterval(RC, Op);
for (unsigned j = Interval.first; j < Interval.second; ++j) {
// Remember which registers we define
if (Op.isDef())
- DefinedRegs[j] = LastIssued;
+ DefinedRegs[j] = Limit;
// and which one we are using
if (Op.isUse())
- UsedRegs[j] = LastIssued;
+ UsedRegs[j] = Limit;
}
}
}
@@ -390,12 +398,18 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
if (MI.getOpcode() == AMDGPU::S_SENDMSG)
return LastIssued;
- // For each register affected by this
- // instruction increase the result sequence
+ // For each register affected by this instruction increase the result
+ // sequence.
+ //
+ // TODO: We could probably just look at explicit operands if we removed VCC /
+ // EXEC from SMRD dest reg classes.
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
-
MachineOperand &Op = MI.getOperand(i);
- RegInterval Interval = getRegInterval(Op);
+ if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
+ continue;
+
+ const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
+ RegInterval Interval = getRegInterval(RC, Op);
for (unsigned j = Interval.first; j < Interval.second; ++j) {
if (Op.isDef()) {
diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td
index 211666a9bdbc..0e883f64caa3 100644
--- a/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/lib/Target/AMDGPU/SIInstrFormats.td
@@ -41,6 +41,10 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
field bits<1> WQM = 0;
field bits<1> VGPRSpill = 0;
+ // This bit tells the assembler to use the 32-bit encoding in case it
+ // is unable to infer the encoding from the operands.
+ field bits<1> VOPAsmPrefer32Bit = 0;
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = VM_CNT;
let TSFlags{1} = EXP_CNT;
@@ -68,10 +72,8 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
let TSFlags{19} = FLAT;
let TSFlags{20} = WQM;
let TSFlags{21} = VGPRSpill;
+ let TSFlags{22} = VOPAsmPrefer32Bit;
- // Most instructions require adjustments after selection to satisfy
- // operand requirements.
- let hasPostISelHook = 1;
let SchedRW = [Write32Bit];
}
@@ -86,7 +88,6 @@ class Enc64 {
}
class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">;
-def VOPDstVCC : VOPDstOperand <VCCReg>;
let Uses = [EXEC] in {
@@ -101,11 +102,11 @@ class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> :
}
class VOPCCommon <dag ins, string asm, list<dag> pattern> :
- VOPAnyCommon <(outs VOPDstVCC:$dst), ins, asm, pattern> {
+ VOPAnyCommon <(outs), ins, asm, pattern> {
- let DisableEncoding = "$dst";
let VOPC = 1;
let Size = 4;
+ let Defs = [VCC];
}
class VOP1Common <dag outs, dag ins, string asm, list<dag> pattern> :
@@ -138,6 +139,11 @@ class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> :
let isCodeGenOnly = 0;
int Size = 8;
+
+ // Because SGPRs may be allowed if there are multiple operands, we
+ // need a post-isel hook to insert copies in order to avoid
+ // violating constant bus requirements.
+ let hasPostISelHook = 1;
}
} // End Uses = [EXEC]
@@ -222,6 +228,20 @@ class SMRDe <bits<5> op, bits<1> imm> : Enc32 {
let Inst{31-27} = 0x18; //encoding
}
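+// 64-bit SMRD encoding used on CI, carrying the offset as a 32-bit literal in
+// the second dword of the instruction.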
+class SMRD_IMMe_ci <bits<5> op> : Enc64 {
+ bits<7> sdst;
+ bits<7> sbase;
+ bits<32> offset;
+
+ let Inst{7-0} = 0xff;
+ let Inst{8} = 0;
+ let Inst{14-9} = sbase{6-1};
+ let Inst{21-15} = sdst;
+ let Inst{26-22} = op;
+ let Inst{31-27} = 0x18; //encoding
+ let Inst{63-32} = offset;
+}
+
let SchedRW = [WriteSALU] in {
class SOP1 <dag outs, dag ins, string asm, list<dag> pattern> :
InstSI<outs, ins, asm, pattern> {
@@ -249,13 +269,13 @@ class SOP2 <dag outs, dag ins, string asm, list<dag> pattern> :
class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
InstSI<outs, ins, asm, pattern>, SOPCe <op> {
- let DisableEncoding = "$dst";
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
let SALU = 1;
let SOPC = 1;
let isCodeGenOnly = 0;
+ let Defs = [SCC];
let UseNamedOperandTable = 1;
}
@@ -598,15 +618,13 @@ class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> :
// Vector I/O operations
//===----------------------------------------------------------------------===//
-let Uses = [EXEC] in {
-
class DS <dag outs, dag ins, string asm, list<dag> pattern> :
InstSI <outs, ins, asm, pattern> {
let LGKM_CNT = 1;
let DS = 1;
let UseNamedOperandTable = 1;
- let Uses = [M0];
+ let Uses = [M0, EXEC];
// Most instructions load and store data, so set this as the default.
let mayLoad = 1;
@@ -623,6 +641,7 @@ class MUBUF <dag outs, dag ins, string asm, list<dag> pattern> :
let VM_CNT = 1;
let EXP_CNT = 1;
let MUBUF = 1;
+ let Uses = [EXEC];
let hasSideEffects = 0;
let UseNamedOperandTable = 1;
@@ -636,6 +655,7 @@ class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> :
let VM_CNT = 1;
let EXP_CNT = 1;
let MTBUF = 1;
+ let Uses = [EXEC];
let hasSideEffects = 0;
let UseNamedOperandTable = 1;
@@ -665,9 +685,7 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
let VM_CNT = 1;
let EXP_CNT = 1;
let MIMG = 1;
+ let Uses = [EXEC];
let hasSideEffects = 0; // XXX ????
}
-
-
-} // End Uses = [EXEC]
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index cfd2c42d1aef..a08a5a8fed36 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -82,6 +82,7 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
switch (MI->getOpcode()) {
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::V_MOV_B32_e64:
+ case AMDGPU::V_MOV_B64_PSEUDO:
return true;
default:
return false;
@@ -204,7 +205,8 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
unsigned &Offset,
const TargetRegisterInfo *TRI) const {
unsigned Opc = LdSt->getOpcode();
- if (isDS(Opc)) {
+
+ if (isDS(*LdSt)) {
const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
AMDGPU::OpName::offset);
if (OffsetImm) {
@@ -254,7 +256,7 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
return false;
}
- if (isMUBUF(Opc) || isMTBUF(Opc)) {
+ if (isMUBUF(*LdSt) || isMTBUF(*LdSt)) {
if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1)
return false;
@@ -270,7 +272,7 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
return true;
}
- if (isSMRD(Opc)) {
+ if (isSMRD(*LdSt)) {
const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
AMDGPU::OpName::offset);
if (!OffsetImm)
@@ -289,20 +291,18 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
MachineInstr *SecondLdSt,
unsigned NumLoads) const {
- unsigned Opc0 = FirstLdSt->getOpcode();
- unsigned Opc1 = SecondLdSt->getOpcode();
-
// TODO: This needs finer tuning
if (NumLoads > 4)
return false;
- if (isDS(Opc0) && isDS(Opc1))
+ if (isDS(*FirstLdSt) && isDS(*SecondLdSt))
return true;
- if (isSMRD(Opc0) && isSMRD(Opc1))
+ if (isSMRD(*FirstLdSt) && isSMRD(*SecondLdSt))
return true;
- if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1)))
+ if ((isMUBUF(*FirstLdSt) || isMTBUF(*FirstLdSt)) &&
+ (isMUBUF(*SecondLdSt) || isMTBUF(*SecondLdSt)))
return true;
return false;
@@ -323,28 +323,45 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
- AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0
+ AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
+ };
+
+ static const int16_t Sub0_15_64[] = {
+ AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
+ AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
+ AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
+ AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
};
static const int16_t Sub0_7[] = {
AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
- AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0
+ AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
+ };
+
+ static const int16_t Sub0_7_64[] = {
+ AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
+ AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
};
static const int16_t Sub0_3[] = {
- AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+ };
+
+ static const int16_t Sub0_3_64[] = {
+ AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
};
static const int16_t Sub0_2[] = {
- AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 0
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
};
static const int16_t Sub0_1[] = {
- AMDGPU::sub0, AMDGPU::sub1, 0
+ AMDGPU::sub0, AMDGPU::sub1,
};
unsigned Opcode;
- const int16_t *SubIndices;
+ ArrayRef<int16_t> SubIndices;
+ bool Forward;
if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
@@ -360,7 +377,7 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
} else {
// FIXME: Hack until VReg_1 removed.
assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
- BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32), AMDGPU::VCC)
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32))
.addImm(0)
.addReg(SrcReg, getKillRegState(KillSrc));
}
@@ -375,18 +392,18 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
} else if (AMDGPU::SReg_128RegClass.contains(DestReg)) {
assert(AMDGPU::SReg_128RegClass.contains(SrcReg));
- Opcode = AMDGPU::S_MOV_B32;
- SubIndices = Sub0_3;
+ Opcode = AMDGPU::S_MOV_B64;
+ SubIndices = Sub0_3_64;
} else if (AMDGPU::SReg_256RegClass.contains(DestReg)) {
assert(AMDGPU::SReg_256RegClass.contains(SrcReg));
- Opcode = AMDGPU::S_MOV_B32;
- SubIndices = Sub0_7;
+ Opcode = AMDGPU::S_MOV_B64;
+ SubIndices = Sub0_7_64;
} else if (AMDGPU::SReg_512RegClass.contains(DestReg)) {
assert(AMDGPU::SReg_512RegClass.contains(SrcReg));
- Opcode = AMDGPU::S_MOV_B32;
- SubIndices = Sub0_15;
+ Opcode = AMDGPU::S_MOV_B64;
+ SubIndices = Sub0_15_64;
} else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) {
assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
@@ -428,13 +445,27 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
llvm_unreachable("Can't copy register!");
}
- while (unsigned SubIdx = *SubIndices++) {
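+ // Copy the sub-registers in an order that never overwrites a SrcReg
+ // sub-register before it has been read, in case DestReg and SrcReg overlap.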
+ if (RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg))
+ Forward = true;
+ else
+ Forward = false;
+
+ for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
+ unsigned SubIdx;
+ if (Forward)
+ SubIdx = SubIndices[Idx];
+ else
+ SubIdx = SubIndices[SubIndices.size() - Idx - 1];
+
MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
get(Opcode), RI.getSubReg(DestReg, SubIdx));
- Builder.addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc));
+ Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
+
+ if (Idx == SubIndices.size() - 1)
+ Builder.addReg(SrcReg, RegState::Kill | RegState::Implicit);
- if (*SubIndices)
+ if (Idx == 0)
Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
}
}
@@ -471,6 +502,40 @@ unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
return AMDGPU::COPY;
}
+static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
+ switch (Size) {
+ case 4:
+ return AMDGPU::SI_SPILL_S32_SAVE;
+ case 8:
+ return AMDGPU::SI_SPILL_S64_SAVE;
+ case 16:
+ return AMDGPU::SI_SPILL_S128_SAVE;
+ case 32:
+ return AMDGPU::SI_SPILL_S256_SAVE;
+ case 64:
+ return AMDGPU::SI_SPILL_S512_SAVE;
+ default:
+ llvm_unreachable("unknown register size");
+ }
+}
+
+static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
+ switch (Size) {
+ case 4:
+ return AMDGPU::SI_SPILL_V32_SAVE;
+ case 8:
+ return AMDGPU::SI_SPILL_V64_SAVE;
+ case 16:
+ return AMDGPU::SI_SPILL_V128_SAVE;
+ case 32:
+ return AMDGPU::SI_SPILL_V256_SAVE;
+ case 64:
+ return AMDGPU::SI_SPILL_V512_SAVE;
+ default:
+ llvm_unreachable("unknown register size");
+ }
+}
+
void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned SrcReg, bool isKill,
@@ -481,47 +546,83 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo *FrameInfo = MF->getFrameInfo();
DebugLoc DL = MBB.findDebugLoc(MI);
- int Opcode = -1;
+
+ unsigned Size = FrameInfo->getObjectSize(FrameIndex);
+ unsigned Align = FrameInfo->getObjectAlignment(FrameIndex);
+ MachinePointerInfo PtrInfo
+ = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
+ MachineMemOperand *MMO
+ = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
+ Size, Align);
if (RI.isSGPRClass(RC)) {
+ MFI->setHasSpilledSGPRs();
+
// We are only allowed to create one new instruction when spilling
// registers, so we need to use pseudo instruction for spilling
// SGPRs.
- switch (RC->getSize() * 8) {
- case 32: Opcode = AMDGPU::SI_SPILL_S32_SAVE; break;
- case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break;
- case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break;
- case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break;
- case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break;
- }
- } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) {
- MFI->setHasSpilledVGPRs();
-
- switch(RC->getSize() * 8) {
- case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break;
- case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break;
- case 96: Opcode = AMDGPU::SI_SPILL_V96_SAVE; break;
- case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break;
- case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break;
- case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break;
- }
+ unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize());
+ BuildMI(MBB, MI, DL, get(Opcode))
+ .addReg(SrcReg) // src
+ .addFrameIndex(FrameIndex) // frame_idx
+ .addMemOperand(MMO);
+
+ return;
}
- if (Opcode != -1) {
- FrameInfo->setObjectAlignment(FrameIndex, 4);
- BuildMI(MBB, MI, DL, get(Opcode))
- .addReg(SrcReg)
- .addFrameIndex(FrameIndex)
- // Place-holder registers, these will be filled in by
- // SIPrepareScratchRegs.
- .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
- .addReg(AMDGPU::SGPR0, RegState::Undef);
- } else {
+ if (!ST.isVGPRSpillingEnabled(MFI)) {
LLVMContext &Ctx = MF->getFunction()->getContext();
Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
" spill register");
BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
- .addReg(SrcReg);
+ .addReg(SrcReg);
+
+ return;
+ }
+
+ assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
+
+ unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
+ MFI->setHasSpilledVGPRs();
+ BuildMI(MBB, MI, DL, get(Opcode))
+ .addReg(SrcReg) // src
+ .addFrameIndex(FrameIndex) // frame_idx
+ .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
+ .addMemOperand(MMO);
+}
+
+static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
+ switch (Size) {
+ case 4:
+ return AMDGPU::SI_SPILL_S32_RESTORE;
+ case 8:
+ return AMDGPU::SI_SPILL_S64_RESTORE;
+ case 16:
+ return AMDGPU::SI_SPILL_S128_RESTORE;
+ case 32:
+ return AMDGPU::SI_SPILL_S256_RESTORE;
+ case 64:
+ return AMDGPU::SI_SPILL_S512_RESTORE;
+ default:
+ llvm_unreachable("unknown register size");
+ }
+}
+
+static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
+ switch (Size) {
+ case 4:
+ return AMDGPU::SI_SPILL_V32_RESTORE;
+ case 8:
+ return AMDGPU::SI_SPILL_V64_RESTORE;
+ case 16:
+ return AMDGPU::SI_SPILL_V128_RESTORE;
+ case 32:
+ return AMDGPU::SI_SPILL_V256_RESTORE;
+ case 64:
+ return AMDGPU::SI_SPILL_V512_RESTORE;
+ default:
+ llvm_unreachable("unknown register size");
}
}
@@ -534,42 +635,43 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo *FrameInfo = MF->getFrameInfo();
DebugLoc DL = MBB.findDebugLoc(MI);
- int Opcode = -1;
-
- if (RI.isSGPRClass(RC)){
- switch(RC->getSize() * 8) {
- case 32: Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break;
- case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break;
- case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break;
- case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break;
- case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break;
- }
- } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) {
- switch(RC->getSize() * 8) {
- case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break;
- case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break;
- case 96: Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break;
- case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break;
- case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break;
- case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break;
- }
- }
+ unsigned Align = FrameInfo->getObjectAlignment(FrameIndex);
+ unsigned Size = FrameInfo->getObjectSize(FrameIndex);
+
+ MachinePointerInfo PtrInfo
+ = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
- if (Opcode != -1) {
- FrameInfo->setObjectAlignment(FrameIndex, 4);
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOLoad, Size, Align);
+
+ if (RI.isSGPRClass(RC)) {
+ // FIXME: Maybe this should not include a memoperand because it will be
+ // lowered to non-memory instructions.
+ unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize());
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
- .addFrameIndex(FrameIndex)
- // Place-holder registers, these will be filled in by
- // SIPrepareScratchRegs.
- .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
- .addReg(AMDGPU::SGPR0, RegState::Undef);
+ .addFrameIndex(FrameIndex) // frame_idx
+ .addMemOperand(MMO);
- } else {
+ return;
+ }
+
+ if (!ST.isVGPRSpillingEnabled(MFI)) {
LLVMContext &Ctx = MF->getFunction()->getContext();
Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
" restore register");
BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
+
+ return;
}
+
+ assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
+
+ unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize());
+ BuildMI(MBB, MI, DL, get(Opcode), DestReg)
+ .addFrameIndex(FrameIndex) // frame_idx
+ .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
+ .addMemOperand(MMO);
}
/// \param @Offset Offset in bytes of the FrameIndex being spilled
@@ -601,17 +703,21 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
if (MFI->getShaderType() == ShaderType::COMPUTE &&
WorkGroupSize > WavefrontSize) {
- unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X);
- unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y);
- unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z);
+ unsigned TIDIGXReg
+ = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X);
+ unsigned TIDIGYReg
+ = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y);
+ unsigned TIDIGZReg
+ = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z);
unsigned InputPtrReg =
- TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR);
+ TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
if (!Entry.isLiveIn(Reg))
Entry.addLiveIn(Reg);
}
RS->enterBasicBlock(&Entry);
+ // FIXME: Can we scavenge an SReg_64 and access the subregs?
unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
@@ -667,8 +773,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
return TmpReg;
}
-void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI,
- int Count) const {
+void SIInstrInfo::insertWaitStates(MachineBasicBlock::iterator MI,
+ int Count) const {
while (Count > 0) {
int Arg;
if (Count >= 8)
@@ -687,26 +793,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
switch (MI->getOpcode()) {
default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
- case AMDGPU::SI_CONSTDATA_PTR: {
- unsigned Reg = MI->getOperand(0).getReg();
- unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
- unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
-
- BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg);
-
- // Add 32-bit offset from this instruction to the start of the constant data.
- BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_U32), RegLo)
- .addReg(RegLo)
- .addTargetIndex(AMDGPU::TI_CONSTDATA_START)
- .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit);
- BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi)
- .addReg(RegHi)
- .addImm(0)
- .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit)
- .addReg(AMDGPU::SCC, RegState::Implicit);
- MI->eraseFromParent();
- break;
- }
case AMDGPU::SGPR_USE:
// This is just a placeholder for register allocation.
MI->eraseFromParent();
@@ -760,49 +846,90 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
MI->eraseFromParent();
break;
}
+
+ case AMDGPU::SI_CONSTDATA_PTR: {
+ const SIRegisterInfo *TRI =
+ static_cast<const SIRegisterInfo *>(ST.getRegisterInfo());
+ MachineFunction &MF = *MBB.getParent();
+ unsigned Reg = MI->getOperand(0).getReg();
+ unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0);
+ unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1);
+
+ // Create a bundle so these instructions won't be re-ordered by the
+ // post-RA scheduler.
+ MIBundleBuilder Bundler(MBB, MI);
+ Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
+
+ // Add 32-bit offset from this instruction to the start of the
+ // constant data.
+ Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
+ .addReg(RegLo)
+ .addOperand(MI->getOperand(1)));
+ Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
+ .addReg(RegHi)
+ .addImm(0));
+
+ llvm::finalizeBundle(MBB, Bundler.begin());
+
+ MI->eraseFromParent();
+ break;
+ }
}
return true;
}
-MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
- bool NewMI) const {
-
- if (MI->getNumOperands() < 3)
- return nullptr;
-
+/// Commutes the operands in the given instruction.
+/// The commutable operands are specified by their indices OpIdx0 and OpIdx1.
+///
+/// Do not call this method for a non-commutable instruction or for
+/// non-commutable pair of operand indices OpIdx0 and OpIdx1.
+/// Even though the instruction is commutable, the method may still
+/// fail to commute the operands; a null pointer is returned in such cases.
+MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI,
+ bool NewMI,
+ unsigned OpIdx0,
+ unsigned OpIdx1) const {
int CommutedOpcode = commuteOpcode(*MI);
if (CommutedOpcode == -1)
return nullptr;
int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
AMDGPU::OpName::src0);
- assert(Src0Idx != -1 && "Should always have src0 operand");
-
MachineOperand &Src0 = MI->getOperand(Src0Idx);
if (!Src0.isReg())
return nullptr;
int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
AMDGPU::OpName::src1);
- if (Src1Idx == -1)
+
+ if ((OpIdx0 != static_cast<unsigned>(Src0Idx) ||
+ OpIdx1 != static_cast<unsigned>(Src1Idx)) &&
+ (OpIdx0 != static_cast<unsigned>(Src1Idx) ||
+ OpIdx1 != static_cast<unsigned>(Src0Idx)))
return nullptr;
MachineOperand &Src1 = MI->getOperand(Src1Idx);
- // Make sure it's legal to commute operands for VOP2.
- if (isVOP2(MI->getOpcode()) &&
- (!isOperandLegal(MI, Src0Idx, &Src1) ||
- !isOperandLegal(MI, Src1Idx, &Src0))) {
- return nullptr;
+
+ if (isVOP2(*MI)) {
+ const MCInstrDesc &InstrDesc = MI->getDesc();
+ // For VOP2 instructions, any operand type is valid to use for src0. Make
+ // sure we can use the src1 as src0.
+ //
+ // We could be stricter here and only allow commuting if there is a reason
+ // to do so. i.e. if both operands are VGPRs there is no real benefit,
+ // although MachineCSE attempts to find matches by commuting.
+ const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+ if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0))
+ return nullptr;
}
if (!Src1.isReg()) {
// Allow commuting instructions with Imm operands.
if (NewMI || !Src1.isImm() ||
- (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) {
+ (!isVOP2(*MI) && !isVOP3(*MI))) {
return nullptr;
}
-
// Be sure to copy the source modifiers to the right place.
if (MachineOperand *Src0Mods
= getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
@@ -832,7 +959,7 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
Src1.ChangeToRegister(Reg, false);
Src1.setSubReg(SubReg);
} else {
- MI = TargetInstrInfo::commuteInstruction(MI, NewMI);
+ MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1);
}
if (MI)
@@ -845,8 +972,8 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses it.
bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
- unsigned &SrcOpIdx1,
- unsigned &SrcOpIdx2) const {
+ unsigned &SrcOpIdx0,
+ unsigned &SrcOpIdx1) const {
const MCInstrDesc &MCID = MI->getDesc();
if (!MCID.isCommutable())
return false;
@@ -857,7 +984,8 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
return false;
// FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on
- // immediate.
+ // immediate. Also, immediate src0 operand is not handled in
+ // SIInstrInfo::commuteInstruction();
if (!MI->getOperand(Src0Idx).isReg())
return false;
@@ -865,18 +993,22 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
if (Src1Idx == -1)
return false;
- if (!MI->getOperand(Src1Idx).isReg())
- return false;
-
- // If any source modifiers are set, the generic instruction commuting won't
- // understand how to copy the source modifiers.
- if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) ||
- hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers))
+ MachineOperand &Src1 = MI->getOperand(Src1Idx);
+ if (Src1.isImm()) {
+ // SIInstrInfo::commuteInstruction() does support commuting the immediate
+ // operand src1 in 2 and 3 operand instructions.
+ if (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))
+ return false;
+ } else if (Src1.isReg()) {
+ // If any source modifiers are set, the generic instruction commuting won't
+ // understand how to copy the source modifiers.
+ if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) ||
+ hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers))
+ return false;
+ } else
return false;
- SrcOpIdx1 = Src0Idx;
- SrcOpIdx2 = Src1Idx;
- return true;
+ return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
}
MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB,
@@ -898,11 +1030,6 @@ bool SIInstrInfo::isMov(unsigned Opcode) const {
}
}
-bool
-SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
- return RC != &AMDGPU::EXECRegRegClass;
-}
-
static void removeModOperands(MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
@@ -984,9 +1111,6 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
}
- UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
- AMDGPU::OpName::src2));
- // ChangingToImmediate adds Src2 back to the instruction.
Src2->ChangeToImmediate(Imm);
removeModOperands(*UseMI);
@@ -1045,18 +1169,6 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
return false;
}
-bool
-SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI,
- AliasAnalysis *AA) const {
- switch(MI->getOpcode()) {
- default: return AMDGPUInstrInfo::isTriviallyReMaterializable(MI, AA);
- case AMDGPU::S_MOV_B32:
- case AMDGPU::S_MOV_B64:
- case AMDGPU::V_MOV_B32_e32:
- return MI->getOperand(1).isImm();
- }
-}
-
static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
int WidthB, int OffsetB) {
int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
@@ -1088,9 +1200,6 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
MachineInstr *MIb,
AliasAnalysis *AA) const {
- unsigned Opc0 = MIa->getOpcode();
- unsigned Opc1 = MIb->getOpcode();
-
assert(MIa && (MIa->mayLoad() || MIa->mayStore()) &&
"MIa must load from or modify a memory location");
assert(MIb && (MIb->mayLoad() || MIb->mayStore()) &&
@@ -1105,32 +1214,32 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
// TODO: Should we check the address space from the MachineMemOperand? That
// would allow us to distinguish objects we know don't alias based on the
- // underlying addres space, even if it was lowered to a different one,
+ // underlying address space, even if it was lowered to a different one,
// e.g. private accesses lowered to use MUBUF instructions on a scratch
// buffer.
- if (isDS(Opc0)) {
- if (isDS(Opc1))
+ if (isDS(*MIa)) {
+ if (isDS(*MIb))
return checkInstOffsetsDoNotOverlap(MIa, MIb);
- return !isFLAT(Opc1);
+ return !isFLAT(*MIb);
}
- if (isMUBUF(Opc0) || isMTBUF(Opc0)) {
- if (isMUBUF(Opc1) || isMTBUF(Opc1))
+ if (isMUBUF(*MIa) || isMTBUF(*MIa)) {
+ if (isMUBUF(*MIb) || isMTBUF(*MIb))
return checkInstOffsetsDoNotOverlap(MIa, MIb);
- return !isFLAT(Opc1) && !isSMRD(Opc1);
+ return !isFLAT(*MIb) && !isSMRD(*MIb);
}
- if (isSMRD(Opc0)) {
- if (isSMRD(Opc1))
+ if (isSMRD(*MIa)) {
+ if (isSMRD(*MIb))
return checkInstOffsetsDoNotOverlap(MIa, MIb);
- return !isFLAT(Opc1) && !isMUBUF(Opc0) && !isMTBUF(Opc0);
+ return !isFLAT(*MIb) && !isMUBUF(*MIa) && !isMTBUF(*MIa);
}
- if (isFLAT(Opc0)) {
- if (isFLAT(Opc1))
+ if (isFLAT(*MIa)) {
+ if (isFLAT(*MIb))
return checkInstOffsetsDoNotOverlap(MIa, MIb);
return false;
@@ -1319,6 +1428,26 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
return false;
}
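+// Return VCC, M0 or FLAT_SCR if the instruction implicitly reads one of them;
+// such a read counts toward the single allowed constant bus use.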
+static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
+ for (const MachineOperand &MO : MI.implicit_operands()) {
+ // We only care about reads.
+ if (MO.isDef())
+ continue;
+
+ switch (MO.getReg()) {
+ case AMDGPU::VCC:
+ case AMDGPU::M0:
+ case AMDGPU::FLAT_SCR:
+ return MO.getReg();
+
+ default:
+ break;
+ }
+ }
+
+ return AMDGPU::NoRegister;
+}
+
bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
StringRef &ErrInfo) const {
uint16_t Opcode = MI->getOpcode();
@@ -1335,7 +1464,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
return false;
}
- // Make sure the register classes are correct
+ // Make sure the register classes are correct.
for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
if (MI->getOperand(i).isFPImm()) {
ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
@@ -1392,14 +1521,17 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
// Verify VOP*
- if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) {
+ if (isVOP1(*MI) || isVOP2(*MI) || isVOP3(*MI) || isVOPC(*MI)) {
// Only look at the true operands. Only a real operand can use the constant
// bus, and we don't want to check pseudo-operands like the source modifier
// flags.
const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
unsigned ConstantBusCount = 0;
- unsigned SGPRUsed = AMDGPU::NoRegister;
+ unsigned SGPRUsed = findImplicitSGPRRead(*MI);
+ if (SGPRUsed != AMDGPU::NoRegister)
+ ++ConstantBusCount;
+
for (int OpIdx : OpIndices) {
if (OpIdx == -1)
break;
@@ -1435,6 +1567,16 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
}
}
+ // Make sure we aren't losing exec uses in the td files. This mostly requires
+ // being careful when using let Uses to try to add other use registers.
+ if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) {
+ const MachineOperand *Exec = MI->findRegisterUseOperand(AMDGPU::EXEC);
+ if (!Exec || !Exec->isImplicit()) {
+ ErrInfo = "VALU instruction does not implicitly read exec mask";
+ return false;
+ }
+ }
+
return true;
}
@@ -1483,11 +1625,17 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
case AMDGPU::S_LOAD_DWORD_IMM:
- case AMDGPU::S_LOAD_DWORD_SGPR: return AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
+ case AMDGPU::S_LOAD_DWORD_SGPR:
+ case AMDGPU::S_LOAD_DWORD_IMM_ci:
+ return AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
case AMDGPU::S_LOAD_DWORDX2_IMM:
- case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
+ case AMDGPU::S_LOAD_DWORDX2_SGPR:
+ case AMDGPU::S_LOAD_DWORDX2_IMM_ci:
+ return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
case AMDGPU::S_LOAD_DWORDX4_IMM:
- case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
+ case AMDGPU::S_LOAD_DWORDX4_SGPR:
+ case AMDGPU::S_LOAD_DWORDX4_IMM_ci:
+ return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
@@ -1562,17 +1710,21 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
unsigned SubIdx,
const TargetRegisterClass *SubRC)
const {
- assert(SuperReg.isReg());
-
- unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
+ MachineBasicBlock *MBB = MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
unsigned SubReg = MRI.createVirtualRegister(SubRC);
+ if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
+ BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
+ .addReg(SuperReg.getReg(), 0, SubIdx);
+ return SubReg;
+ }
+
// Just in case the super register is itself a sub-register, copy it to a new
// value so we don't need to worry about merging its subreg index with the
// SubIdx passed to this function. The register coalescer should be able to
// eliminate this extra copy.
- MachineBasicBlock *MBB = MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
+ unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
.addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
@@ -1605,36 +1757,6 @@ MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
return MachineOperand::CreateReg(SubReg, false);
}
-unsigned SIInstrInfo::split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist,
- MachineBasicBlock::iterator MI,
- MachineRegisterInfo &MRI,
- const TargetRegisterClass *RC,
- const MachineOperand &Op) const {
- MachineBasicBlock *MBB = MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
- unsigned LoDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned HiDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned Dst = MRI.createVirtualRegister(RC);
-
- MachineInstr *Lo = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32),
- LoDst)
- .addImm(Op.getImm() & 0xFFFFFFFF);
- MachineInstr *Hi = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32),
- HiDst)
- .addImm(Op.getImm() >> 32);
-
- BuildMI(*MBB, MI, DL, get(TargetOpcode::REG_SEQUENCE), Dst)
- .addReg(LoDst)
- .addImm(AMDGPU::sub0)
- .addReg(HiDst)
- .addImm(AMDGPU::sub1);
-
- Worklist.push_back(Lo);
- Worklist.push_back(Hi);
-
- return Dst;
-}
-
// Change the order of operands from (0, 1, 2) to (0, 2, 1)
void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const {
assert(Inst->getNumExplicitOperands() == 3);
@@ -1643,6 +1765,41 @@ void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const {
Inst->addOperand(Op1);
}
+bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
+ const MCOperandInfo &OpInfo,
+ const MachineOperand &MO) const {
+ if (!MO.isReg())
+ return false;
+
+ unsigned Reg = MO.getReg();
+ const TargetRegisterClass *RC =
+ TargetRegisterInfo::isVirtualRegister(Reg) ?
+ MRI.getRegClass(Reg) :
+ RI.getPhysRegClass(Reg);
+
+ // In order to be legal, the common sub-class must be equal to the
+ // class of the current operand. For example:
+ //
+ // v_mov_b32 s0 ; Operand defined as vsrc_32
+ // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL
+ //
+ // s_sendmsg 0, s0 ; Operand defined as m0reg
+ // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
+
+ return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
+}
+
+bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
+ const MCOperandInfo &OpInfo,
+ const MachineOperand &MO) const {
+ if (MO.isReg())
+ return isLegalRegOperand(MRI, OpInfo, MO);
+
+ // Handle non-register types that are treated like immediates.
+ assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
+ return true;
+}
+
bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
const MachineOperand *MO) const {
const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
@@ -1653,7 +1810,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
if (!MO)
MO = &MI->getOperand(OpIdx);
- if (isVALU(InstDesc.Opcode) &&
+ if (isVALU(*MI) &&
usesConstantBus(MRI, *MO, DefinedRC->getSize())) {
unsigned SGPRUsed =
MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister;
@@ -1670,21 +1827,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
if (MO->isReg()) {
assert(DefinedRC);
- const TargetRegisterClass *RC =
- TargetRegisterInfo::isVirtualRegister(MO->getReg()) ?
- MRI.getRegClass(MO->getReg()) :
- RI.getPhysRegClass(MO->getReg());
-
- // In order to be legal, the common sub-class must be equal to the
- // class of the current operand. For example:
- //
- // v_mov_b32 s0 ; Operand defined as vsrc_32
- // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL
- //
- // s_sendmsg 0, s0 ; Operand defined as m0reg
- // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
-
- return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
+ return isLegalRegOperand(MRI, OpInfo, *MO);
}
@@ -1699,81 +1842,143 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
return isImmOperandLegal(MI, OpIdx, *MO);
}
-void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
- MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
+ MachineInstr *MI) const {
+ unsigned Opc = MI->getOpcode();
+ const MCInstrDesc &InstrDesc = get(Opc);
- int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::src0);
- int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::src1);
- int Src2Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::src2);
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+ MachineOperand &Src1 = MI->getOperand(Src1Idx);
- // Legalize VOP2
- if (isVOP2(MI->getOpcode()) && Src1Idx != -1) {
- // Legalize src0
- if (!isOperandLegal(MI, Src0Idx))
+ // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
+ // we need to only have one constant bus use.
+ //
+ // Note we do not need to worry about literal constants here. They are
+ // disabled for the operand type for instructions because they will always
+ // violate the one constant bus use rule.
+ bool HasImplicitSGPR = findImplicitSGPRRead(*MI) != AMDGPU::NoRegister;
+ if (HasImplicitSGPR) {
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ MachineOperand &Src0 = MI->getOperand(Src0Idx);
+
+ if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
legalizeOpWithMove(MI, Src0Idx);
+ }
- // Legalize src1
- if (isOperandLegal(MI, Src1Idx))
- return;
+ // VOP2 src0 instructions support all operand types, so we don't need to check
+ // their legality. If src1 is already legal, we don't need to do anything.
+ if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
+ return;
- // Usually src0 of VOP2 instructions allow more types of inputs
- // than src1, so try to commute the instruction to decrease our
- // chances of having to insert a MOV instruction to legalize src1.
- if (MI->isCommutable()) {
- if (commuteInstruction(MI))
- // If we are successful in commuting, then we know MI is legal, so
- // we are done.
- return;
- }
+ // We do not use commuteInstruction here because it is too aggressive and will
+ // commute if it is possible. We only want to commute here if it improves
+ // legality. This can be called a fairly large number of times so don't waste
+ // compile time pointlessly swapping and checking legality again.
+ if (HasImplicitSGPR || !MI->isCommutable()) {
+ legalizeOpWithMove(MI, Src1Idx);
+ return;
+ }
+
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ MachineOperand &Src0 = MI->getOperand(Src0Idx);
+ // If src0 can be used as src1, commuting will make the operands legal.
+ // Otherwise we have to give up and insert a move.
+ //
+ // TODO: Other immediate-like operand kinds could be commuted if there was a
+ // MachineOperand::ChangeTo* for them.
+ if ((!Src1.isImm() && !Src1.isReg()) ||
+ !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
legalizeOpWithMove(MI, Src1Idx);
return;
}
- // XXX - Do any VOP3 instructions read VCC?
- // Legalize VOP3
- if (isVOP3(MI->getOpcode())) {
- int VOP3Idx[3] = { Src0Idx, Src1Idx, Src2Idx };
+ int CommutedOpc = commuteOpcode(*MI);
+ if (CommutedOpc == -1) {
+ legalizeOpWithMove(MI, Src1Idx);
+ return;
+ }
- // Find the one SGPR operand we are allowed to use.
- unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
+ MI->setDesc(get(CommutedOpc));
- for (unsigned i = 0; i < 3; ++i) {
- int Idx = VOP3Idx[i];
- if (Idx == -1)
- break;
- MachineOperand &MO = MI->getOperand(Idx);
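+ // Swap src0 and src1 in place (register, sub-register and kill flag) rather
+ // than going through commuteInstruction.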
+ unsigned Src0Reg = Src0.getReg();
+ unsigned Src0SubReg = Src0.getSubReg();
+ bool Src0Kill = Src0.isKill();
- if (MO.isReg()) {
- if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
- continue; // VGPRs are legal
+ if (Src1.isImm())
+ Src0.ChangeToImmediate(Src1.getImm());
+ else if (Src1.isReg()) {
+ Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
+ Src0.setSubReg(Src1.getSubReg());
+ } else
+ llvm_unreachable("Should only have register or immediate operands");
- assert(MO.getReg() != AMDGPU::SCC && "SCC operand to VOP3 instruction");
+ Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
+ Src1.setSubReg(Src0SubReg);
+}
- if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
- SGPRReg = MO.getReg();
- // We can use one SGPR in each VOP3 instruction.
- continue;
- }
- } else if (!isLiteralConstant(MO, getOpSize(MI->getOpcode(), Idx))) {
- // If it is not a register and not a literal constant, then it must be
- // an inline constant which is always legal.
- continue;
- }
- // If we make it this far, then the operand is not legal and we must
- // legalize it.
- legalizeOpWithMove(MI, Idx);
+// Legalize VOP3 operands. Because all operand types are supported for any
+// operand, and since literal constants are not allowed and should never be
+// seen, we only need to worry about inserting copies if we use multiple SGPR
+// operands.
+void SIInstrInfo::legalizeOperandsVOP3(
+ MachineRegisterInfo &MRI,
+ MachineInstr *MI) const {
+ unsigned Opc = MI->getOpcode();
+
+ int VOP3Idx[3] = {
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
+ };
+
+ // Find the one SGPR operand we are allowed to use.
+ unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
+
+ for (unsigned i = 0; i < 3; ++i) {
+ int Idx = VOP3Idx[i];
+ if (Idx == -1)
+ break;
+ MachineOperand &MO = MI->getOperand(Idx);
+
+ // We should never see a VOP3 instruction with an illegal immediate operand.
+ if (!MO.isReg())
+ continue;
+
+ if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
+ continue; // VGPRs are legal
+
+ if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
+ SGPRReg = MO.getReg();
+ // We can use one SGPR in each VOP3 instruction.
+ continue;
}
+
+ // If we make it this far, then the operand is not legal and we must
+ // legalize it.
+ legalizeOpWithMove(MI, Idx);
+ }
+}
+
+void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
+ MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+
+ // Legalize VOP2
+ if (isVOP2(*MI)) {
+ legalizeOperandsVOP2(MRI, MI);
+ return;
+ }
+
+ // Legalize VOP3
+ if (isVOP3(*MI)) {
+ legalizeOperandsVOP3(MRI, MI);
+ return;
}
// Legalize REG_SEQUENCE and PHI
// The register class of the operands must be the same type as the register
// class of the output.
- if (MI->getOpcode() == AMDGPU::REG_SEQUENCE ||
- MI->getOpcode() == AMDGPU::PHI) {
+ if (MI->getOpcode() == AMDGPU::PHI) {
const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
if (!MI->getOperand(i).isReg() ||
@@ -1802,26 +2007,53 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
}
// Update all the operands so they have the same type.
- for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
- if (!MI->getOperand(i).isReg() ||
- !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
+ for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
+ MachineOperand &Op = MI->getOperand(I);
+ if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
continue;
unsigned DstReg = MRI.createVirtualRegister(RC);
- MachineBasicBlock *InsertBB;
- MachineBasicBlock::iterator Insert;
- if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
- InsertBB = MI->getParent();
- Insert = MI;
- } else {
- // MI is a PHI instruction.
- InsertBB = MI->getOperand(i + 1).getMBB();
- Insert = InsertBB->getFirstTerminator();
+
+ // MI is a PHI instruction.
+ MachineBasicBlock *InsertBB = MI->getOperand(I + 1).getMBB();
+ MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
+
+ BuildMI(*InsertBB, Insert, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
+ .addOperand(Op);
+ Op.setReg(DstReg);
+ }
+ }
+
+ // REG_SEQUENCE doesn't really require operand legalization, but if one has a
+ // VGPR dest type and SGPR sources, insert copies so all operands are
+ // VGPRs. This seems to help operand folding / the register coalescer.
+ if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
+ MachineBasicBlock *MBB = MI->getParent();
+ const TargetRegisterClass *DstRC = getOpRegClass(*MI, 0);
+ if (RI.hasVGPRs(DstRC)) {
+ // Update all the operands so they are VGPR register classes. These may
+ // not be the same register class because REG_SEQUENCE supports mixing
+ // subregister index types e.g. sub0_sub1 + sub2 + sub3
+ for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
+ MachineOperand &Op = MI->getOperand(I);
+ if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
+ continue;
+
+ const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
+ const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
+ if (VRC == OpRC)
+ continue;
+
+ unsigned DstReg = MRI.createVirtualRegister(VRC);
+
+ BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
+ .addOperand(Op);
+
+ Op.setReg(DstReg);
+ Op.setIsKill();
}
- BuildMI(*InsertBB, Insert, MI->getDebugLoc(),
- get(AMDGPU::COPY), DstReg)
- .addOperand(MI->getOperand(i));
- MI->getOperand(i).setReg(DstReg);
}
+
+ return;
}
// Legalize INSERT_SUBREG
@@ -1858,15 +2090,10 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
}
MachineBasicBlock &MBB = *MI->getParent();
- // Extract the ptr from the resource descriptor.
-
- // SRsrcPtrLo = srsrc:sub0
- unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc,
- &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VGPR_32RegClass);
- // SRsrcPtrHi = srsrc:sub1
- unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc,
- &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VGPR_32RegClass);
+ // Extract the ptr from the resource descriptor.
+ unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
+ &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
// Create an empty resource descriptor
unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
@@ -1891,80 +2118,112 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
.addImm(RsrcDataFormat >> 32);
// NewSRsrc = {Zero64, SRsrcFormat}
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
- NewSRsrc)
- .addReg(Zero64)
- .addImm(AMDGPU::sub0_sub1)
- .addReg(SRsrcFormatLo)
- .addImm(AMDGPU::sub2)
- .addReg(SRsrcFormatHi)
- .addImm(AMDGPU::sub3);
+ BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
+ .addReg(Zero64)
+ .addImm(AMDGPU::sub0_sub1)
+ .addReg(SRsrcFormatLo)
+ .addImm(AMDGPU::sub2)
+ .addReg(SRsrcFormatHi)
+ .addImm(AMDGPU::sub3);
MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
- unsigned NewVAddrLo;
- unsigned NewVAddrHi;
if (VAddr) {
// This is already an ADDR64 instruction so we need to add the pointer
// extracted from the resource descriptor to the current value of VAddr.
- NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
- // NewVaddrLo = SRsrcPtrLo + VAddr:sub0
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32),
- NewVAddrLo)
- .addReg(SRsrcPtrLo)
- .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
- .addReg(AMDGPU::VCC, RegState::ImplicitDefine);
-
- // NewVaddrHi = SRsrcPtrHi + VAddr:sub1
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32),
- NewVAddrHi)
- .addReg(SRsrcPtrHi)
- .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
- .addReg(AMDGPU::VCC, RegState::ImplicitDefine)
- .addReg(AMDGPU::VCC, RegState::Implicit);
-
+ unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
+ DebugLoc DL = MI->getDebugLoc();
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
+ .addReg(SRsrcPtr, 0, AMDGPU::sub0)
+ .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
+
+ // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
+ .addReg(SRsrcPtr, 0, AMDGPU::sub1)
+ .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
+
+ // NewVaddr = {NewVaddrHi, NewVaddrLo}
+ BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
+ .addReg(NewVAddrLo)
+ .addImm(AMDGPU::sub0)
+ .addReg(NewVAddrHi)
+ .addImm(AMDGPU::sub1);
} else {
// This instruction is the _OFFSET variant, so we need to convert it to
// ADDR64.
+ assert(MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration()
+ < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+ "FIXME: Need to emit flat atomics here");
+
MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
-
- // Create the new instruction.
unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());
- MachineInstr *Addr64 =
- BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
- .addOperand(*VData)
- .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
- // This will be replaced later
- // with the new value of vaddr.
- .addOperand(*SRsrc)
- .addOperand(*SOffset)
- .addOperand(*Offset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0); // tfe
+
+ // Atomics with return have an additional tied operand and are
+ // missing some of the special bits.
+ MachineOperand *VDataIn = getNamedOperand(*MI, AMDGPU::OpName::vdata_in);
+ MachineInstr *Addr64;
+
+ if (!VDataIn) {
+ // Regular buffer load / store.
+ MachineInstrBuilder MIB
+ = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
+ .addOperand(*VData)
+ .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
+ // This will be replaced later
+ // with the new value of vaddr.
+ .addOperand(*SRsrc)
+ .addOperand(*SOffset)
+ .addOperand(*Offset);
+
+ // Atomics do not have this operand.
+ if (const MachineOperand *GLC
+ = getNamedOperand(*MI, AMDGPU::OpName::glc)) {
+ MIB.addImm(GLC->getImm());
+ }
+
+ MIB.addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc));
+
+ if (const MachineOperand *TFE
+ = getNamedOperand(*MI, AMDGPU::OpName::tfe)) {
+ MIB.addImm(TFE->getImm());
+ }
+
+ MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ Addr64 = MIB;
+ } else {
+ // Atomics with return.
+ Addr64 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
+ .addOperand(*VData)
+ .addOperand(*VDataIn)
+ .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
+ // This will be replaced later
+ // with the new value of vaddr.
+ .addOperand(*SRsrc)
+ .addOperand(*SOffset)
+ .addOperand(*Offset)
+ .addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc))
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ }
MI->removeFromParent();
MI = Addr64;
- NewVAddrLo = SRsrcPtrLo;
- NewVAddrHi = SRsrcPtrHi;
+ // NewVaddr = {NewVaddrHi, NewVaddrLo}
+ BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
+ .addReg(SRsrcPtr, 0, AMDGPU::sub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(SRsrcPtr, 0, AMDGPU::sub1)
+ .addImm(AMDGPU::sub1);
+
VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
}
- // NewVaddr = {NewVaddrHi, NewVaddrLo}
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
- NewVAddr)
- .addReg(NewVAddrLo)
- .addImm(AMDGPU::sub0)
- .addReg(NewVAddrHi)
- .addImm(AMDGPU::sub1);
-
-
// Update the instruction to use NewVaddr
VAddr->setReg(NewVAddr);
// Update the instruction to use NewSRsrc
@@ -2028,53 +2287,64 @@ void SIInstrInfo::splitSMRD(MachineInstr *MI,
.addOperand(*SOff);
unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR)
- .addOperand(*SOff)
- .addImm(HalfSize);
- Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp))
+ .addReg(SOff->getReg(), 0, SOff->getSubReg())
+ .addImm(HalfSize);
+ Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
.addReg(SBase->getReg(), getKillRegState(IsKill),
SBase->getSubReg())
.addReg(OffsetSGPR);
}
unsigned SubLo, SubHi;
+ const TargetRegisterClass *NewDstRC;
switch (HalfSize) {
case 4:
SubLo = AMDGPU::sub0;
SubHi = AMDGPU::sub1;
+ NewDstRC = &AMDGPU::VReg_64RegClass;
break;
case 8:
SubLo = AMDGPU::sub0_sub1;
SubHi = AMDGPU::sub2_sub3;
+ NewDstRC = &AMDGPU::VReg_128RegClass;
break;
case 16:
SubLo = AMDGPU::sub0_sub1_sub2_sub3;
SubHi = AMDGPU::sub4_sub5_sub6_sub7;
+ NewDstRC = &AMDGPU::VReg_256RegClass;
break;
case 32:
SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15;
+ NewDstRC = &AMDGPU::VReg_512RegClass;
break;
default:
llvm_unreachable("Unhandled HalfSize");
}
- BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE))
- .addOperand(MI->getOperand(0))
- .addReg(RegLo)
- .addImm(SubLo)
- .addReg(RegHi)
- .addImm(SubHi);
+ unsigned OldDst = MI->getOperand(0).getReg();
+ unsigned NewDst = MRI.createVirtualRegister(NewDstRC);
+
+ MRI.replaceRegWith(OldDst, NewDst);
+
+ BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDst)
+ .addReg(RegLo)
+ .addImm(SubLo)
+ .addReg(RegHi)
+ .addImm(SubHi);
}
-void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const {
+void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI,
+ MachineRegisterInfo &MRI,
+ SmallVectorImpl<MachineInstr *> &Worklist) const {
MachineBasicBlock *MBB = MI->getParent();
- switch (MI->getOpcode()) {
- case AMDGPU::S_LOAD_DWORD_IMM:
- case AMDGPU::S_LOAD_DWORD_SGPR:
- case AMDGPU::S_LOAD_DWORDX2_IMM:
- case AMDGPU::S_LOAD_DWORDX2_SGPR:
- case AMDGPU::S_LOAD_DWORDX4_IMM:
- case AMDGPU::S_LOAD_DWORDX4_SGPR: {
+ int DstIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
+ assert(DstIdx != -1);
+ unsigned DstRCID = get(MI->getOpcode()).OpInfo[DstIdx].RegClass;
+ switch(RI.getRegClass(DstRCID)->getSize()) {
+ case 4:
+ case 8:
+ case 16: {
unsigned NewOpcode = getVALUOp(*MI);
unsigned RegOffset;
unsigned ImmOffset;
@@ -2118,53 +2388,55 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con
BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3)
.addImm(RsrcDataFormat >> 32);
BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc)
- .addReg(DWord0)
- .addImm(AMDGPU::sub0)
- .addReg(DWord1)
- .addImm(AMDGPU::sub1)
- .addReg(DWord2)
- .addImm(AMDGPU::sub2)
- .addReg(DWord3)
- .addImm(AMDGPU::sub3);
- MI->setDesc(get(NewOpcode));
- if (MI->getOperand(2).isReg()) {
- MI->getOperand(2).setReg(SRsrc);
- } else {
- MI->getOperand(2).ChangeToRegister(SRsrc, false);
- }
- MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0));
- MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset));
- MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // glc
- MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // slc
- MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // tfe
-
- const TargetRegisterClass *NewDstRC =
- RI.getRegClass(get(NewOpcode).OpInfo[0].RegClass);
-
- unsigned DstReg = MI->getOperand(0).getReg();
+ .addReg(DWord0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DWord1)
+ .addImm(AMDGPU::sub1)
+ .addReg(DWord2)
+ .addImm(AMDGPU::sub2)
+ .addReg(DWord3)
+ .addImm(AMDGPU::sub3);
+
+ const MCInstrDesc &NewInstDesc = get(NewOpcode);
+ const TargetRegisterClass *NewDstRC
+ = RI.getRegClass(NewInstDesc.OpInfo[0].RegClass);
unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
+ unsigned DstReg = MI->getOperand(0).getReg();
MRI.replaceRegWith(DstReg, NewDstReg);
+
+ MachineInstr *NewInst =
+ BuildMI(*MBB, MI, MI->getDebugLoc(), NewInstDesc, NewDstReg)
+ .addOperand(MI->getOperand(1)) // sbase
+ .addReg(SRsrc)
+ .addImm(0)
+ .addImm(ImmOffset)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ MI->eraseFromParent();
+
+ legalizeOperands(NewInst);
+ addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
break;
}
- case AMDGPU::S_LOAD_DWORDX8_IMM:
- case AMDGPU::S_LOAD_DWORDX8_SGPR: {
+ case 32: {
MachineInstr *Lo, *Hi;
splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM,
AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi);
MI->eraseFromParent();
- moveSMRDToVALU(Lo, MRI);
- moveSMRDToVALU(Hi, MRI);
+ moveSMRDToVALU(Lo, MRI, Worklist);
+ moveSMRDToVALU(Hi, MRI, Worklist);
break;
}
- case AMDGPU::S_LOAD_DWORDX16_IMM:
- case AMDGPU::S_LOAD_DWORDX16_SGPR: {
+ case 64: {
MachineInstr *Lo, *Hi;
splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM,
AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi);
MI->eraseFromParent();
- moveSMRDToVALU(Lo, MRI);
- moveSMRDToVALU(Hi, MRI);
+ moveSMRDToVALU(Lo, MRI, Worklist);
+ moveSMRDToVALU(Hi, MRI, Worklist);
break;
}
}
@@ -2185,51 +2457,28 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
// Handle some special cases
switch (Opcode) {
default:
- if (isSMRD(Inst->getOpcode())) {
- moveSMRDToVALU(Inst, MRI);
+ if (isSMRD(*Inst)) {
+ moveSMRDToVALU(Inst, MRI, Worklist);
+ continue;
}
break;
- case AMDGPU::S_MOV_B64: {
- DebugLoc DL = Inst->getDebugLoc();
-
- // If the source operand is a register we can replace this with a
- // copy.
- if (Inst->getOperand(1).isReg()) {
- MachineInstr *Copy = BuildMI(*MBB, Inst, DL, get(TargetOpcode::COPY))
- .addOperand(Inst->getOperand(0))
- .addOperand(Inst->getOperand(1));
- Worklist.push_back(Copy);
- } else {
- // Otherwise, we need to split this into two movs, because there is
- // no 64-bit VALU move instruction.
- unsigned Reg = Inst->getOperand(0).getReg();
- unsigned Dst = split64BitImm(Worklist,
- Inst,
- MRI,
- MRI.getRegClass(Reg),
- Inst->getOperand(1));
- MRI.replaceRegWith(Reg, Dst);
- }
- Inst->eraseFromParent();
- continue;
- }
case AMDGPU::S_AND_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32);
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
Inst->eraseFromParent();
continue;
case AMDGPU::S_OR_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32);
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
Inst->eraseFromParent();
continue;
case AMDGPU::S_XOR_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32);
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
Inst->eraseFromParent();
continue;
case AMDGPU::S_NOT_B64:
- splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
+ splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
Inst->eraseFromParent();
continue;
@@ -2281,6 +2530,11 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
}
break;
+ case AMDGPU::S_ABS_I32:
+ lowerScalarAbs(Worklist, Inst);
+ Inst->eraseFromParent();
+ continue;
+
case AMDGPU::S_BFE_U64:
case AMDGPU::S_BFM_B64:
llvm_unreachable("Moving this op to VALU not implemented");
@@ -2319,7 +2573,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
Inst->addOperand(MachineOperand::CreateImm(0));
}
- addDescImplicitUseDef(NewDesc, Inst);
+ Inst->addImplicitDefUseOperands(*Inst->getParent()->getParent());
if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
const MachineOperand &OffsetWidthOp = Inst->getOperand(2);
@@ -2337,27 +2591,9 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
}
// Update the destination register class.
-
- const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0);
-
- switch (Opcode) {
- // For target instructions, getOpRegClass just returns the virtual
- // register class associated with the operand, so we need to find an
- // equivalent VGPR register class in order to move the instruction to the
- // VALU.
- case AMDGPU::COPY:
- case AMDGPU::PHI:
- case AMDGPU::REG_SEQUENCE:
- case AMDGPU::INSERT_SUBREG:
- if (RI.hasVGPRs(NewDstRC))
- continue;
- NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
- if (!NewDstRC)
- continue;
- break;
- default:
- break;
- }
+ const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst);
+ if (!NewDstRC)
+ continue;
unsigned DstReg = Inst->getOperand(0).getReg();
unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
@@ -2366,13 +2602,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
// Legalize the operands
legalizeOperands(Inst);
- for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
- E = MRI.use_end(); I != E; ++I) {
- MachineInstr &UseMI = *I->getParent();
- if (!canReadVGPR(UseMI, I.getOperandNo())) {
- Worklist.push_back(&UseMI);
- }
- }
+ addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
}
}
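
The moveToVALU driver above is a worklist fixpoint: each time an instruction is rewritten to a VALU form, addUsersToMoveToVALUWorklist queues every user of the new VGPR result that cannot read a VGPR operand, and the loop continues until no such user remains. A schematic sketch of that propagation, using hypothetical Instr and use-list types in place of MachineRegisterInfo, is:

#include <unordered_set>
#include <vector>

struct Instr {
  bool NeedsVALUForm = false;  // analogue of "cannot read a VGPR operand"
  std::vector<Instr *> Users;  // analogue of MRI.use_begin()/use_end()
  bool MovedToVALU = false;
};

// Stand-in for the moveToVALU loop: rewrite Top, then keep pulling queued
// users off the worklist until everything downstream is legal again.
static void moveToVALU(Instr &Top) {
  std::vector<Instr *> Worklist{&Top};
  std::unordered_set<Instr *> Seen{&Top};
  while (!Worklist.empty()) {
    Instr *I = Worklist.back();
    Worklist.pop_back();
    I->MovedToVALU = true;  // stand-in for the opcode rewrite + legalization
    for (Instr *U : I->Users)
      if (U->NeedsVALUForm && Seen.insert(U).second)
        Worklist.push_back(U);
  }
}

int main() {
  Instr A, B, C;
  A.Users = {&B, &C};
  B.NeedsVALUForm = true;  // B consumes A's result but cannot take a VGPR
  moveToVALU(A);
  return (A.MovedToVALU && B.MovedToVALU && !C.MovedToVALU) ? 0 : 1;
}
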
@@ -2390,6 +2620,30 @@ const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const {
return &AMDGPU::VGPR_32RegClass;
}
+void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineInstr *Inst) const {
+ MachineBasicBlock &MBB = *Inst->getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineBasicBlock::iterator MII = Inst;
+ DebugLoc DL = Inst->getDebugLoc();
+
+ MachineOperand &Dest = Inst->getOperand(0);
+ MachineOperand &Src = Inst->getOperand(1);
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg)
+ .addImm(0)
+ .addReg(Src.getReg());
+
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
+ .addReg(Src.getReg())
+ .addReg(TmpReg);
+
+ MRI.replaceRegWith(Dest.getReg(), ResultReg);
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+}
+
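
The new lowerScalarAbs path expands S_ABS_I32 without a dedicated VALU abs instruction, using the identity abs(x) = max(x, 0 - x) on 32-bit two's-complement values (a V_SUB followed by a V_MAX, as built above). A minimal standalone sketch of the same arithmetic, in plain C++ rather than the MachineInstr builder API, is:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Mirrors the V_SUB_I32 (0 - src) + V_MAX_I32 (max(src, tmp)) sequence.
// The negation wraps modulo 2^32, matching the hardware instructions.
static int32_t scalarAbs(int32_t Src) {
  int32_t Tmp = static_cast<int32_t>(0u - static_cast<uint32_t>(Src));
  return std::max(Src, Tmp);
}

int main() {
  assert(scalarAbs(-5) == 5);
  assert(scalarAbs(7) == 7);
  // INT32_MIN has no positive counterpart and maps to itself, as it would
  // with the wrapping VALU sequence.
  assert(scalarAbs(INT32_MIN) == INT32_MIN);
  return 0;
}
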
void SIInstrInfo::splitScalar64BitUnaryOp(
SmallVectorImpl<MachineInstr *> &Worklist,
MachineInstr *Inst,
@@ -2414,20 +2668,21 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
AMDGPU::sub0, Src0SubRC);
const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
- const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0);
+ const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
+ const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
- unsigned DestSub0 = MRI.createVirtualRegister(DestRC);
- MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
+ unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
+ BuildMI(MBB, MII, DL, InstDesc, DestSub0)
.addOperand(SrcReg0Sub0);
MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
AMDGPU::sub1, Src0SubRC);
- unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC);
- MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
+ unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
+ BuildMI(MBB, MII, DL, InstDesc, DestSub1)
.addOperand(SrcReg0Sub1);
- unsigned FullDestReg = MRI.createVirtualRegister(DestRC);
+ unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
.addReg(DestSub0)
.addImm(AMDGPU::sub0)
@@ -2436,10 +2691,11 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
MRI.replaceRegWith(Dest.getReg(), FullDestReg);
- // Try to legalize the operands in case we need to swap the order to keep it
- // valid.
- Worklist.push_back(LoHalf);
- Worklist.push_back(HiHalf);
+ // We don't need to legalizeOperands here because for a single operand, src0
+ // will support any kind of input.
+
+ // Move all users of this moved value.
+ addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
void SIInstrInfo::splitScalar64BitBinaryOp(
@@ -2474,9 +2730,10 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
AMDGPU::sub0, Src1SubRC);
const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
- const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0);
+ const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
+ const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
- unsigned DestSub0 = MRI.createVirtualRegister(DestRC);
+ unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
.addOperand(SrcReg0Sub0)
.addOperand(SrcReg1Sub0);
@@ -2486,12 +2743,12 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
AMDGPU::sub1, Src1SubRC);
- unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC);
+ unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
.addOperand(SrcReg0Sub1)
.addOperand(SrcReg1Sub1);
- unsigned FullDestReg = MRI.createVirtualRegister(DestRC);
+ unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
.addReg(DestSub0)
.addImm(AMDGPU::sub0)
@@ -2502,8 +2759,11 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
// Try to legalize the operands in case we need to swap the order to keep it
// valid.
- Worklist.push_back(LoHalf);
- Worklist.push_back(HiHalf);
+ legalizeOperands(LoHalf);
+ legalizeOperands(HiHalf);
+
+  // Move all users of this moved value.
+ addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
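
splitScalar64BitBinaryOp handles 64-bit scalar bitwise ops (S_AND_B64 and friends, now mapped to their 32-bit VALU equivalents in the switch above) by operating on the sub0 and sub1 halves independently and reassembling the result with REG_SEQUENCE. A scalar model of that split, as an illustration rather than the in-tree code:

#include <cassert>
#include <cstdint>

// A 64-bit bitwise op is equivalent to the same 32-bit op applied to the low
// (sub0) and high (sub1) halves, then concatenated - which is what the
// REG_SEQUENCE above expresses.
template <typename OpTy>
static uint64_t split64(uint64_t A, uint64_t B, OpTy Op32) {
  uint32_t Lo = Op32(static_cast<uint32_t>(A), static_cast<uint32_t>(B));
  uint32_t Hi = Op32(static_cast<uint32_t>(A >> 32),
                     static_cast<uint32_t>(B >> 32));
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}

int main() {
  auto And32 = [](uint32_t X, uint32_t Y) { return X & Y; };
  uint64_t A = 0xff00ff00ff00ff00ull, B = 0x0ff00ff00ff00ff0ull;
  assert(split64(A, B, And32) == (A & B));
  return 0;
}
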
void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
@@ -2532,18 +2792,19 @@ void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist
MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
AMDGPU::sub1, SrcSubRC);
- MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg)
+ BuildMI(MBB, MII, DL, InstDesc, MidReg)
.addOperand(SrcRegSub0)
.addImm(0);
- MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg)
+ BuildMI(MBB, MII, DL, InstDesc, ResultReg)
.addOperand(SrcRegSub1)
.addReg(MidReg);
MRI.replaceRegWith(Dest.getReg(), ResultReg);
- Worklist.push_back(First);
- Worklist.push_back(Second);
+  // We don't need to legalize operands here. src0 for either instruction can be
+ // an SGPR, and the second input is unused or determined here.
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
@@ -2587,6 +2848,7 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
.addImm(AMDGPU::sub1);
MRI.replaceRegWith(Dest.getReg(), ResultReg);
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
return;
}
@@ -2605,33 +2867,53 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
.addImm(AMDGPU::sub1);
MRI.replaceRegWith(Dest.getReg(), ResultReg);
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
-void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc,
- MachineInstr *Inst) const {
- // Add the implict and explicit register definitions.
- if (NewDesc.ImplicitUses) {
- for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) {
- unsigned Reg = NewDesc.ImplicitUses[i];
- Inst->addOperand(MachineOperand::CreateReg(Reg, false, true));
+void SIInstrInfo::addUsersToMoveToVALUWorklist(
+ unsigned DstReg,
+ MachineRegisterInfo &MRI,
+ SmallVectorImpl<MachineInstr *> &Worklist) const {
+ for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
+ E = MRI.use_end(); I != E; ++I) {
+ MachineInstr &UseMI = *I->getParent();
+ if (!canReadVGPR(UseMI, I.getOperandNo())) {
+ Worklist.push_back(&UseMI);
}
}
+}
- if (NewDesc.ImplicitDefs) {
- for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) {
- unsigned Reg = NewDesc.ImplicitDefs[i];
- Inst->addOperand(MachineOperand::CreateReg(Reg, true, true));
- }
+const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
+ const MachineInstr &Inst) const {
+ const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
+
+ switch (Inst.getOpcode()) {
+ // For target instructions, getOpRegClass just returns the virtual register
+ // class associated with the operand, so we need to find an equivalent VGPR
+ // register class in order to move the instruction to the VALU.
+ case AMDGPU::COPY:
+ case AMDGPU::PHI:
+ case AMDGPU::REG_SEQUENCE:
+ case AMDGPU::INSERT_SUBREG:
+ if (RI.hasVGPRs(NewDstRC))
+ return nullptr;
+
+ NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
+ if (!NewDstRC)
+ return nullptr;
+ return NewDstRC;
+ default:
+ return NewDstRC;
}
}
+// Find the one SGPR operand we are allowed to use.
unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI,
int OpIndices[3]) const {
- const MCInstrDesc &Desc = get(MI->getOpcode());
+ const MCInstrDesc &Desc = MI->getDesc();
// Find the one SGPR operand we are allowed to use.
- unsigned SGPRReg = AMDGPU::NoRegister;
-
+ //
// First we need to consider the instruction's operand requirements before
// legalizing. Some operands are required to be SGPRs, such as implicit uses
// of VCC, but we are still bound by the constant bus requirement to only use
@@ -2639,17 +2921,9 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI,
//
// If the operand's class is an SGPR, we can never move it.
- for (const MachineOperand &MO : MI->implicit_operands()) {
- // We only care about reads.
- if (MO.isDef())
- continue;
-
- if (MO.getReg() == AMDGPU::VCC)
- return AMDGPU::VCC;
-
- if (MO.getReg() == AMDGPU::FLAT_SCR)
- return AMDGPU::FLAT_SCR;
- }
+ unsigned SGPRReg = findImplicitSGPRRead(*MI);
+ if (SGPRReg != AMDGPU::NoRegister)
+ return SGPRReg;
unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
@@ -2660,15 +2934,22 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI,
break;
const MachineOperand &MO = MI->getOperand(Idx);
- if (RI.isSGPRClassID(Desc.OpInfo[Idx].RegClass))
- SGPRReg = MO.getReg();
+ if (!MO.isReg())
+ continue;
- if (MO.isReg() && RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
- UsedSGPRs[i] = MO.getReg();
- }
+ // Is this operand statically required to be an SGPR based on the operand
+ // constraints?
+ const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
+ bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
+ if (IsRequiredSGPR)
+ return MO.getReg();
- if (SGPRReg != AMDGPU::NoRegister)
- return SGPRReg;
+    // If this could be a VGPR or an SGPR, check the dynamic register class.
+ unsigned Reg = MO.getReg();
+ const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
+ if (RI.isSGPRClass(RegRC))
+ UsedSGPRs[i] = Reg;
+ }
// We don't have a required SGPR operand, so we have a bit more freedom in
// selecting operands to move.
@@ -2680,6 +2961,9 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI,
// V_FMA_F32 v0, s0, s0, s0 -> No moves
// V_FMA_F32 v0, s0, s1, s0 -> Move s1
+ // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
+ // prefer those.
+
if (UsedSGPRs[0] != AMDGPU::NoRegister) {
if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
SGPRReg = UsedSGPRs[0];
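
The selection above follows from the constant bus rule: a VALU instruction may read at most one SGPR across all of its sources, so when several source operands currently hold SGPRs the legalizer keeps the one that is reused (legalizing two operands at once) and moves the others. A small illustrative helper with the same preference, not the in-tree implementation, might look like:

#include <array>
#include <cstdint>
#include <iostream>

constexpr uint32_t NoRegister = 0;

// Pick the single SGPR an instruction is allowed to keep: prefer one that
// appears in more than one source operand, otherwise fall back to the first
// SGPR found (illustrative tie-break only).
static uint32_t pickSGPRToKeep(const std::array<uint32_t, 3> &UsedSGPRs) {
  if (UsedSGPRs[0] != NoRegister &&
      (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]))
    return UsedSGPRs[0];
  if (UsedSGPRs[1] != NoRegister && UsedSGPRs[1] == UsedSGPRs[2])
    return UsedSGPRs[1];
  for (uint32_t R : UsedSGPRs)
    if (R != NoRegister)
      return R;
  return NoRegister;
}

int main() {
  // V_FMA_F32 v0, s0, s1, s0 -> keep s0, move s1 (matching the comment above).
  std::cout << pickSGPRToKeep({10, 11, 10}) << "\n"; // prints 10
  return 0;
}
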
@@ -2720,7 +3004,7 @@ MachineInstrBuilder SIInstrInfo::buildIndirectRead(
unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister(
getIndirectIndexBegin(*MBB->getParent()));
- return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC))
+ return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC_V1))
.addOperand(I->getOperand(0))
.addOperand(I->getOperand(1))
.addReg(IndirectBaseReg)
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 5053786a39f5..307ef67ed263 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -39,14 +39,11 @@ private:
unsigned SubIdx,
const TargetRegisterClass *SubRC) const;
- unsigned split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist,
- MachineBasicBlock::iterator MI,
- MachineRegisterInfo &MRI,
- const TargetRegisterClass *RC,
- const MachineOperand &Op) const;
-
void swapOperands(MachineBasicBlock::iterator Inst) const;
+ void lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineInstr *Inst) const;
+
void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
MachineInstr *Inst, unsigned Opcode) const;
@@ -58,13 +55,24 @@ private:
void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
MachineInstr *Inst) const;
- void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const;
+ void addUsersToMoveToVALUWorklist(
+ unsigned Reg, MachineRegisterInfo &MRI,
+ SmallVectorImpl<MachineInstr *> &Worklist) const;
+
+ const TargetRegisterClass *
+ getDestEquivalentVGPRClass(const MachineInstr &Inst) const;
bool checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
MachineInstr *MIb) const;
unsigned findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const;
+protected:
+ MachineInstr *commuteInstructionImpl(MachineInstr *MI,
+ bool NewMI,
+ unsigned OpIdx0,
+ unsigned OpIdx1) const override;
+
public:
explicit SIInstrInfo(const AMDGPUSubtarget &st);
@@ -117,17 +125,14 @@ public:
// register. If there is no hardware instruction that can store to \p
// DstRC, then AMDGPU::COPY is returned.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const;
+
+ LLVM_READONLY
int commuteOpcode(const MachineInstr &MI) const;
- MachineInstr *commuteInstruction(MachineInstr *MI,
- bool NewMI = false) const override;
bool findCommutedOpIndices(MachineInstr *MI,
unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const override;
- bool isTriviallyReMaterializable(const MachineInstr *MI,
- AliasAnalysis *AA = nullptr) const;
-
bool areMemAccessesTriviallyDisjoint(
MachineInstr *MIa, MachineInstr *MIb,
AliasAnalysis *AA = nullptr) const override;
@@ -137,8 +142,6 @@ public:
unsigned DstReg, unsigned SrcReg) const override;
bool isMov(unsigned Opcode) const override;
- bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
-
bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
unsigned Reg, MachineRegisterInfo *MRI) const final;
@@ -148,78 +151,154 @@ public:
MachineBasicBlock::iterator &MI,
LiveVariables *LV) const override;
+ static bool isSALU(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SALU;
+ }
+
bool isSALU(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SALU;
}
+ static bool isVALU(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VALU;
+ }
+
bool isVALU(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::VALU;
}
+ static bool isSOP1(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SOP1;
+ }
+
bool isSOP1(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SOP1;
}
+ static bool isSOP2(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SOP2;
+ }
+
bool isSOP2(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SOP2;
}
+ static bool isSOPC(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SOPC;
+ }
+
bool isSOPC(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SOPC;
}
+ static bool isSOPK(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SOPK;
+ }
+
bool isSOPK(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SOPK;
}
+ static bool isSOPP(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SOPP;
+ }
+
bool isSOPP(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SOPP;
}
+ static bool isVOP1(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VOP1;
+ }
+
bool isVOP1(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::VOP1;
}
+ static bool isVOP2(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VOP2;
+ }
+
bool isVOP2(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::VOP2;
}
+ static bool isVOP3(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VOP3;
+ }
+
bool isVOP3(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::VOP3;
}
+ static bool isVOPC(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VOPC;
+ }
+
bool isVOPC(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::VOPC;
}
+ static bool isMUBUF(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::MUBUF;
+ }
+
bool isMUBUF(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::MUBUF;
}
+ static bool isMTBUF(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::MTBUF;
+ }
+
bool isMTBUF(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::MTBUF;
}
+ static bool isSMRD(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SMRD;
+ }
+
bool isSMRD(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SMRD;
}
+ static bool isDS(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::DS;
+ }
+
bool isDS(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::DS;
}
+ static bool isMIMG(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::MIMG;
+ }
+
bool isMIMG(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::MIMG;
}
+ static bool isFLAT(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::FLAT;
+ }
+
bool isFLAT(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::FLAT;
}
+ static bool isWQM(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::WQM;
+ }
+
bool isWQM(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::WQM;
}
+ static bool isVGPRSpill(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VGPRSpill;
+ }
+
bool isVGPRSpill(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill;
}
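
The block of new static overloads above all follow one pattern: each instruction format is a bit in the per-opcode TSFlags word, and a query is a single bit test against MI.getDesc().TSFlags, so a caller that already holds a MachineInstr does not need a TargetInstrInfo pointer. A hedged, self-contained sketch of the idiom (the flag names and values here are invented stand-ins for SIInstrFlags):

#include <cassert>
#include <cstdint>

// Invented flag bits; the real ones come from the tablegen'd descriptions.
enum InstFlags : uint64_t {
  SALU = 1ull << 0,
  VALU = 1ull << 1,
  SMRD = 1ull << 2,
};

struct InstrDesc { uint64_t TSFlags; };
struct MachineInstrLike {
  InstrDesc Desc;
  const InstrDesc &getDesc() const { return Desc; }
};

// Static form of the predicate: a single bit test, no TII object required.
static bool isVALULike(const MachineInstrLike &MI) {
  return MI.getDesc().TSFlags & VALU;
}

int main() {
  MachineInstrLike VectorAdd{{VALU}};
  MachineInstrLike ScalarLoad{{SMRD | SALU}};
  assert(isVALULike(VectorAdd) && !isVALULike(ScalarLoad));
  return 0;
}
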
@@ -302,6 +381,26 @@ public:
bool isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
const MachineOperand *MO = nullptr) const;
+ /// \brief Check if \p MO would be a valid operand for the given operand
+ /// definition \p OpInfo. Note this does not attempt to validate constant bus
+ /// restrictions (e.g. literal constant usage).
+ bool isLegalVSrcOperand(const MachineRegisterInfo &MRI,
+ const MCOperandInfo &OpInfo,
+ const MachineOperand &MO) const;
+
+ /// \brief Check if \p MO (a register operand) is a legal register for the
+ /// given operand description.
+ bool isLegalRegOperand(const MachineRegisterInfo &MRI,
+ const MCOperandInfo &OpInfo,
+ const MachineOperand &MO) const;
+
+ /// \brief Legalize operands in \p MI by either commuting it or inserting a
+ /// copy of src1.
+ void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr *MI) const;
+
+ /// \brief Fix operands in \p MI to satisfy constant bus requirements.
+ void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr *MI) const;
+
/// \brief Legalize all operands in this instruction. This function may
/// create new instruction and insert them before \p MI.
void legalizeOperands(MachineInstr *MI) const;
@@ -312,7 +411,8 @@ public:
unsigned HalfImmOp, unsigned HalfSGPROp,
MachineInstr *&Lo, MachineInstr *&Hi) const;
- void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const;
+ void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI,
+ SmallVectorImpl<MachineInstr *> &Worklist) const;
/// \brief Replace this instruction's opcode with the equivalent VALU
/// opcode. This function will also move the users of \p MI to the
@@ -341,29 +441,49 @@ public:
void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I,
unsigned SavReg, unsigned IndexReg) const;
- void insertNOPs(MachineBasicBlock::iterator MI, int Count) const;
+ void insertWaitStates(MachineBasicBlock::iterator MI, int Count) const;
/// \brief Returns the operand named \p Op. If \p MI does not have an
/// operand named \c Op, this function returns nullptr.
+ LLVM_READONLY
MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const;
+ LLVM_READONLY
const MachineOperand *getNamedOperand(const MachineInstr &MI,
unsigned OpName) const {
return getNamedOperand(const_cast<MachineInstr &>(MI), OpName);
}
+ /// Get required immediate operand
+ int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const {
+ int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
+ return MI.getOperand(Idx).getImm();
+ }
+
uint64_t getDefaultRsrcDataFormat() const;
uint64_t getScratchRsrcWords23() const;
};
namespace AMDGPU {
-
+ LLVM_READONLY
int getVOPe64(uint16_t Opcode);
+
+ LLVM_READONLY
int getVOPe32(uint16_t Opcode);
+
+ LLVM_READONLY
int getCommuteRev(uint16_t Opcode);
+
+ LLVM_READONLY
int getCommuteOrig(uint16_t Opcode);
+
+ LLVM_READONLY
int getAddr64Inst(uint16_t Opcode);
+
+ LLVM_READONLY
int getAtomicRetOp(uint16_t Opcode);
+
+ LLVM_READONLY
int getAtomicNoRetOp(uint16_t Opcode);
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 8d8110bca4c5..10f2adde4867 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -8,9 +8,9 @@
//===----------------------------------------------------------------------===//
def isCI : Predicate<"Subtarget->getGeneration() "
">= AMDGPUSubtarget::SEA_ISLANDS">;
-def isVI : Predicate <
- "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
- AssemblerPredicate<"FeatureGCN3Encoding">;
+def isCIOnly : Predicate<"Subtarget->getGeneration() =="
+ "AMDGPUSubtarget::SEA_ISLANDS">,
+ AssemblerPredicate <"FeatureSeaIslands">;
def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">;
@@ -69,6 +69,15 @@ class sopk <bits<5> si, bits<5> vi = si> {
field bits<5> VI = vi;
}
+// Specify an SMRD opcode for SI and SMEM opcode for VI
+
+// FIXME: This should really be bits<5> si, but TableGen crashes if a
+// parameter's default value is another parameter with a different bit size.
+class smrd<bits<8> si, bits<8> vi = si> {
+ field bits<5> SI = si{4-0};
+ field bits<8> VI = vi;
+}
+
// Except for the NONE field, this must be kept in sync with the SISubtarget enum
// in AMDGPUInstrInfo.cpp
def SISubtarget {
@@ -121,9 +130,20 @@ def SIsampled : SDSample<"AMDGPUISD::SAMPLED">;
def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">;
def SIconstdata_ptr : SDNode<
- "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]>
+ "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, i64>,
+ SDTCisVT<0, i64>]>
>;
+def mubuf_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
+ return isGlobalLoad(cast<LoadSDNode>(N)) ||
+ isConstantLoad(cast<LoadSDNode>(N), -1);
+}]>;
+
+def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
+ return isConstantLoad(cast<LoadSDNode>(N), -1) &&
+ static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N);
+}]>;
+
//===----------------------------------------------------------------------===//
// SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1
// to be glued to the memory instructions.
@@ -328,9 +348,9 @@ class SGPRImm <dag frag> : PatLeaf<frag, [{
static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
U != E; ++U) {
- if (SIRI->isSGPRClass(getOperandRegClass(*U, U.getOperandNo()))) {
+ const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
+ if (RC && SIRI->isSGPRClass(RC))
return true;
- }
}
return false;
}]>;
@@ -354,6 +374,8 @@ def sopp_brtarget : Operand<OtherVT> {
let ParserMatchClass = SoppBrTarget;
}
+def const_ga : Operand<iPTR>;
+
include "SIInstrFormats.td"
include "VIInstrFormats.td"
@@ -393,7 +415,7 @@ def GDS01MatchClass : GDSBaseMatchClass <"parseDSOff01OptionalOps">;
class GLCBaseMatchClass <string parser> : AsmOperandClass {
let Name = "GLC"#parser;
let PredicateMethod = "isImm";
- let ParserMethod = parser;
+ let ParserMethod = parser;
let RenderMethod = "addImmOperands";
}
@@ -436,6 +458,17 @@ def ClampMatchClass : AsmOperandClass {
let RenderMethod = "addImmOperands";
}
+class SMRDOffsetBaseMatchClass <string predicate> : AsmOperandClass {
+ let Name = "SMRDOffset"#predicate;
+ let PredicateMethod = predicate;
+ let RenderMethod = "addImmOperands";
+}
+
+def SMRDOffsetMatchClass : SMRDOffsetBaseMatchClass <"isSMRDOffset">;
+def SMRDLiteralOffsetMatchClass : SMRDOffsetBaseMatchClass <
+ "isSMRDLiteralOffset"
+>;
+
let OperandType = "OPERAND_IMMEDIATE" in {
def offen : Operand<i1> {
@@ -510,6 +543,16 @@ def ClampMod : Operand <i1> {
let ParserMatchClass = ClampMatchClass;
}
+def smrd_offset : Operand <i32> {
+ let PrintMethod = "printU32ImmOperand";
+ let ParserMatchClass = SMRDOffsetMatchClass;
+}
+
+def smrd_literal_offset : Operand <i32> {
+ let PrintMethod = "printU32ImmOperand";
+ let ParserMatchClass = SMRDLiteralOffsetMatchClass;
+}
+
} // End OperandType = "OPERAND_IMMEDIATE"
def VOPDstS64 : VOPDstOperand <SReg_64>;
@@ -528,6 +571,13 @@ def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">;
def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
+def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">;
+def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">;
+def SMRDSgpr : ComplexPattern<i64, 2, "SelectSMRDSgpr">;
+def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">;
+def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">;
+def SMRDBufferSgpr : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">;
+
def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">;
def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">;
@@ -717,19 +767,6 @@ class SOP2_Real_vi<sop2 op, string opName, dag outs, dag ins, string asm> :
let AssemblerPredicates = [isVI];
}
-multiclass SOP2_SELECT_32 <sop2 op, string opName, list<dag> pattern> {
- def "" : SOP2_Pseudo <opName, (outs SReg_32:$dst),
- (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), pattern>;
-
- def _si : SOP2_Real_si <op, opName, (outs SReg_32:$dst),
- (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc),
- opName#" $dst, $src0, $src1 [$scc]">;
-
- def _vi : SOP2_Real_vi <op, opName, (outs SReg_32:$dst),
- (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc),
- opName#" $dst, $src0, $src1 [$scc]">;
-}
-
multiclass SOP2_m <sop2 op, string opName, dag outs, dag ins, string asm,
list<dag> pattern> {
@@ -758,8 +795,10 @@ multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m <
class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt,
string opName, PatLeaf cond> : SOPC <
- op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1),
- opName#" $src0, $src1", []>;
+ op, (outs), (ins rc:$src0, rc:$src1),
+ opName#" $src0, $src1", []> {
+ let Defs = [SCC];
+}
class SOPC_32<bits<7> op, string opName, PatLeaf cond = COND_NULL>
: SOPC_Helper<op, SSrc_32, i32, opName, cond>;
@@ -812,15 +851,20 @@ multiclass SOPK_32 <sopk op, string opName, list<dag> pattern> {
}
multiclass SOPK_SCC <sopk op, string opName, list<dag> pattern> {
- def "" : SOPK_Pseudo <opName, (outs SCCReg:$dst),
- (ins SReg_32:$src0, u16imm:$src1), pattern>;
+ def "" : SOPK_Pseudo <opName, (outs),
+ (ins SReg_32:$src0, u16imm:$src1), pattern> {
+ let Defs = [SCC];
+ }
+
- let DisableEncoding = "$dst" in {
- def _si : SOPK_Real_si <op, opName, (outs SCCReg:$dst),
- (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16">;
+ def _si : SOPK_Real_si <op, opName, (outs),
+ (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16"> {
+ let Defs = [SCC];
+ }
- def _vi : SOPK_Real_vi <op, opName, (outs SCCReg:$dst),
- (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16">;
+ def _vi : SOPK_Real_vi <op, opName, (outs),
+ (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16"> {
+ let Defs = [SCC];
}
}
@@ -868,35 +912,68 @@ class SMRD_Real_si <bits<5> op, string opName, bit imm, dag outs, dag ins,
}
class SMRD_Real_vi <bits<8> op, string opName, bit imm, dag outs, dag ins,
- string asm> :
- SMRD <outs, ins, asm, []>,
+ string asm, list<dag> pattern = []> :
+ SMRD <outs, ins, asm, pattern>,
SMEMe_vi <op, imm>,
SIMCInstr<opName, SISubtarget.VI> {
let AssemblerPredicates = [isVI];
}
-multiclass SMRD_m <bits<5> op, string opName, bit imm, dag outs, dag ins,
+multiclass SMRD_m <smrd op, string opName, bit imm, dag outs, dag ins,
string asm, list<dag> pattern> {
def "" : SMRD_Pseudo <opName, outs, ins, pattern>;
- def _si : SMRD_Real_si <op, opName, imm, outs, ins, asm>;
+ def _si : SMRD_Real_si <op.SI, opName, imm, outs, ins, asm>;
// glc is only applicable to scalar stores, which are not yet
// implemented.
let glc = 0 in {
- def _vi : SMRD_Real_vi <{0, 0, 0, op}, opName, imm, outs, ins, asm>;
+ def _vi : SMRD_Real_vi <op.VI, opName, imm, outs, ins, asm>;
}
}
-multiclass SMRD_Helper <bits<5> op, string opName, RegisterClass baseClass,
+multiclass SMRD_Inval <smrd op, string opName,
+ SDPatternOperator node> {
+ let hasSideEffects = 1, mayStore = 1 in {
+ def "" : SMRD_Pseudo <opName, (outs), (ins), [(node)]>;
+
+ let sbase = 0, offset = 0 in {
+ let sdst = 0 in {
+ def _si : SMRD_Real_si <op.SI, opName, 0, (outs), (ins), opName>;
+ }
+
+ let glc = 0, sdata = 0 in {
+ def _vi : SMRD_Real_vi <op.VI, opName, 0, (outs), (ins), opName>;
+ }
+ }
+ }
+}
+
+class SMEM_Inval <bits<8> op, string opName, SDPatternOperator node> :
+ SMRD_Real_vi<op, opName, 0, (outs), (ins), opName, [(node)]> {
+ let hasSideEffects = 1;
+ let mayStore = 1;
+ let sbase = 0;
+ let sdata = 0;
+ let glc = 0;
+ let offset = 0;
+}
+
+multiclass SMRD_Helper <smrd op, string opName, RegisterClass baseClass,
RegisterClass dstClass> {
defm _IMM : SMRD_m <
op, opName#"_IMM", 1, (outs dstClass:$dst),
- (ins baseClass:$sbase, u32imm:$offset),
+ (ins baseClass:$sbase, smrd_offset:$offset),
opName#" $dst, $sbase, $offset", []
>;
+ def _IMM_ci : SMRD <
+ (outs dstClass:$dst), (ins baseClass:$sbase, smrd_literal_offset:$offset),
+ opName#" $dst, $sbase, $offset", []>, SMRD_IMMe_ci <op.SI> {
+ let AssemblerPredicates = [isCIOnly];
+ }
+
defm _SGPR : SMRD_m <
op, opName#"_SGPR", 0, (outs dstClass:$dst),
(ins baseClass:$sbase, SReg_32:$soff),
@@ -922,11 +999,12 @@ def InputModsNoDefault : Operand <i32> {
let ParserMatchClass = InputModsMatchClass;
}
-class getNumSrcArgs<ValueType Src1, ValueType Src2> {
+class getNumSrcArgs<ValueType Src0, ValueType Src1, ValueType Src2> {
int ret =
- !if (!eq(Src1.Value, untyped.Value), 1, // VOP1
+ !if (!eq(Src0.Value, untyped.Value), 0,
+ !if (!eq(Src1.Value, untyped.Value), 1, // VOP1
!if (!eq(Src2.Value, untyped.Value), 2, // VOP2
- 3)); // VOP3
+ 3))); // VOP3
}
// Returns the register class to use for the destination of VOP[123C]
@@ -934,28 +1012,37 @@ class getNumSrcArgs<ValueType Src1, ValueType Src2> {
class getVALUDstForVT<ValueType VT> {
RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>,
!if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>,
- VOPDstOperand<SReg_64>)); // else VT == i1
+ !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32>,
+ VOPDstOperand<SReg_64>))); // else VT == i1
}
// Returns the register class to use for source 0 of VOP[12C]
// instructions for the given VT.
class getVOPSrc0ForVT<ValueType VT> {
- RegisterOperand ret = !if(!eq(VT.Size, 32), VSrc_32, VSrc_64);
+ RegisterOperand ret = !if(!eq(VT.Size, 64), VSrc_64, VSrc_32);
}
// Returns the register class to use for source 1 of VOP[12C] for the
// given VT.
class getVOPSrc1ForVT<ValueType VT> {
- RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32, VReg_64);
+ RegisterClass ret = !if(!eq(VT.Size, 64), VReg_64, VGPR_32);
}
// Returns the register class to use for sources of VOP3 instructions for the
// given VT.
class getVOP3SrcForVT<ValueType VT> {
- RegisterOperand ret = !if(!eq(VT.Size, 32), VCSrc_32, VCSrc_64);
+ RegisterOperand ret =
+ !if(!eq(VT.Size, 64),
+ VCSrc_64,
+ !if(!eq(VT.Value, i1.Value),
+ SCSrc_64,
+ VCSrc_32
+ )
+ );
}
// Returns 1 if the source arguments have modifiers, 0 if they do not.
+// XXX - do f16 instructions?
class hasModifiers<ValueType SrcVT> {
bit ret = !if(!eq(SrcVT.Value, f32.Value), 1,
!if(!eq(SrcVT.Value, f64.Value), 1, 0));
@@ -1009,17 +1096,20 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
// Returns the assembly string for the inputs and outputs of a VOP[12C]
// instruction. This does not add the _e32 suffix, so it can be reused
// by getAsm64.
-class getAsm32 <int NumSrcArgs> {
+class getAsm32 <bit HasDst, int NumSrcArgs> {
+ string dst = "$dst";
+ string src0 = ", $src0";
string src1 = ", $src1";
string src2 = ", $src2";
- string ret = "$dst, $src0"#
- !if(!eq(NumSrcArgs, 1), "", src1)#
- !if(!eq(NumSrcArgs, 3), src2, "");
+ string ret = !if(HasDst, dst, "") #
+ !if(!eq(NumSrcArgs, 1), src0, "") #
+ !if(!eq(NumSrcArgs, 2), src0#src1, "") #
+ !if(!eq(NumSrcArgs, 3), src0#src1#src2, "");
}
// Returns the assembly string for the inputs and outputs of a VOP3
// instruction.
-class getAsm64 <int NumSrcArgs, bit HasModifiers> {
+class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers> {
string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
string src1 = !if(!eq(NumSrcArgs, 1), "",
!if(!eq(NumSrcArgs, 2), " $src1_modifiers",
@@ -1027,11 +1117,10 @@ class getAsm64 <int NumSrcArgs, bit HasModifiers> {
string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", "");
string ret =
!if(!eq(HasModifiers, 0),
- getAsm32<NumSrcArgs>.ret,
+ getAsm32<HasDst, NumSrcArgs>.ret,
"$dst, "#src0#src1#src2#"$clamp"#"$omod");
}
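
getAsm32 and getAsm64 now take an explicit HasDst bit along with NumSrcArgs, so profiles with no destination or no sources (for example the new VOP_NONE) get a sensible operand string instead of an unconditional "$dst, $src0". A C++ rendering of the getAsm32 computation, purely illustrative:

#include <iostream>
#include <string>

// Same selection the TableGen !if chain performs: optional "$dst" followed by
// one comma-separated "$srcN" per source operand.
static std::string getAsm32(bool HasDst, int NumSrcArgs) {
  std::string Ret = HasDst ? "$dst" : "";
  const char *Srcs[] = {", $src0", ", $src1", ", $src2"};
  for (int I = 0; I < NumSrcArgs && I < 3; ++I)
    Ret += Srcs[I];
  return Ret;
}

int main() {
  std::cout << getAsm32(true, 2) << "\n";  // "$dst, $src0, $src1"
  std::cout << getAsm32(false, 0) << "\n"; // "" (e.g. a VOP_NONE profile)
  return 0;
}
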
-
class VOPProfile <list<ValueType> _ArgVT> {
field list<ValueType> ArgVT = _ArgVT;
@@ -1047,29 +1136,38 @@ class VOPProfile <list<ValueType> _ArgVT> {
field RegisterOperand Src1RC64 = getVOP3SrcForVT<Src1VT>.ret;
field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret;
- field int NumSrcArgs = getNumSrcArgs<Src1VT, Src2VT>.ret;
+ field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1);
+ field bit HasDst32 = HasDst;
+ field int NumSrcArgs = getNumSrcArgs<Src0VT, Src1VT, Src2VT>.ret;
field bit HasModifiers = hasModifiers<Src0VT>.ret;
- field dag Outs = (outs DstRC:$dst);
+ field dag Outs = !if(HasDst,(outs DstRC:$dst),(outs));
+
+ // VOP3b instructions are a special case with a second explicit
+ // output. This is manually overridden for them.
+ field dag Outs32 = Outs;
+ field dag Outs64 = Outs;
field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret;
field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
HasModifiers>.ret;
- field string Asm32 = getAsm32<NumSrcArgs>.ret;
- field string Asm64 = getAsm64<NumSrcArgs, HasModifiers>.ret;
+ field string Asm32 = getAsm32<HasDst, NumSrcArgs>.ret;
+ field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers>.ret;
}
// FIXME: I think these F16/I16 profiles will need to use f16/i16 types in order
// for the instruction patterns to work.
-def VOP_F16_F16 : VOPProfile <[f32, f32, untyped, untyped]>;
-def VOP_F16_I16 : VOPProfile <[f32, i32, untyped, untyped]>;
-def VOP_I16_F16 : VOPProfile <[i32, f32, untyped, untyped]>;
+def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>;
+def VOP_F16_I16 : VOPProfile <[f16, i32, untyped, untyped]>;
+def VOP_I16_F16 : VOPProfile <[i32, f16, untyped, untyped]>;
-def VOP_F16_F16_F16 : VOPProfile <[f32, f32, f32, untyped]>;
-def VOP_F16_F16_I16 : VOPProfile <[f32, f32, i32, untyped]>;
+def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>;
+def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i32, untyped]>;
def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>;
+def VOP_NONE : VOPProfile <[untyped, untyped, untyped, untyped]>;
+
def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>;
def VOP_F32_F64 : VOPProfile <[f32, f64, untyped, untyped]>;
def VOP_F32_I32 : VOPProfile <[f32, i32, untyped, untyped]>;
@@ -1087,25 +1185,76 @@ def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>;
def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>;
def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
-def VOP_I32_I32_I32_VCC : VOPProfile <[i32, i32, i32, untyped]> {
+
+// Write out to vcc or arbitrary SGPR.
+def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> {
+ let Asm32 = "$dst, vcc, $src0, $src1";
+ let Asm64 = "$dst, $sdst, $src0, $src1";
+ let Outs32 = (outs DstRC:$dst);
+ let Outs64 = (outs DstRC:$dst, SReg_64:$sdst);
+}
+
+// Write out to vcc or arbitrary SGPR and read in from vcc or
+// arbitrary SGPR.
+def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
+ // We use VCSrc_32 to exclude literal constants, even though the
+ // encoding normally allows them since the implicit VCC use means
+ // using one would always violate the constant bus
+ // restriction. SGPRs are still allowed because it should
+ // technically be possible to use VCC again as src0.
let Src0RC32 = VCSrc_32;
+ let Asm32 = "$dst, vcc, $src0, $src1, vcc";
+ let Asm64 = "$dst, $sdst, $src0, $src1, $src2";
+ let Outs32 = (outs DstRC:$dst);
+ let Outs64 = (outs DstRC:$dst, SReg_64:$sdst);
+
+  // Suppress the src2 implied by the type, since the 32-bit encoding has an
+  // implicit VCC use instead.
+ let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);
}
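
The VOP2b profiles above describe the kind of carry chain used for wide integer adds: the 32-bit add produces a carry-out (written to VCC in the VOP2 encoding, or to an arbitrary SGPR via the extra $sdst output in the VOP3b form), and the _I1 variant also consumes a carry-in. A minimal model of that semantics, as a sketch only:

#include <cassert>
#include <cstdint>

// 32-bit add with carry-in and carry-out, the operation these VOP2b/VOP3b
// profiles wire up; the carry bit is what lets two 32-bit adds form a 64-bit
// add.
static uint32_t addc32(uint32_t A, uint32_t B, bool CarryIn, bool &CarryOut) {
  uint64_t Wide = static_cast<uint64_t>(A) + B + (CarryIn ? 1u : 0u);
  CarryOut = (Wide >> 32) != 0;
  return static_cast<uint32_t>(Wide);
}

int main() {
  bool Carry = false;
  uint32_t Lo = addc32(0xffffffffu, 1u, false, Carry); // low half overflows
  assert(Lo == 0 && Carry);
  uint32_t Hi = addc32(0u, 0u, Carry, Carry);          // carry feeds the high half
  assert(Hi == 1 && !Carry);
  return 0;
}
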
-def VOP_I1_F32_I32 : VOPProfile <[i1, f32, i32, untyped]> {
- let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
- let Asm64 = "$dst, $src0_modifiers, $src1";
+class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> {
+ let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+ let Asm64 = "$vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod";
+}
+
+def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32> {
+ // FIXME: Hack to stop printing _e64
+ let DstRC = RegisterOperand<VGPR_32>;
}
-def VOP_I1_F64_I32 : VOPProfile <[i1, f64, i32, untyped]> {
+def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64> {
+ // FIXME: Hack to stop printing _e64
+ let DstRC = RegisterOperand<VReg_64>;
+}
+
+// VOPC instructions are a special case because for the 32-bit
+// encoding, we want to display the implicit vcc write as if it were
+// an explicit $dst.
+class VOPC_Profile<ValueType vt0, ValueType vt1 = vt0> : VOPProfile <[i1, vt0, vt1, untyped]> {
+ let Asm32 = "vcc, $src0, $src1";
+ // The destination for 32-bit encoding is implicit.
+ let HasDst32 = 0;
+}
+
+class VOPC_Class_Profile<ValueType vt> : VOPC_Profile<vt, i32> {
let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
let Asm64 = "$dst, $src0_modifiers, $src1";
}
+def VOPC_I1_F32_F32 : VOPC_Profile<f32>;
+def VOPC_I1_F64_F64 : VOPC_Profile<f64>;
+def VOPC_I1_I32_I32 : VOPC_Profile<i32>;
+def VOPC_I1_I64_I64 : VOPC_Profile<i64>;
+
+def VOPC_I1_F32_I32 : VOPC_Class_Profile<f32>;
+def VOPC_I1_F64_I32 : VOPC_Class_Profile<f64>;
+
def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>;
def VOP_CNDMASK : VOPProfile <[i32, i32, i32, untyped]> {
- let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VCCReg:$src2);
+ let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);
let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, SSrc_64:$src2);
let Asm64 = "$dst, $src0, $src1, $src2";
}
@@ -1119,13 +1268,60 @@ def VOP_MAC : VOPProfile <[f32, f32, f32, f32]> {
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
HasModifiers>.ret;
- let Asm32 = getAsm32<2>.ret;
- let Asm64 = getAsm64<2, HasModifiers>.ret;
+ let Asm32 = getAsm32<1, 2>.ret;
+ let Asm64 = getAsm64<1, 2, HasModifiers>.ret;
}
def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>;
def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>;
def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>;
+class SIInstAlias <string asm, Instruction inst, VOPProfile p> :
+ InstAlias <asm, (inst)>, PredicateControl {
+
+ field bit isCompare;
+ field bit isCommutable;
+
+ let ResultInst =
+ !if (p.HasDst32,
+ !if (!eq(p.NumSrcArgs, 0),
+ // 1 dst, 0 src
+ (inst p.DstRC:$dst),
+ !if (!eq(p.NumSrcArgs, 1),
+ // 1 dst, 1 src
+ (inst p.DstRC:$dst, p.Src0RC32:$src0),
+ !if (!eq(p.NumSrcArgs, 2),
+ // 1 dst, 2 src
+ (inst p.DstRC:$dst, p.Src0RC32:$src0, p.Src1RC32:$src1),
+ // else - unreachable
+ (inst)))),
+ // else
+ !if (!eq(p.NumSrcArgs, 2),
+ // 0 dst, 2 src
+ (inst p.Src0RC32:$src0, p.Src1RC32:$src1),
+ !if (!eq(p.NumSrcArgs, 1),
+ // 0 dst, 1 src
+ (inst p.Src0RC32:$src1),
+ // else
+ // 0 dst, 0 src
+ (inst))));
+}
+
+class SIInstAliasSI <string asm, string op_name, VOPProfile p> :
+ SIInstAlias <asm, !cast<Instruction>(op_name#"_e32_si"), p> {
+ let AssemblerPredicate = SIAssemblerPredicate;
+}
+
+class SIInstAliasVI <string asm, string op_name, VOPProfile p> :
+ SIInstAlias <asm, !cast<Instruction>(op_name#"_e32_vi"), p> {
+ let AssemblerPredicates = [isVI];
+}
+
+multiclass SIInstAliasBuilder <string asm, VOPProfile p> {
+
+ def : SIInstAliasSI <asm, NAME, p>;
+
+ def : SIInstAliasVI <asm, NAME, p>;
+}
class VOP <string opName> {
string OpName = opName;
@@ -1165,20 +1361,22 @@ class VOP1_Real_vi <string opName, vop1 op, dag outs, dag ins, string asm> :
let AssemblerPredicates = [isVI];
}
-multiclass VOP1_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern,
- string opName> {
- def "" : VOP1_Pseudo <outs, ins, pattern, opName>;
+multiclass VOP1_m <vop1 op, string opName, VOPProfile p, list<dag> pattern,
+ string asm = opName#p.Asm32> {
+ def "" : VOP1_Pseudo <p.Outs, p.Ins32, pattern, opName>;
- def _si : VOP1_Real_si <opName, op, outs, ins, asm>;
+ def _si : VOP1_Real_si <opName, op, p.Outs, p.Ins32, asm>;
+
+ def _vi : VOP1_Real_vi <opName, op, p.Outs, p.Ins32, asm>;
- def _vi : VOP1_Real_vi <opName, op, outs, ins, asm>;
}
-multiclass VOP1SI_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern,
- string opName> {
- def "" : VOP1_Pseudo <outs, ins, pattern, opName>;
+multiclass VOP1SI_m <vop1 op, string opName, VOPProfile p, list<dag> pattern,
+ string asm = opName#p.Asm32> {
+
+ def "" : VOP1_Pseudo <p.Outs, p.Ins32, pattern, opName>;
- def _si : VOP1_Real_si <opName, op, outs, ins, asm>;
+ def _si : VOP1_Real_si <opName, op, p.Outs, p.Ins32, asm>;
}
class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
@@ -1202,22 +1400,24 @@ class VOP2_Real_vi <string opName, vop2 op, dag outs, dag ins, string asm> :
let AssemblerPredicates = [isVI];
}
-multiclass VOP2SI_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern,
- string opName, string revOp> {
- def "" : VOP2_Pseudo <outs, ins, pattern, opName>,
+multiclass VOP2SI_m <vop2 op, string opName, VOPProfile p, list<dag> pattern,
+ string revOp> {
+
+ def "" : VOP2_Pseudo <p.Outs32, p.Ins32, pattern, opName>,
VOP2_REV<revOp#"_e32", !eq(revOp, opName)>;
- def _si : VOP2_Real_si <opName, op, outs, ins, asm>;
+ def _si : VOP2_Real_si <opName, op, p.Outs32, p.Ins32, p.Asm32>;
}
-multiclass VOP2_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern,
- string opName, string revOp> {
- def "" : VOP2_Pseudo <outs, ins, pattern, opName>,
+multiclass VOP2_m <vop2 op, string opName, VOPProfile p, list <dag> pattern,
+ string revOp> {
+
+ def "" : VOP2_Pseudo <p.Outs32, p.Ins32, pattern, opName>,
VOP2_REV<revOp#"_e32", !eq(revOp, opName)>;
- def _si : VOP2_Real_si <opName, op, outs, ins, asm>;
+ def _si : VOP2_Real_si <opName, op, p.Outs32, p.Ins32, p.Asm32>;
- def _vi : VOP2_Real_vi <opName, op, outs, ins, asm>;
+ def _vi : VOP2_Real_vi <opName, op, p.Outs32, p.Ins32, p.Asm32>;
}
@@ -1250,6 +1450,9 @@ class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
MnemonicAlias<opName#"_e64", opName> {
let isPseudo = 1;
let isCodeGenOnly = 1;
+
+ field bit vdst;
+ field bit src0;
}
class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> :
@@ -1295,22 +1498,6 @@ multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern,
HasMods>;
}
-// VOP3_m without source modifiers
-multiclass VOP3_m_nomods <vop op, dag outs, dag ins, string asm, list<dag> pattern,
- string opName, int NumSrcArgs, bit HasMods = 1> {
-
- def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
-
- let src0_modifiers = 0,
- src1_modifiers = 0,
- src2_modifiers = 0,
- clamp = 0,
- omod = 0 in {
- def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>;
- def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>;
- }
-}
-
multiclass VOP3_1_m <vop op, dag outs, dag ins, string asm,
list<dag> pattern, string opName, bit HasMods = 1> {
@@ -1335,7 +1522,7 @@ multiclass VOP3SI_1_m <vop op, dag outs, dag ins, string asm,
multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm,
list<dag> pattern, string opName, string revOp,
- bit HasMods = 1, bit UseFullOp = 0> {
+ bit HasMods = 1> {
def "" : VOP3_Pseudo <outs, ins, pattern, opName>,
VOP2_REV<revOp#"_e64", !eq(revOp, opName)>;
@@ -1349,7 +1536,7 @@ multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm,
multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm,
list<dag> pattern, string opName, string revOp,
- bit HasMods = 1, bit UseFullOp = 0> {
+ bit HasMods = 1> {
def "" : VOP3_Pseudo <outs, ins, pattern, opName>,
VOP2_REV<revOp#"_e64", !eq(revOp, opName)>;
@@ -1360,54 +1547,41 @@ multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm,
// No VI instruction. This class is for SI only.
}
-// XXX - Is v_div_scale_{f32|f64} only available in vop3b without
-// option of implicit vcc use?
-multiclass VOP3b_2_m <vop op, dag outs, dag ins, string asm,
- list<dag> pattern, string opName, string revOp,
- bit HasMods = 1, bit UseFullOp = 0> {
- def "" : VOP3_Pseudo <outs, ins, pattern, opName>,
- VOP2_REV<revOp#"_e64", !eq(revOp, opName)>;
-
- // The VOP2 variant puts the carry out into VCC, the VOP3 variant
- // can write it into any SGPR. We currently don't use the carry out,
- // so for now hardcode it to VCC as well.
- let sdst = SIOperand.VCC, Defs = [VCC] in {
- def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>,
- VOP3DisableFields<1, 0, HasMods>;
-
- def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>,
- VOP3DisableFields<1, 0, HasMods>;
- } // End sdst = SIOperand.VCC, Defs = [VCC]
-}
-
-multiclass VOP3b_3_m <vop op, dag outs, dag ins, string asm,
- list<dag> pattern, string opName, string revOp,
- bit HasMods = 1, bit UseFullOp = 0> {
+// Two operand VOP3b instruction that may have a 3rd SGPR bool operand
+// instead of an implicit VCC as in the VOP2b format.
+multiclass VOP3b_2_3_m <vop op, dag outs, dag ins, string asm,
+ list<dag> pattern, string opName, string revOp,
+ bit HasMods = 1, bit useSrc2Input = 0> {
def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
-
def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>,
- VOP3DisableFields<1, 1, HasMods>;
+ VOP3DisableFields<1, useSrc2Input, HasMods>;
def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>,
- VOP3DisableFields<1, 1, HasMods>;
+ VOP3DisableFields<1, useSrc2Input, HasMods>;
}
multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm,
list<dag> pattern, string opName,
- bit HasMods, bit defExec, string revOp> {
+ bit HasMods, bit defExec,
+ string revOp, list<SchedReadWrite> sched> {
def "" : VOP3_Pseudo <outs, ins, pattern, opName>,
- VOP2_REV<revOp#"_e64", !eq(revOp, opName)>;
+ VOP2_REV<revOp#"_e64", !eq(revOp, opName)> {
+ let Defs = !if(defExec, [EXEC], []);
+ let SchedRW = sched;
+ }
def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
VOP3DisableFields<1, 0, HasMods> {
let Defs = !if(defExec, [EXEC], []);
+ let SchedRW = sched;
}
def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>,
VOP3DisableFields<1, 0, HasMods> {
let Defs = !if(defExec, [EXEC], []);
+ let SchedRW = sched;
}
}
@@ -1432,32 +1606,28 @@ multiclass VOP2SI_3VI_m <vop3 op, string opName, dag outs, dag ins,
}
}
-multiclass VOP1_Helper <vop1 op, string opName, dag outs,
- dag ins32, string asm32, list<dag> pat32,
- dag ins64, string asm64, list<dag> pat64,
- bit HasMods> {
+multiclass VOP1_Helper <vop1 op, string opName, VOPProfile p, list<dag> pat32,
+ list<dag> pat64> {
- defm _e32 : VOP1_m <op, outs, ins32, opName#asm32, pat32, opName>;
+ defm _e32 : VOP1_m <op, opName, p, pat32>;
- defm _e64 : VOP3_1_m <op, outs, ins64, opName#asm64, pat64, opName, HasMods>;
+ defm _e64 : VOP3_1_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName,
+ p.HasModifiers>;
}
multiclass VOP1Inst <vop1 op, string opName, VOPProfile P,
SDPatternOperator node = null_frag> : VOP1_Helper <
- op, opName, P.Outs,
- P.Ins32, P.Asm32, [],
- P.Ins64, P.Asm64,
+ op, opName, P, [],
!if(P.HasModifiers,
[(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
i32:$src0_modifiers, i1:$clamp, i32:$omod))))],
- [(set P.DstVT:$dst, (node P.Src0VT:$src0))]),
- P.HasModifiers
+ [(set P.DstVT:$dst, (node P.Src0VT:$src0))])
>;
multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P,
SDPatternOperator node = null_frag> {
- defm _e32 : VOP1SI_m <op, P.Outs, P.Ins32, opName#P.Asm32, [], opName>;
+ defm _e32 : VOP1SI_m <op, opName, P, []>;
defm _e64 : VOP3SI_1_m <op, P.Outs, P.Ins64, opName#P.Asm64,
!if(P.HasModifiers,
@@ -1467,36 +1637,33 @@ multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P,
opName, P.HasModifiers>;
}
-multiclass VOP2_Helper <vop2 op, string opName, dag outs,
- dag ins32, string asm32, list<dag> pat32,
- dag ins64, string asm64, list<dag> pat64,
- string revOp, bit HasMods> {
- defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>;
+multiclass VOP2_Helper <vop2 op, string opName, VOPProfile p, list<dag> pat32,
+ list<dag> pat64, string revOp> {
- defm _e64 : VOP3_2_m <op,
- outs, ins64, opName#asm64, pat64, opName, revOp, HasMods
- >;
+ defm _e32 : VOP2_m <op, opName, p, pat32, revOp>;
+
+ defm _e64 : VOP3_2_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName,
+ revOp, p.HasModifiers>;
}
multiclass VOP2Inst <vop2 op, string opName, VOPProfile P,
SDPatternOperator node = null_frag,
string revOp = opName> : VOP2_Helper <
- op, opName, P.Outs,
- P.Ins32, P.Asm32, [],
- P.Ins64, P.Asm64,
+ op, opName, P, [],
!if(P.HasModifiers,
[(set P.DstVT:$dst,
(node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod)),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
[(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
- revOp, P.HasModifiers
+ revOp
>;
multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P,
SDPatternOperator node = null_frag,
string revOp = opName> {
- defm _e32 : VOP2SI_m <op, P.Outs, P.Ins32, P.Asm32, [], opName, revOp>;
+
+ defm _e32 : VOP2SI_m <op, opName, P, [], revOp>;
defm _e64 : VOP3SI_2_m <op, P.Outs, P.Ins64, opName#P.Asm64,
!if(P.HasModifiers,
@@ -1508,58 +1675,55 @@ multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P,
opName, revOp, P.HasModifiers>;
}
-multiclass VOP2b_Helper <vop2 op, string opName, dag outs,
- dag ins32, string asm32, list<dag> pat32,
- dag ins64, string asm64, list<dag> pat64,
- string revOp, bit HasMods> {
+multiclass VOP2b_Helper <vop2 op, string opName, VOPProfile p,
+ list<dag> pat32, list<dag> pat64,
+ string revOp, bit useSGPRInput> {
- defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>;
+ let SchedRW = [Write32Bit, WriteSALU] in {
+ let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in {
+ defm _e32 : VOP2_m <op, opName, p, pat32, revOp>;
+ }
- defm _e64 : VOP3b_2_m <op,
- outs, ins64, opName#asm64, pat64, opName, revOp, HasMods
- >;
+ defm _e64 : VOP3b_2_3_m <op, p.Outs64, p.Ins64, opName#p.Asm64, pat64,
+ opName, revOp, p.HasModifiers, useSGPRInput>;
+ }
}
multiclass VOP2bInst <vop2 op, string opName, VOPProfile P,
SDPatternOperator node = null_frag,
string revOp = opName> : VOP2b_Helper <
- op, opName, P.Outs,
- P.Ins32, P.Asm32, [],
- P.Ins64, P.Asm64,
+ op, opName, P, [],
!if(P.HasModifiers,
[(set P.DstVT:$dst,
(node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod)),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
[(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
- revOp, P.HasModifiers
+ revOp, !eq(P.NumSrcArgs, 3)
>;
// A VOP2 instruction that is VOP3-only on VI.
-multiclass VOP2_VI3_Helper <vop23 op, string opName, dag outs,
- dag ins32, string asm32, list<dag> pat32,
- dag ins64, string asm64, list<dag> pat64,
- string revOp, bit HasMods> {
- defm _e32 : VOP2SI_m <op, outs, ins32, asm32, pat32, opName, revOp>;
+multiclass VOP2_VI3_Helper <vop23 op, string opName, VOPProfile p,
+ list<dag> pat32, list<dag> pat64, string revOp> {
- defm _e64 : VOP3_2_m <op, outs, ins64, opName#asm64, pat64, opName,
- revOp, HasMods>;
+ defm _e32 : VOP2SI_m <op, opName, p, pat32, revOp>;
+
+ defm _e64 : VOP3_2_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName,
+ revOp, p.HasModifiers>;
}
multiclass VOP2_VI3_Inst <vop23 op, string opName, VOPProfile P,
SDPatternOperator node = null_frag,
string revOp = opName>
: VOP2_VI3_Helper <
- op, opName, P.Outs,
- P.Ins32, P.Asm32, [],
- P.Ins64, P.Asm64,
+ op, opName, P, [],
!if(P.HasModifiers,
[(set P.DstVT:$dst,
(node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod)),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
[(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
- revOp, P.HasModifiers
+ revOp
>;
multiclass VOP2MADK <vop2 op, string opName, list<dag> pattern = []> {
@@ -1583,64 +1747,75 @@ let isCodeGenOnly = 0 in {
} // End isCodeGenOnly = 0
}
-class VOPC_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
+class VOPC_Pseudo <dag ins, list<dag> pattern, string opName> :
VOPCCommon <ins, "", pattern>,
VOP <opName>,
- SIMCInstr<opName#"_e32", SISubtarget.NONE>,
- MnemonicAlias<opName#"_e32", opName> {
+ SIMCInstr<opName#"_e32", SISubtarget.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
-multiclass VOPC_m <vopc op, dag outs, dag ins, string asm, list<dag> pattern,
- string opName, bit DefExec, string revOpName = ""> {
- def "" : VOPC_Pseudo <outs, ins, pattern, opName>;
-
- def _si : VOPC<op.SI, ins, asm, []>,
- SIMCInstr <opName#"_e32", SISubtarget.SI> {
- let Defs = !if(DefExec, [EXEC], []);
- let hasSideEffects = DefExec;
- let AssemblerPredicates = [isSICI];
+multiclass VOPC_m <vopc op, dag ins, string op_asm, list<dag> pattern,
+ string opName, bit DefExec, VOPProfile p,
+ list<SchedReadWrite> sched,
+ string revOpName = "", string asm = opName#"_e32 "#op_asm,
+ string alias_asm = opName#" "#op_asm> {
+ def "" : VOPC_Pseudo <ins, pattern, opName> {
+ let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+ let SchedRW = sched;
}
- def _vi : VOPC<op.VI, ins, asm, []>,
- SIMCInstr <opName#"_e32", SISubtarget.VI> {
- let Defs = !if(DefExec, [EXEC], []);
- let hasSideEffects = DefExec;
- let AssemblerPredicates = [isVI];
- }
+ let AssemblerPredicates = [isSICI] in {
+ def _si : VOPC<op.SI, ins, asm, []>,
+ SIMCInstr <opName#"_e32", SISubtarget.SI> {
+ let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+ let hasSideEffects = DefExec;
+ let SchedRW = sched;
+ }
+
+ } // End AssemblerPredicates = [isSICI]
+
+ let AssemblerPredicates = [isVI] in {
+ def _vi : VOPC<op.VI, ins, asm, []>,
+ SIMCInstr <opName#"_e32", SISubtarget.VI> {
+ let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+ let hasSideEffects = DefExec;
+ let SchedRW = sched;
+ }
+
+ } // End AssemblerPredicates = [isVI]
+
+ defm : SIInstAliasBuilder<alias_asm, p>;
}
-multiclass VOPC_Helper <vopc op, string opName,
- dag ins32, string asm32, list<dag> pat32,
- dag out64, dag ins64, string asm64, list<dag> pat64,
- bit HasMods, bit DefExec, string revOp> {
- defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>;
+multiclass VOPC_Helper <vopc op, string opName, list<dag> pat32,
+ list<dag> pat64, bit DefExec, string revOp,
+ VOPProfile p, list<SchedReadWrite> sched> {
+ defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched>;
- defm _e64 : VOP3_C_m <op, out64, ins64, opName#asm64, pat64,
- opName, HasMods, DefExec, revOp>;
+ defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$dst), p.Ins64, opName#p.Asm64, pat64,
+ opName, p.HasModifiers, DefExec, revOp, sched>;
}
// Special case for class instructions which only have modifiers on
// the 1st source operand.
-multiclass VOPC_Class_Helper <vopc op, string opName,
- dag ins32, string asm32, list<dag> pat32,
- dag out64, dag ins64, string asm64, list<dag> pat64,
- bit HasMods, bit DefExec, string revOp> {
- defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>;
-
- defm _e64 : VOP3_C_m <op, out64, ins64, opName#asm64, pat64,
- opName, HasMods, DefExec, revOp>,
+multiclass VOPC_Class_Helper <vopc op, string opName, list<dag> pat32,
+ list<dag> pat64, bit DefExec, string revOp,
+ VOPProfile p, list<SchedReadWrite> sched> {
+ defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched>;
+
+ defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$dst), p.Ins64, opName#p.Asm64, pat64,
+ opName, p.HasModifiers, DefExec, revOp, sched>,
VOP3DisableModFields<1, 0, 0>;
}
multiclass VOPCInst <vopc op, string opName,
VOPProfile P, PatLeaf cond = COND_NULL,
string revOp = opName,
- bit DefExec = 0> : VOPC_Helper <
- op, opName,
- P.Ins32, P.Asm32, [],
- (outs VOPDstS64:$dst), P.Ins64, P.Asm64,
+ bit DefExec = 0,
+ list<SchedReadWrite> sched = [Write32Bit]> :
+ VOPC_Helper <
+ op, opName, [],
!if(P.HasModifiers,
[(set i1:$dst,
(setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
@@ -1648,51 +1823,51 @@ multiclass VOPCInst <vopc op, string opName,
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
cond))],
[(set i1:$dst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]),
- P.HasModifiers, DefExec, revOp
+ DefExec, revOp, P, sched
>;
multiclass VOPCClassInst <vopc op, string opName, VOPProfile P,
- bit DefExec = 0> : VOPC_Class_Helper <
- op, opName,
- P.Ins32, P.Asm32, [],
- (outs VOPDstS64:$dst), P.Ins64, P.Asm64,
+ bit DefExec = 0,
+ list<SchedReadWrite> sched> : VOPC_Class_Helper <
+ op, opName, [],
!if(P.HasModifiers,
[(set i1:$dst,
(AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))],
[(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]),
- P.HasModifiers, DefExec, opName
+ DefExec, opName, P, sched
>;
multiclass VOPC_F32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
- VOPCInst <op, opName, VOP_F32_F32_F32, cond, revOp>;
+ VOPCInst <op, opName, VOPC_I1_F32_F32, cond, revOp>;
multiclass VOPC_F64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
- VOPCInst <op, opName, VOP_F64_F64_F64, cond, revOp>;
+ VOPCInst <op, opName, VOPC_I1_F64_F64, cond, revOp, 0, [WriteDoubleAdd]>;
multiclass VOPC_I32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
- VOPCInst <op, opName, VOP_I32_I32_I32, cond, revOp>;
+ VOPCInst <op, opName, VOPC_I1_I32_I32, cond, revOp>;
multiclass VOPC_I64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
- VOPCInst <op, opName, VOP_I64_I64_I64, cond, revOp>;
+ VOPCInst <op, opName, VOPC_I1_I64_I64, cond, revOp, 0, [Write64Bit]>;
multiclass VOPCX <vopc op, string opName, VOPProfile P,
PatLeaf cond = COND_NULL,
+ list<SchedReadWrite> sched,
string revOp = "">
- : VOPCInst <op, opName, P, cond, revOp, 1>;
+ : VOPCInst <op, opName, P, cond, revOp, 1, sched>;
multiclass VOPCX_F32 <vopc op, string opName, string revOp = opName> :
- VOPCX <op, opName, VOP_F32_F32_F32, COND_NULL, revOp>;
+ VOPCX <op, opName, VOPC_I1_F32_F32, COND_NULL, [Write32Bit], revOp>;
multiclass VOPCX_F64 <vopc op, string opName, string revOp = opName> :
- VOPCX <op, opName, VOP_F64_F64_F64, COND_NULL, revOp>;
+ VOPCX <op, opName, VOPC_I1_F64_F64, COND_NULL, [WriteDoubleAdd], revOp>;
multiclass VOPCX_I32 <vopc op, string opName, string revOp = opName> :
- VOPCX <op, opName, VOP_I32_I32_I32, COND_NULL, revOp>;
+ VOPCX <op, opName, VOPC_I1_I32_I32, COND_NULL, [Write32Bit], revOp>;
multiclass VOPCX_I64 <vopc op, string opName, string revOp = opName> :
- VOPCX <op, opName, VOP_I64_I64_I64, COND_NULL, revOp>;
+ VOPCX <op, opName, VOPC_I1_I64_I64, COND_NULL, [Write64Bit], revOp>;
multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm,
list<dag> pat, int NumSrcArgs, bit HasMods> : VOP3_m <
@@ -1700,16 +1875,16 @@ multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm,
>;
multiclass VOPC_CLASS_F32 <vopc op, string opName> :
- VOPCClassInst <op, opName, VOP_I1_F32_I32, 0>;
+ VOPCClassInst <op, opName, VOPC_I1_F32_I32, 0, [Write32Bit]>;
multiclass VOPCX_CLASS_F32 <vopc op, string opName> :
- VOPCClassInst <op, opName, VOP_I1_F32_I32, 1>;
+ VOPCClassInst <op, opName, VOPC_I1_F32_I32, 1, [Write32Bit]>;
multiclass VOPC_CLASS_F64 <vopc op, string opName> :
- VOPCClassInst <op, opName, VOP_I1_F64_I32, 0>;
+ VOPCClassInst <op, opName, VOPC_I1_F64_I32, 0, [WriteDoubleAdd]>;
multiclass VOPCX_CLASS_F64 <vopc op, string opName> :
- VOPCClassInst <op, opName, VOP_I1_F64_I32, 1>;
+ VOPCClassInst <op, opName, VOPC_I1_F64_I32, 1, [WriteDoubleAdd]>;
multiclass VOP3Inst <vop3 op, string opName, VOPProfile P,
SDPatternOperator node = null_frag> : VOP3_Helper <
@@ -1761,25 +1936,13 @@ multiclass VOP3_VCC_Inst <vop3 op, string opName,
3, 1
>;
-multiclass VOP3b_Helper <vop op, RegisterClass vrc, RegisterOperand arc,
- string opName, list<dag> pattern> :
- VOP3b_3_m <
- op, (outs vrc:$vdst, SReg_64:$sdst),
- (ins InputModsNoDefault:$src0_modifiers, arc:$src0,
- InputModsNoDefault:$src1_modifiers, arc:$src1,
- InputModsNoDefault:$src2_modifiers, arc:$src2,
- ClampMod:$clamp, omod:$omod),
- opName#" $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", pattern,
- opName, opName, 1, 1
+multiclass VOP3bInst <vop op, string opName, VOPProfile P, list<dag> pattern = []> :
+ VOP3b_2_3_m <
+ op, P.Outs64, P.Ins64,
+ opName#" "#P.Asm64, pattern,
+ opName, "", 1, 1
>;
-multiclass VOP3b_64 <vop3 op, string opName, list<dag> pattern> :
- VOP3b_Helper <op, VReg_64, VSrc_64, opName, pattern>;
-
-multiclass VOP3b_32 <vop3 op, string opName, list<dag> pattern> :
- VOP3b_Helper <op, VGPR_32, VSrc_32, opName, pattern>;
-
-
class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat<
(node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
@@ -1925,12 +2088,14 @@ multiclass DS_1A1D_RET <bits<8> op, string opName, RegisterClass rc,
dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds),
string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> {
- def "" : DS_Pseudo <opName, outs, ins, []>,
- AtomicNoRet<noRetOp, 1>;
+ let hasPostISelHook = 1 in {
+ def "" : DS_Pseudo <opName, outs, ins, []>,
+ AtomicNoRet<noRetOp, 1>;
- let data1 = 0 in {
- def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
- def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
+ let data1 = 0 in {
+ def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
+ def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
+ }
}
}
@@ -1939,11 +2104,13 @@ multiclass DS_1A2D_RET_m <bits<8> op, string opName, RegisterClass rc,
dag outs = (outs rc:$vdst),
string asm = opName#" $vdst, $addr, $data0, $data1"#"$offset"#"$gds"> {
- def "" : DS_Pseudo <opName, outs, ins, []>,
- AtomicNoRet<noRetOp, 1>;
+ let hasPostISelHook = 1 in {
+ def "" : DS_Pseudo <opName, outs, ins, []>,
+ AtomicNoRet<noRetOp, 1>;
- def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
- def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
+ def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
+ def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
+ }
}
multiclass DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc,
@@ -2214,7 +2381,7 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc,
defm _ADDR64 : MUBUFAtomicAddr64_m <
op, name#"_addr64", (outs),
- (ins rc:$vdata, SReg_128:$srsrc, VReg_64:$vaddr,
+ (ins rc:$vdata, VReg_64:$vaddr, SReg_128:$srsrc,
SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc),
name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#"$slc", [], 0
>;
@@ -2233,7 +2400,7 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc,
defm _RTN_ADDR64 : MUBUFAtomicAddr64_m <
op, name#"_rtn_addr64", (outs rc:$vdata),
- (ins rc:$vdata_in, SReg_128:$srsrc, VReg_64:$vaddr,
+ (ins rc:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc,
SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc),
name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#" glc"#"$slc",
[(set vt:$vdata,
@@ -2245,7 +2412,7 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc,
op, name#"_rtn_offset", (outs rc:$vdata),
(ins rc:$vdata_in, SReg_128:$srsrc, SCSrc_32:$soffset,
mbuf_offset:$offset, slc:$slc),
- name#" $vdata, $srsrc, $soffset"#"$offset"#" glc $slc",
+ name#" $vdata, $srsrc, $soffset"#"$offset"#" glc$slc",
[(set vt:$vdata,
(atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset,
i1:$slc), vt:$vdata_in))], 1
@@ -2256,6 +2423,8 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc,
} // mayStore = 1, mayLoad = 1, hasPostISelHook = 1
}
+// FIXME: tfe can't be an operand because it requires a separate
+// opcode: it needs an N+1 register class dest register.
multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass,
ValueType load_vt = i32,
SDPatternOperator ld = null_frag> {
@@ -2368,47 +2537,121 @@ multiclass MUBUF_Store_Helper <mubuf op, string name, RegisterClass vdataClass,
} // End mayLoad = 0, mayStore = 1
}
-class FLAT_Load_Helper <bits<7> op, string asm, RegisterClass regClass> :
- FLAT <op, (outs regClass:$vdst),
- (ins VReg_64:$addr, glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe),
- asm#" $vdst, $addr"#"$glc"#"$slc"#"$tfe", []> {
- let data = 0;
- let mayLoad = 1;
+// For cache invalidation instructions.
+multiclass MUBUF_Invalidate <mubuf op, string opName, SDPatternOperator node> {
+ let hasSideEffects = 1, mayStore = 1, AsmMatchConverter = "" in {
+ def "" : MUBUF_Pseudo <opName, (outs), (ins), [(node)]>;
+
+ // Set everything to 0.
+ let offset = 0, offen = 0, idxen = 0, glc = 0, vaddr = 0,
+ vdata = 0, srsrc = 0, slc = 0, tfe = 0, soffset = 0 in {
+ let addr64 = 0 in {
+ def _si : MUBUF_Real_si <op, opName, (outs), (ins), opName>;
+ }
+
+ def _vi : MUBUF_Real_vi <op, opName, (outs), (ins), opName>;
+ }
+ } // End hasSideEffects = 1, mayStore = 1, AsmMatchConverter = ""
}
-class FLAT_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> :
- FLAT <op, (outs), (ins vdataClass:$data, VReg_64:$addr,
- glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe),
- name#" $data, $addr"#"$glc"#"$slc"#"$tfe",
- []> {
+//===----------------------------------------------------------------------===//
+// FLAT classes
+//===----------------------------------------------------------------------===//
+
+class flat <bits<7> ci, bits<7> vi = ci> {
+ field bits<7> CI = ci;
+ field bits<7> VI = vi;
+}
- let mayLoad = 0;
- let mayStore = 1;
+class FLAT_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
+ FLAT <0, outs, ins, "", pattern>,
+ SIMCInstr<opName, SISubtarget.NONE> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+}
- // Encoding
- let vdst = 0;
+class FLAT_Real_ci <bits<7> op, string opName, dag outs, dag ins, string asm> :
+ FLAT <op, outs, ins, asm, []>,
+ SIMCInstr<opName, SISubtarget.SI> {
+ let AssemblerPredicate = isCIOnly;
}
-multiclass FLAT_ATOMIC <bits<7> op, string name, RegisterClass vdst_rc,
- RegisterClass data_rc = vdst_rc> {
+class FLAT_Real_vi <bits<7> op, string opName, dag outs, dag ins, string asm> :
+ FLAT <op, outs, ins, asm, []>,
+ SIMCInstr<opName, SISubtarget.VI> {
+ let AssemblerPredicate = VIAssemblerPredicate;
+}
- let mayLoad = 1, mayStore = 1 in {
- def "" : FLAT <op, (outs),
- (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc,
- tfe_flat_atomic:$tfe),
- name#" $addr, $data"#"$slc"#"$tfe", []>,
- AtomicNoRet <NAME, 0> {
- let glc = 0;
- let vdst = 0;
- }
+multiclass FLAT_AtomicRet_m <flat op, dag outs, dag ins, string asm,
+ list<dag> pattern> {
+ def "" : FLAT_Pseudo <NAME#"_RTN", outs, ins, pattern>,
+ AtomicNoRet <NAME, 1>;
- def _RTN : FLAT <op, (outs vdst_rc:$vdst),
- (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc,
- tfe_flat_atomic:$tfe),
- name#" $vdst, $addr, $data glc"#"$slc"#"$tfe", []>,
- AtomicNoRet <NAME, 1> {
- let glc = 1;
- }
+ def _ci : FLAT_Real_ci <op.CI, NAME#"_RTN", outs, ins, asm>;
+
+ def _vi : FLAT_Real_vi <op.VI, NAME#"_RTN", outs, ins, asm>;
+}
+
+multiclass FLAT_Load_Helper <flat op, string asm_name,
+ RegisterClass regClass,
+ dag outs = (outs regClass:$vdst),
+ dag ins = (ins VReg_64:$addr, glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe),
+ string asm = asm_name#" $vdst, $addr"#"$glc"#"$slc"#"$tfe"> {
+
+ let data = 0, mayLoad = 1 in {
+
+ def "" : FLAT_Pseudo <NAME, outs, ins, []>;
+
+ def _ci : FLAT_Real_ci <op.CI, NAME, outs, ins, asm>;
+
+ def _vi : FLAT_Real_vi <op.VI, NAME, outs, ins, asm>;
+ }
+}
+
+multiclass FLAT_Store_Helper <flat op, string asm_name,
+ RegisterClass vdataClass,
+ dag outs = (outs),
+ dag ins = (ins vdataClass:$data, VReg_64:$addr, glc_flat:$glc,
+ slc_flat:$slc, tfe_flat:$tfe),
+ string asm = asm_name#" $data, $addr"#"$glc"#"$slc"#"$tfe"> {
+
+ let mayLoad = 0, mayStore = 1, vdst = 0 in {
+
+ def "" : FLAT_Pseudo <NAME, outs, ins, []>;
+
+ def _ci : FLAT_Real_ci <op.CI, NAME, outs, ins, asm>;
+
+ def _vi : FLAT_Real_vi <op.VI, NAME, outs, ins, asm>;
+ }
+}
+
+multiclass FLAT_ATOMIC <flat op, string asm_name, RegisterClass vdst_rc,
+ RegisterClass data_rc = vdst_rc,
+ dag outs_noret = (outs),
+ string asm_noret = asm_name#" $addr, $data"#"$slc"#"$tfe"> {
+
+ let mayLoad = 1, mayStore = 1, glc = 0, vdst = 0 in {
+ def "" : FLAT_Pseudo <NAME, outs_noret,
+ (ins VReg_64:$addr, data_rc:$data,
+ slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), []>,
+ AtomicNoRet <NAME, 0>;
+
+ def _ci : FLAT_Real_ci <op.CI, NAME, outs_noret,
+ (ins VReg_64:$addr, data_rc:$data,
+ slc_flat_atomic:$slc, tfe_flat_atomic:$tfe),
+ asm_noret>;
+
+ def _vi : FLAT_Real_vi <op.VI, NAME, outs_noret,
+ (ins VReg_64:$addr, data_rc:$data,
+ slc_flat_atomic:$slc, tfe_flat_atomic:$tfe),
+ asm_noret>;
+ }
+
+ let glc = 1, hasPostISelHook = 1 in {
+ defm _RTN : FLAT_AtomicRet_m <op, (outs vdst_rc:$vdst),
+ (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc,
+ tfe_flat_atomic:$tfe),
+ asm_name#" $vdst, $addr, $data glc"#"$slc"#"$tfe", []>;
}
}
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index e0eeea9034b3..6f653c70aca0 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -30,7 +30,9 @@ def isGCN : Predicate<"Subtarget->getGeneration() "
">= AMDGPUSubtarget::SOUTHERN_ISLANDS">,
AssemblerPredicate<"FeatureGCN">;
def isSI : Predicate<"Subtarget->getGeneration() "
- "== AMDGPUSubtarget::SOUTHERN_ISLANDS">;
+ "== AMDGPUSubtarget::SOUTHERN_ISLANDS">,
+ AssemblerPredicate<"FeatureSouthernIslands">;
+
def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">;
def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;
@@ -62,36 +64,38 @@ let mayLoad = 1 in {
// We are using the SGPR_32 and not the SReg_32 register class for 32-bit
// SMRD instructions, because the SGPR_32 register class does not include M0
// and writing to M0 from an SMRD instruction will hang the GPU.
-defm S_LOAD_DWORD : SMRD_Helper <0x00, "s_load_dword", SReg_64, SGPR_32>;
-defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "s_load_dwordx2", SReg_64, SReg_64>;
-defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "s_load_dwordx4", SReg_64, SReg_128>;
-defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "s_load_dwordx8", SReg_64, SReg_256>;
-defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "s_load_dwordx16", SReg_64, SReg_512>;
+defm S_LOAD_DWORD : SMRD_Helper <smrd<0x00>, "s_load_dword", SReg_64, SGPR_32>;
+defm S_LOAD_DWORDX2 : SMRD_Helper <smrd<0x01>, "s_load_dwordx2", SReg_64, SReg_64>;
+defm S_LOAD_DWORDX4 : SMRD_Helper <smrd<0x02>, "s_load_dwordx4", SReg_64, SReg_128>;
+defm S_LOAD_DWORDX8 : SMRD_Helper <smrd<0x03>, "s_load_dwordx8", SReg_64, SReg_256>;
+defm S_LOAD_DWORDX16 : SMRD_Helper <smrd<0x04>, "s_load_dwordx16", SReg_64, SReg_512>;
defm S_BUFFER_LOAD_DWORD : SMRD_Helper <
- 0x08, "s_buffer_load_dword", SReg_128, SGPR_32
+ smrd<0x08>, "s_buffer_load_dword", SReg_128, SGPR_32
>;
defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper <
- 0x09, "s_buffer_load_dwordx2", SReg_128, SReg_64
+ smrd<0x09>, "s_buffer_load_dwordx2", SReg_128, SReg_64
>;
defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper <
- 0x0a, "s_buffer_load_dwordx4", SReg_128, SReg_128
+ smrd<0x0a>, "s_buffer_load_dwordx4", SReg_128, SReg_128
>;
defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper <
- 0x0b, "s_buffer_load_dwordx8", SReg_128, SReg_256
+ smrd<0x0b>, "s_buffer_load_dwordx8", SReg_128, SReg_256
>;
defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
- 0x0c, "s_buffer_load_dwordx16", SReg_128, SReg_512
+ smrd<0x0c>, "s_buffer_load_dwordx16", SReg_128, SReg_512
>;
} // mayLoad = 1
//def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>;
-//def S_DCACHE_INV : SMRD_ <0x0000001f, "s_dcache_inv", []>;
+
+defm S_DCACHE_INV : SMRD_Inval <smrd<0x1f, 0x20>, "s_dcache_inv",
+ int_amdgcn_s_dcache_inv>;
//===----------------------------------------------------------------------===//
// SOP1 Instructions
@@ -123,7 +127,7 @@ let Defs = [SCC] in {
defm S_BREV_B32 : SOP1_32 <sop1<0x0b, 0x08>, "s_brev_b32",
- [(set i32:$dst, (AMDGPUbrev i32:$src0))]
+ [(set i32:$dst, (bitreverse i32:$src0))]
>;
defm S_BREV_B64 : SOP1_64 <sop1<0x0c, 0x09>, "s_brev_b64", []>;
@@ -183,10 +187,14 @@ defm S_XNOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x2b, 0x27>, "s_xnor_saveexec_b64", []>
defm S_QUADMASK_B32 : SOP1_32 <sop1<0x2c, 0x28>, "s_quadmask_b32", []>;
defm S_QUADMASK_B64 : SOP1_64 <sop1<0x2d, 0x29>, "s_quadmask_b64", []>;
+
+let Uses = [M0] in {
defm S_MOVRELS_B32 : SOP1_32 <sop1<0x2e, 0x2a>, "s_movrels_b32", []>;
defm S_MOVRELS_B64 : SOP1_64 <sop1<0x2f, 0x2b>, "s_movrels_b64", []>;
defm S_MOVRELD_B32 : SOP1_32 <sop1<0x30, 0x2c>, "s_movreld_b32", []>;
defm S_MOVRELD_B64 : SOP1_64 <sop1<0x31, 0x2d>, "s_movreld_b64", []>;
+} // End Uses = [M0]
+
defm S_CBRANCH_JOIN : SOP1_1 <sop1<0x32, 0x2e>, "s_cbranch_join", []>;
defm S_MOV_REGRD_B32 : SOP1_32 <sop1<0x33, 0x2f>, "s_mov_regrd_b32", []>;
let Defs = [SCC] in {
@@ -354,7 +362,7 @@ def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">;
// SOPK Instructions
//===----------------------------------------------------------------------===//
-let isReMaterializable = 1 in {
+let isReMaterializable = 1, isMoveImm = 1 in {
defm S_MOVK_I32 : SOPK_32 <sopk<0x00>, "s_movk_i32", []>;
} // End isReMaterializable = 1
let Uses = [SCC] in {
@@ -438,36 +446,38 @@ def S_BRANCH : SOPP <
let isBarrier = 1;
}
-let DisableEncoding = "$scc" in {
+let Uses = [SCC] in {
def S_CBRANCH_SCC0 : SOPP <
- 0x00000004, (ins sopp_brtarget:$simm16, SCCReg:$scc),
+ 0x00000004, (ins sopp_brtarget:$simm16),
"s_cbranch_scc0 $simm16"
>;
def S_CBRANCH_SCC1 : SOPP <
- 0x00000005, (ins sopp_brtarget:$simm16, SCCReg:$scc),
+ 0x00000005, (ins sopp_brtarget:$simm16),
"s_cbranch_scc1 $simm16"
>;
-} // End DisableEncoding = "$scc"
+} // End Uses = [SCC]
+let Uses = [VCC] in {
def S_CBRANCH_VCCZ : SOPP <
- 0x00000006, (ins sopp_brtarget:$simm16, VCCReg:$vcc),
+ 0x00000006, (ins sopp_brtarget:$simm16),
"s_cbranch_vccz $simm16"
>;
def S_CBRANCH_VCCNZ : SOPP <
- 0x00000007, (ins sopp_brtarget:$simm16, VCCReg:$vcc),
+ 0x00000007, (ins sopp_brtarget:$simm16),
"s_cbranch_vccnz $simm16"
>;
+} // End Uses = [VCC]
-let DisableEncoding = "$exec" in {
+let Uses = [EXEC] in {
def S_CBRANCH_EXECZ : SOPP <
- 0x00000008, (ins sopp_brtarget:$simm16, EXECReg:$exec),
+ 0x00000008, (ins sopp_brtarget:$simm16),
"s_cbranch_execz $simm16"
>;
def S_CBRANCH_EXECNZ : SOPP <
- 0x00000009, (ins sopp_brtarget:$simm16, EXECReg:$exec),
+ 0x00000009, (ins sopp_brtarget:$simm16),
"s_cbranch_execnz $simm16"
>;
-} // End DisableEncoding = "$exec"
+} // End Uses = [EXEC]
} // End isBranch = 1
@@ -477,11 +487,11 @@ let hasSideEffects = 1 in {
def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
[(int_AMDGPU_barrier_local)]
> {
+ let SchedRW = [WriteBarrier];
let simm16 = 0;
- let isBarrier = 1;
- let hasCtrlDep = 1;
let mayLoad = 1;
let mayStore = 1;
+ let isConvergent = 1;
}
def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">;
@@ -805,9 +815,6 @@ defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmps
defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">;
defm DS_MIN_RTN_F32 : DS_1A2D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">;
defm DS_MAX_RTN_F32 : DS_1A2D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">;
-let SubtargetPredicate = isCI in {
-defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">;
-} // End isCI
defm DS_SWIZZLE_B32 : DS_1A_RET <0x35, "ds_swizzle_b32", VGPR_32>;
let mayStore = 0 in {
defm DS_READ_B32 : DS_1A_RET <0x36, "ds_read_b32", VGPR_32>;
@@ -905,11 +912,6 @@ defm DS_WRITE_SRC2_B64 : DS_1A <0xcc, "ds_write_src2_b64">;
defm DS_MIN_SRC2_F64 : DS_1A <0xd2, "ds_min_src2_f64">;
defm DS_MAX_SRC2_F64 : DS_1A <0xd3, "ds_max_src2_f64">;
-//let SubtargetPredicate = isCI in {
-// DS_CONDXCHG32_RTN_B64
-// DS_CONDXCHG32_RTN_B128
-//} // End isCI
-
//===----------------------------------------------------------------------===//
// MUBUF Instructions
//===----------------------------------------------------------------------===//
@@ -951,13 +953,13 @@ defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <
mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global
>;
defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <
- mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, global_load
+ mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, mubuf_load
>;
defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <
- mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, global_load
+ mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, mubuf_load
>;
defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <
- mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, global_load
+ mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, mubuf_load
>;
defm BUFFER_STORE_BYTE : MUBUF_Store_Helper <
@@ -1034,9 +1036,12 @@ defm BUFFER_ATOMIC_XOR : MUBUF_Atomic <
//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <mubuf<0x5e>, "buffer_atomic_fcmpswap_x2", []>; // isn't on VI
//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <mubuf<0x5f>, "buffer_atomic_fmin_x2", []>; // isn't on VI
//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <mubuf<0x60>, "buffer_atomic_fmax_x2", []>; // isn't on VI
-//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <mubuf<0x70>, "buffer_wbinvl1_sc", []>; // isn't on CI & VI
-//def BUFFER_WBINVL1_VOL : MUBUF_WBINVL1 <mubuf<0x70, 0x3f>, "buffer_wbinvl1_vol", []>; // isn't on SI
-//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <mubuf<0x71, 0x3e>, "buffer_wbinvl1", []>;
+
+let SubtargetPredicate = isSI in {
+defm BUFFER_WBINVL1_SC : MUBUF_Invalidate <mubuf<0x70>, "buffer_wbinvl1_sc", int_amdgcn_buffer_wbinvl1_sc>; // isn't on CI & VI
+}
+
+defm BUFFER_WBINVL1 : MUBUF_Invalidate <mubuf<0x71, 0x3e>, "buffer_wbinvl1", int_amdgcn_buffer_wbinvl1>;
//===----------------------------------------------------------------------===//
// MTBUF Instructions
@@ -1155,8 +1160,8 @@ defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o"
// VOP1 Instructions
//===----------------------------------------------------------------------===//
-let vdst = 0, src0 = 0 in {
-defm V_NOP : VOP1_m <vop1<0x0>, (outs), (ins), "v_nop", [], "v_nop">;
+let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in {
+defm V_NOP : VOP1Inst <vop1<0x0>, "v_nop", VOP_NONE>;
}
let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in {
@@ -1292,7 +1297,9 @@ defm V_SQRT_F64 : VOP1Inst <vop1<0x34, 0x28>, "v_sqrt_f64",
VOP_F64_F64, fsqrt
>;
-} // let SchedRW = [WriteDouble]
+} // End SchedRW = [WriteDouble]
+
+let SchedRW = [WriteQuarterRate32] in {
defm V_SIN_F32 : VOP1Inst <vop1<0x35, 0x29>, "v_sin_f32",
VOP_F32_F32, AMDGPUsin
@@ -1300,6 +1307,9 @@ defm V_SIN_F32 : VOP1Inst <vop1<0x35, 0x29>, "v_sin_f32",
defm V_COS_F32 : VOP1Inst <vop1<0x36, 0x2a>, "v_cos_f32",
VOP_F32_F32, AMDGPUcos
>;
+
+} // End SchedRW = [WriteQuarterRate32]
+
defm V_NOT_B32 : VOP1Inst <vop1<0x37, 0x2b>, "v_not_b32", VOP_I32_I32>;
defm V_BFREV_B32 : VOP1Inst <vop1<0x38, 0x2c>, "v_bfrev_b32", VOP_I32_I32>;
defm V_FFBH_U32 : VOP1Inst <vop1<0x39, 0x2d>, "v_ffbh_u32", VOP_I32_I32>;
@@ -1308,24 +1318,33 @@ defm V_FFBH_I32 : VOP1Inst <vop1<0x3b, 0x2f>, "v_ffbh_i32", VOP_I32_I32>;
defm V_FREXP_EXP_I32_F64 : VOP1Inst <vop1<0x3c,0x30>, "v_frexp_exp_i32_f64",
VOP_I32_F64
>;
+
+let SchedRW = [WriteDoubleAdd] in {
defm V_FREXP_MANT_F64 : VOP1Inst <vop1<0x3d, 0x31>, "v_frexp_mant_f64",
VOP_F64_F64
>;
-defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64", VOP_F64_F64>;
+
+defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64",
+ VOP_F64_F64
+>;
+} // End SchedRW = [WriteDoubleAdd]
+
+
defm V_FREXP_EXP_I32_F32 : VOP1Inst <vop1<0x3f, 0x33>, "v_frexp_exp_i32_f32",
VOP_I32_F32
>;
defm V_FREXP_MANT_F32 : VOP1Inst <vop1<0x40, 0x34>, "v_frexp_mant_f32",
VOP_F32_F32
>;
-let vdst = 0, src0 = 0 in {
-defm V_CLREXCP : VOP1_m <vop1<0x41,0x35>, (outs), (ins), "v_clrexcp", [],
- "v_clrexcp"
->;
+let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in {
+defm V_CLREXCP : VOP1Inst <vop1<0x41,0x35>, "v_clrexcp", VOP_NONE>;
}
+
+let Uses = [M0, EXEC] in {
defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42, 0x36>, "v_movreld_b32", VOP_I32_I32>;
defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43, 0x37>, "v_movrels_b32", VOP_I32_I32>;
defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44, 0x38>, "v_movrelsd_b32", VOP_I32_I32>;
+} // End Uses = [M0, EXEC]
// These instruction only exist on SI and CI
let SubtargetPredicate = isSICI in {
@@ -1343,7 +1362,7 @@ defm V_RSQ_LEGACY_F32 : VOP1InstSI <vop1<0x2d>, "v_rsq_legacy_f32",
VOP_F32_F32, AMDGPUrsq_legacy
>;
-} // End let SchedRW = [WriteQuarterRate32]
+} // End SchedRW = [WriteQuarterRate32]
let SchedRW = [WriteDouble] in {
@@ -1360,7 +1379,7 @@ defm V_RSQ_CLAMP_F64 : VOP1InstSI <vop1<0x32>, "v_rsq_clamp_f64",
// VINTRP Instructions
//===----------------------------------------------------------------------===//
-let Uses = [M0] in {
+let Uses = [M0, EXEC] in {
// FIXME: Specify SchedRW for VINTRP instructions.
@@ -1405,16 +1424,14 @@ defm V_INTERP_MOV_F32 : VINTRP_m <
[(set f32:$dst, (AMDGPUinterp_mov (i32 imm:$src0), (i32 imm:$attr_chan),
(i32 imm:$attr)))]>;
-} // End Uses = [M0]
+} // End Uses = [M0, EXEC]
//===----------------------------------------------------------------------===//
// VOP2 Instructions
//===----------------------------------------------------------------------===//
multiclass V_CNDMASK <vop2 op, string name> {
- defm _e32 : VOP2_m <
- op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins32, VOP_CNDMASK.Asm32, [],
- name, name>;
+ defm _e32 : VOP2_m <op, name, VOP_CNDMASK, [], name>;
defm _e64 : VOP3_m <
op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins64,
@@ -1500,34 +1517,32 @@ let isCommutable = 1 in {
defm V_MADAK_F32 : VOP2MADK <vop2<0x21, 0x18>, "v_madak_f32">;
} // End isCommutable = 1
-let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC
+let isCommutable = 1 in {
// No patterns so that the scalar instructions are always selected.
// The scalar versions will be replaced with vector when needed later.
// V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 were renamed to *_U32 in VI,
// but the VI instructions behave the same as the SI versions.
defm V_ADD_I32 : VOP2bInst <vop2<0x25, 0x19>, "v_add_i32",
- VOP_I32_I32_I32, add
+ VOP2b_I32_I1_I32_I32
>;
-defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32", VOP_I32_I32_I32>;
+defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32", VOP2b_I32_I1_I32_I32>;
defm V_SUBREV_I32 : VOP2bInst <vop2<0x27, 0x1b>, "v_subrev_i32",
- VOP_I32_I32_I32, null_frag, "v_sub_i32"
+ VOP2b_I32_I1_I32_I32, null_frag, "v_sub_i32"
>;
-let Uses = [VCC] in { // Carry-in comes from VCC
defm V_ADDC_U32 : VOP2bInst <vop2<0x28, 0x1c>, "v_addc_u32",
- VOP_I32_I32_I32_VCC
+ VOP2b_I32_I1_I32_I32_I1
>;
defm V_SUBB_U32 : VOP2bInst <vop2<0x29, 0x1d>, "v_subb_u32",
- VOP_I32_I32_I32_VCC
+ VOP2b_I32_I1_I32_I32_I1
>;
defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a, 0x1e>, "v_subbrev_u32",
- VOP_I32_I32_I32_VCC, null_frag, "v_subb_u32"
+ VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32"
>;
-} // End Uses = [VCC]
-} // End isCommutable = 1, Defs = [VCC]
+} // End isCommutable = 1
defm V_READLANE_B32 : VOP2SI_3VI_m <
vop3 <0x001, 0x289>,
@@ -1575,10 +1590,10 @@ defm V_BCNT_U32_B32 : VOP2_VI3_Inst <vop23<0x22, 0x28b>, "v_bcnt_u32_b32",
VOP_I32_I32_I32
>;
defm V_MBCNT_LO_U32_B32 : VOP2_VI3_Inst <vop23<0x23, 0x28c>, "v_mbcnt_lo_u32_b32",
- VOP_I32_I32_I32
+ VOP_I32_I32_I32, int_amdgcn_mbcnt_lo
>;
defm V_MBCNT_HI_U32_B32 : VOP2_VI3_Inst <vop23<0x24, 0x28d>, "v_mbcnt_hi_u32_b32",
- VOP_I32_I32_I32
+ VOP_I32_I32_I32, int_amdgcn_mbcnt_hi
>;
defm V_LDEXP_F32 : VOP2_VI3_Inst <vop23<0x2b, 0x288>, "v_ldexp_f32",
VOP_F32_F32_I32, AMDGPUldexp
@@ -1704,15 +1719,15 @@ defm V_DIV_FIXUP_F32 : VOP3Inst <
vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup
>;
-let SchedRW = [WriteDouble] in {
+let SchedRW = [WriteDoubleAdd] in {
defm V_DIV_FIXUP_F64 : VOP3Inst <
vop3<0x160, 0x1df>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup
>;
-} // let SchedRW = [WriteDouble]
+} // End SchedRW = [WriteDoubleAdd]
-let SchedRW = [WriteDouble] in {
+let SchedRW = [WriteDoubleAdd] in {
let isCommutable = 1 in {
defm V_ADD_F64 : VOP3Inst <vop3<0x164, 0x280>, "v_add_f64",
@@ -1735,7 +1750,7 @@ defm V_LDEXP_F64 : VOP3Inst <vop3<0x168, 0x284>, "v_ldexp_f64",
VOP_F64_F64_I32, AMDGPUldexp
>;
-} // let SchedRW = [WriteDouble]
+} // let SchedRW = [WriteDoubleAdd]
let isCommutable = 1, SchedRW = [WriteQuarterRate32] in {
@@ -1756,16 +1771,21 @@ defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c, 0x287>, "v_mul_hi_i32",
} // isCommutable = 1, SchedRW = [WriteQuarterRate32]
let SchedRW = [WriteFloatFMA, WriteSALU] in {
-defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d, 0x1e0>, "v_div_scale_f32", []>;
+defm V_DIV_SCALE_F32 : VOP3bInst <vop3<0x16d, 0x1e0>, "v_div_scale_f32",
+ VOP3b_F32_I1_F32_F32_F32
+>;
}
let SchedRW = [WriteDouble, WriteSALU] in {
// Double precision division pre-scale.
-defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e, 0x1e1>, "v_div_scale_f64", []>;
+defm V_DIV_SCALE_F64 : VOP3bInst <vop3<0x16e, 0x1e1>, "v_div_scale_f64",
+ VOP3b_F64_I1_F64_F64_F64
+>;
} // let SchedRW = [WriteDouble]
-let isCommutable = 1, Uses = [VCC] in {
+let isCommutable = 1, Uses = [VCC, EXEC] in {
+let SchedRW = [WriteFloatFMA] in {
// v_div_fmas_f32:
// result = src0 * src1 + src2
// if (vcc)
@@ -1774,6 +1794,7 @@ let isCommutable = 1, Uses = [VCC] in {
defm V_DIV_FMAS_F32 : VOP3_VCC_Inst <vop3<0x16f, 0x1e2>, "v_div_fmas_f32",
VOP_F32_F32_F32_F32, AMDGPUdiv_fmas
>;
+}
let SchedRW = [WriteDouble] in {
// v_div_fmas_f64:
@@ -1786,7 +1807,7 @@ defm V_DIV_FMAS_F64 : VOP3_VCC_Inst <vop3<0x170, 0x1e3>, "v_div_fmas_f64",
>;
} // End SchedRW = [WriteDouble]
-} // End isCommutable = 1
+} // End isCommutable = 1, Uses = [VCC, EXEC]
//def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>;
//def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>;
@@ -1835,13 +1856,13 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$dst),
(ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", []
>;
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
// 64-bit vector move instruction. This is mainly used by the SIFoldOperands
// pass to enable folding of inline immediates.
def V_MOV_B64_PSEUDO : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>;
} // end let hasSideEffects = 0, mayLoad = 0, mayStore = 0
-let hasSideEffects = 1 in {
+let hasSideEffects = 1, SALU = 1 in {
def SGPR_USE : InstSI <(outs),(ins), "", []>;
}
@@ -1921,39 +1942,9 @@ def SI_KILL : InstSI <
let Uses = [EXEC], Defs = [EXEC,VCC,M0] in {
-//defm SI_ : RegisterLoadStore <VGPR_32, FRAMEri, ADDRIndirect>;
-
-let UseNamedOperandTable = 1 in {
-
-def SI_RegisterLoad : InstSI <
+class SI_INDIRECT_SRC<RegisterClass rc> : InstSI <
(outs VGPR_32:$dst, SReg_64:$temp),
- (ins FRAMEri32:$addr, i32imm:$chan),
- "", []
-> {
- let isRegisterLoad = 1;
- let mayLoad = 1;
-}
-
-class SIRegStore<dag outs> : InstSI <
- outs,
- (ins VGPR_32:$val, FRAMEri32:$addr, i32imm:$chan),
- "", []
-> {
- let isRegisterStore = 1;
- let mayStore = 1;
-}
-
-let usesCustomInserter = 1 in {
-def SI_RegisterStorePseudo : SIRegStore<(outs)>;
-} // End usesCustomInserter = 1
-def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>;
-
-
-} // End UseNamedOperandTable = 1
-
-def SI_INDIRECT_SRC : InstSI <
- (outs VGPR_32:$dst, SReg_64:$temp),
- (ins unknown:$src, VSrc_32:$idx, i32imm:$off),
+ (ins rc:$src, VSrc_32:$idx, i32imm:$off),
"si_indirect_src $dst, $temp, $src, $idx, $off",
[]
>;
@@ -1967,6 +1958,13 @@ class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
let Constraints = "$src = $dst";
}
+// TODO: We can support indirect SGPR access.
+def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>;
+def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>;
+def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>;
+def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>;
+def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>;
+
def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>;
def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
@@ -1977,19 +1975,24 @@ def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
- let UseNamedOperandTable = 1 in {
+ let UseNamedOperandTable = 1, Uses = [EXEC] in {
def _SAVE : InstSI <
(outs),
- (ins sgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc,
- SReg_32:$scratch_offset),
+ (ins sgpr_class:$src, i32imm:$frame_idx),
"", []
- >;
+ > {
+ let mayStore = 1;
+ let mayLoad = 0;
+ }
def _RESTORE : InstSI <
(outs sgpr_class:$dst),
- (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset),
+ (ins i32imm:$frame_idx),
"", []
- >;
+ > {
+ let mayStore = 0;
+ let mayLoad = 1;
+ }
} // End UseNamedOperandTable = 1
}
@@ -2003,19 +2006,25 @@ defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
- let UseNamedOperandTable = 1, VGPRSpill = 1 in {
+ let UseNamedOperandTable = 1, VGPRSpill = 1, Uses = [EXEC] in {
def _SAVE : InstSI <
(outs),
(ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc,
SReg_32:$scratch_offset),
"", []
- >;
+ > {
+ let mayStore = 1;
+ let mayLoad = 0;
+ }
def _RESTORE : InstSI <
(outs vgpr_class:$dst),
(ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset),
"", []
- >;
+ > {
+ let mayStore = 0;
+ let mayLoad = 1;
+ }
} // End UseNamedOperandTable = 1, VGPRSpill = 1
}
@@ -2030,9 +2039,11 @@ let Defs = [SCC] in {
def SI_CONSTDATA_PTR : InstSI <
(outs SReg_64:$dst),
- (ins),
- "", [(set SReg_64:$dst, (i64 SIconstdata_ptr))]
->;
+ (ins const_ga:$ptr),
+ "", [(set SReg_64:$dst, (i64 (SIconstdata_ptr (tglobaladdr:$ptr))))]
+> {
+ let SALU = 1;
+}
} // End Defs = [SCC]
@@ -2072,84 +2083,63 @@ def : Pat <
// SMRD Patterns
//===----------------------------------------------------------------------===//
-multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
+multiclass SMRD_Pattern <string Instr, ValueType vt> {
- // 1. SI-CI: Offset as 8bit DWORD immediate
+ // 1. IMM offset
def : Pat <
- (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))),
- (vt (Instr_IMM $sbase, (as_dword_i32imm $offset)))
+ (smrd_load (SMRDImm i64:$sbase, i32:$offset)),
+ (vt (!cast<SMRD>(Instr#"_IMM") $sbase, $offset))
>;
- // 2. Offset loaded in an 32bit SGPR
+ // 2. SGPR offset
def : Pat <
- (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))),
- (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset)))))
+ (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)),
+ (vt (!cast<SMRD>(Instr#"_SGPR") $sbase, $offset))
>;
- // 3. No offset at all
def : Pat <
- (constant_load i64:$sbase),
- (vt (Instr_IMM $sbase, 0))
- >;
+ (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
+ (vt (!cast<SMRD>(Instr#"_IMM_ci") $sbase, $offset))
+ > {
+ let Predicates = [isCIOnly];
+ }
}
-multiclass SMRD_Pattern_vi <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
-
- // 1. VI: Offset as 20bit immediate in bytes
- def : Pat <
- (constant_load (add i64:$sbase, (i64 IMM20bit:$offset))),
- (vt (Instr_IMM $sbase, (as_i32imm $offset)))
- >;
-
- // 2. Offset loaded in an 32bit SGPR
- def : Pat <
- (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))),
- (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset)))))
- >;
-
- // 3. No offset at all
- def : Pat <
- (constant_load i64:$sbase),
- (vt (Instr_IMM $sbase, 0))
- >;
-}
-
-let Predicates = [isSICI] in {
-defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
-defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
-defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>;
-} // End Predicates = [isSICI]
+// Global and constant loads can be selected to either MUBUF or SMRD
+// instructions, but SMRD instructions are faster so we want the instruction
+// selector to prefer those.
+let AddedComplexity = 100 in {
-let Predicates = [isVI] in {
-defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
-defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
-defm : SMRD_Pattern_vi <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
-defm : SMRD_Pattern_vi <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
-defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
-defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>;
-defm : SMRD_Pattern_vi <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>;
-} // End Predicates = [isVI]
+defm : SMRD_Pattern <"S_LOAD_DWORD", i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX8", v32i8>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;
-let Predicates = [isSICI] in {
+// 1. Offset as an immediate
+def : Pat <
+ (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)),
+ (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset)
+>;
-// 1. Offset as 8bit DWORD immediate
+// 2. Offset loaded in a 32bit SGPR
def : Pat <
- (SIload_constant v4i32:$sbase, IMM8bitDWORD:$offset),
- (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset))
+ (SIload_constant v4i32:$sbase, (SMRDBufferSgpr i32:$offset)),
+ (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset)
>;
-} // End Predicates = [isSICI]
+let Predicates = [isCI] in {
-// 2. Offset loaded in an 32bit SGPR
def : Pat <
- (SIload_constant v4i32:$sbase, imm:$offset),
- (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset))
+ (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)),
+ (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset)
>;
+} // End Predicates = [isCI]
+
+} // End let AddedComplexity = 100
+
//===----------------------------------------------------------------------===//
// SOP1 Patterns
//===----------------------------------------------------------------------===//
@@ -2161,6 +2151,11 @@ def : Pat <
(S_MOV_B32 0), sub1))
>;
+def : Pat <
+ (i32 (smax i32:$x, (i32 (ineg i32:$x)))),
+ (S_ABS_I32 $x)
+>;
+
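The new pattern above folds the generic smax(x, ineg(x)) idiom into a single S_ABS_I32. A minimal C sketch of the equivalence it relies on (illustration only, not part of the patch; negation is done through unsigned arithmetic so the INT32_MIN case wraps the same way two's-complement hardware does):

#include <stdint.h>

// |x| written as smax(x, ineg(x)), the form the DAG pattern matches.
static inline int32_t abs_via_smax(int32_t x) {
  int32_t neg = (int32_t)(0u - (uint32_t)x); // ineg: two's-complement negate
  return x > neg ? x : neg;                  // smax: signed max
}
// abs_via_smax(-7) == 7, abs_via_smax(5) == 5, abs_via_smax(INT32_MIN) == INT32_MIN
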
//===----------------------------------------------------------------------===//
// SOP2 Patterns
//===----------------------------------------------------------------------===//
@@ -2488,6 +2483,11 @@ def : Pat <
/********** Extraction, Insertion, Building and Casting **********/
/********** ============================================ **********/
+//def : Extract_Element<i64, v2i64, 0, sub0_sub1>;
+//def : Extract_Element<i64, v2i64, 1, sub2_sub3>;
+//def : Extract_Element<f64, v2f64, 0, sub0_sub1>;
+//def : Extract_Element<f64, v2f64, 1, sub2_sub3>;
+
foreach Index = 0-2 in {
def Extract_Element_v2i32_#Index : Extract_Element <
i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -2568,11 +2568,25 @@ def : BitConvert <v2i32, i64, VReg_64>;
def : BitConvert <i64, v2i32, VReg_64>;
def : BitConvert <v2f32, i64, VReg_64>;
def : BitConvert <i64, v2f32, VReg_64>;
+def : BitConvert <v2f32, f64, VReg_64>;
def : BitConvert <v2i32, f64, VReg_64>;
+def : BitConvert <f64, v2f32, VReg_64>;
def : BitConvert <f64, v2i32, VReg_64>;
def : BitConvert <v4f32, v4i32, VReg_128>;
def : BitConvert <v4i32, v4f32, VReg_128>;
+
+def : BitConvert <v2i64, v4i32, SReg_128>;
+def : BitConvert <v4i32, v2i64, SReg_128>;
+
+def : BitConvert <v2f64, v4f32, VReg_128>;
+def : BitConvert <v2f64, v4i32, VReg_128>;
+def : BitConvert <v4f32, v2f64, VReg_128>;
+def : BitConvert <v4i32, v2f64, VReg_128>;
+
+
+
+
def : BitConvert <v8f32, v8i32, SReg_256>;
def : BitConvert <v8i32, v8f32, SReg_256>;
def : BitConvert <v8i32, v32i8, SReg_256>;
@@ -2601,10 +2615,9 @@ def : Pat <
// Prevent expanding both fneg and fabs.
-// FIXME: Should use S_OR_B32
def : Pat <
(fneg (fabs f32:$src)),
- (V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */
+ (S_OR_B32 $src, 0x80000000) /* Set sign bit */
>;
// FIXME: Should use S_OR_B32
@@ -2836,10 +2849,6 @@ class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat <
// -1. For the non-rtn variants, the manual says it does
// DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1, and setting D0 to uint_max
// will always do the increment so I'm assuming it's the same.
-//
-// We also load this -1 with s_mov_b32 / s_mov_b64 even though this
-// needs to be a VGPR. The SGPR copy pass will fix this, and it's
-// easier since there is no v_mov_b64.
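As a quick sanity check of the reasoning in the comment above, here is a small C model of the operation the manual describes (a sketch based only on the quoted formula, not part of the patch):

#include <stdint.h>

// DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1
static inline uint32_t ds_inc_u32(uint32_t mem, uint32_t d0) {
  return (mem >= d0) ? 0u : mem + 1u;
}
// With d0 == UINT32_MAX the only clamped input is UINT32_MAX itself, and it
// clamps to 0, which is what mem + 1 would wrap to anyway, so the operation
// is equivalent to an unconditional add of 1.
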
class DSAtomicIncRetPat<DS inst, ValueType vt,
Instruction LoadImm, PatFrag frag> : Pat <
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)),
@@ -2855,9 +2864,9 @@ class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat <
// 32-bit atomics.
def : DSAtomicIncRetPat<DS_INC_RTN_U32, i32,
- S_MOV_B32, si_atomic_load_add_local>;
+ V_MOV_B32_e32, si_atomic_load_add_local>;
def : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32,
- S_MOV_B32, si_atomic_load_sub_local>;
+ V_MOV_B32_e32, si_atomic_load_sub_local>;
def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, si_atomic_swap_local>;
def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, si_atomic_load_add_local>;
@@ -2874,9 +2883,9 @@ def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, si_atomic_cmp_swap_32_local>;
// 64-bit atomics.
def : DSAtomicIncRetPat<DS_INC_RTN_U64, i64,
- S_MOV_B64, si_atomic_load_add_local>;
+ V_MOV_B64_PSEUDO, si_atomic_load_add_local>;
def : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64,
- S_MOV_B64, si_atomic_load_sub_local>;
+ V_MOV_B64_PSEUDO, si_atomic_load_sub_local>;
def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, si_atomic_swap_local>;
def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, si_atomic_load_add_local>;
@@ -3019,90 +3028,46 @@ def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>;
def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>;
def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
-let SubtargetPredicate = isCI in {
-
-defm V_QSAD_PK_U16_U8 : VOP3Inst <vop3<0x173>, "v_qsad_pk_u16_u8",
- VOP_I32_I32_I32
->;
-defm V_MQSAD_U16_U8 : VOP3Inst <vop3<0x172>, "v_mqsad_u16_u8",
- VOP_I32_I32_I32
->;
-defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8",
- VOP_I32_I32_I32
->;
-
-let isCommutable = 1 in {
-defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32",
- VOP_I64_I32_I32_I64
->;
-
-// XXX - Does this set VCC?
-defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32",
- VOP_I64_I32_I32_I64
->;
-} // End isCommutable = 1
-
-// Remaining instructions:
-// FLAT_*
-// S_CBRANCH_CDBGUSER
-// S_CBRANCH_CDBGSYS
-// S_CBRANCH_CDBGSYS_OR_USER
-// S_CBRANCH_CDBGSYS_AND_USER
-// S_DCACHE_INV_VOL
-// DS_NOP
-// DS_GWS_SEMA_RELEASE_ALL
-// DS_WRAP_RTN_B32
-// DS_CNDXCHG32_RTN_B64
-// DS_WRITE_B96
-// DS_WRITE_B128
-// DS_CONDXCHG32_RTN_B128
-// DS_READ_B96
-// DS_READ_B128
-// BUFFER_LOAD_DWORDX3
-// BUFFER_STORE_DWORDX3
-
-} // End isCI
-
/********** ====================== **********/
/********** Indirect addressing **********/
/********** ====================== **********/
-multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, SI_INDIRECT_DST IndDst> {
+multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> {
// 1. Extract with offset
def : Pat<
- (eltvt (vector_extract vt:$vec, (add i32:$idx, imm:$off))),
- (SI_INDIRECT_SRC $vec, $idx, imm:$off)
+ (eltvt (extractelt vt:$vec, (add i32:$idx, imm:$off))),
+ (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, imm:$off)
>;
// 2. Extract without offset
def : Pat<
- (eltvt (vector_extract vt:$vec, i32:$idx)),
- (SI_INDIRECT_SRC $vec, $idx, 0)
+ (eltvt (extractelt vt:$vec, i32:$idx)),
+ (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, 0)
>;
// 3. Insert with offset
def : Pat<
- (vector_insert vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)),
- (IndDst $vec, $idx, imm:$off, $val)
+ (insertelt vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)),
+ (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, imm:$off, $val)
>;
// 4. Insert without offset
def : Pat<
- (vector_insert vt:$vec, eltvt:$val, i32:$idx),
- (IndDst $vec, $idx, 0, $val)
+ (insertelt vt:$vec, eltvt:$val, i32:$idx),
+ (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, 0, $val)
>;
}
-defm : SI_INDIRECT_Pattern <v2f32, f32, SI_INDIRECT_DST_V2>;
-defm : SI_INDIRECT_Pattern <v4f32, f32, SI_INDIRECT_DST_V4>;
-defm : SI_INDIRECT_Pattern <v8f32, f32, SI_INDIRECT_DST_V8>;
-defm : SI_INDIRECT_Pattern <v16f32, f32, SI_INDIRECT_DST_V16>;
+defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">;
+defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">;
+defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">;
+defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">;
-defm : SI_INDIRECT_Pattern <v2i32, i32, SI_INDIRECT_DST_V2>;
-defm : SI_INDIRECT_Pattern <v4i32, i32, SI_INDIRECT_DST_V4>;
-defm : SI_INDIRECT_Pattern <v8i32, i32, SI_INDIRECT_DST_V8>;
-defm : SI_INDIRECT_Pattern <v16i32, i32, SI_INDIRECT_DST_V16>;
+defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">;
+defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">;
+defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">;
+defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">;
//===----------------------------------------------------------------------===//
// Conversion Patterns
@@ -3215,12 +3180,12 @@ def : Pat <
def : Pat <
(i1 (trunc i32:$a)),
- (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), $a), 1)
+ (V_CMP_EQ_I32_e64 (S_AND_B32 (i32 1), $a), 1)
>;
def : Pat <
(i1 (trunc i64:$a)),
- (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1),
+ (V_CMP_EQ_I32_e64 (S_AND_B32 (i32 1),
(EXTRACT_SUBREG $a, sub0)), 1)
>;
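Both truncation patterns above reduce to testing bit 0 of the source (the low dword for i64, selected via sub0). A minimal C sketch of the semantics being matched (illustration only, not part of the patch):

#include <stdbool.h>
#include <stdint.h>

// trunc iN -> i1 keeps only the least significant bit.
static inline bool trunc_i32_to_i1(uint32_t a) { return (a & 1u) == 1u; }
// For i64 the pattern first takes sub0, i.e. the low 32 bits.
static inline bool trunc_i64_to_i1(uint64_t a) { return ((uint32_t)a & 1u) == 1u; }
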
@@ -3301,24 +3266,6 @@ def : Pat <
} // End Predicates = [isSI]
-let Predicates = [isCI] in {
-
-// Convert (x - floor(x)) to fract(x)
-def : Pat <
- (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
- (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
- (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
->;
-
-// Convert (x + (-floor(x))) to fract(x)
-def : Pat <
- (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
- (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
- (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
->;
-
-} // End Predicates = [isCI]
-
//============================================================================//
// Miscellaneous Optimization Patterns
//============================================================================//
diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp
index c319b32111fe..126f6245dfc0 100644
--- a/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -103,6 +103,10 @@ public:
return "SI Lower control flow instructions";
}
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
};
} // End anonymous namespace
@@ -140,8 +144,7 @@ void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {
DebugLoc DL = From.getDebugLoc();
BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
- .addOperand(To)
- .addReg(AMDGPU::EXEC);
+ .addOperand(To);
}
void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {
@@ -159,8 +162,7 @@ void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {
// If the exec mask is non-zero, skip the next two instructions
BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addImm(3)
- .addReg(AMDGPU::EXEC);
+ .addImm(3);
// Exec mask is zero: Export to NULL target...
BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
@@ -269,8 +271,7 @@ void SILowerControlFlowPass::Loop(MachineInstr &MI) {
.addReg(Src);
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addOperand(MI.getOperand(1))
- .addReg(AMDGPU::EXEC);
+ .addOperand(MI.getOperand(1));
MI.eraseFromParent();
}
@@ -316,7 +317,7 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) {
.addImm(0);
}
} else {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC)
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
.addImm(0)
.addOperand(Op);
}
@@ -362,9 +363,9 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int
.addReg(AMDGPU::VCC_LO);
// Compare the just read M0 value to all possible Idx values
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
- .addReg(AMDGPU::M0)
- .addReg(Idx);
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
+ .addReg(AMDGPU::M0)
+ .addReg(Idx);
// Update EXEC, save the original EXEC value to VCC
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
@@ -385,8 +386,7 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int
// Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addImm(-7)
- .addReg(AMDGPU::EXEC);
+ .addImm(-7);
// Restore EXEC
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
@@ -438,7 +438,6 @@ void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {
MachineInstr *MovRel =
BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
.addReg(Reg)
- .addReg(AMDGPU::M0, RegState::Implicit)
.addReg(Vec, RegState::Implicit);
LoadM0(MI, MovRel, Off);
@@ -460,7 +459,6 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
.addReg(Reg, RegState::Define)
.addReg(Val)
- .addReg(AMDGPU::M0, RegState::Implicit)
.addReg(Dst, RegState::Implicit);
LoadM0(MI, MovRel, Off);
@@ -486,11 +484,11 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
Next = std::next(I);
MachineInstr &MI = *I;
- if (TII->isWQM(MI.getOpcode()) || TII->isDS(MI.getOpcode()))
+ if (TII->isWQM(MI) || TII->isDS(MI))
NeedWQM = true;
// Flat uses m0 in case it needs to access LDS.
- if (TII->isFLAT(MI.getOpcode()))
+ if (TII->isFLAT(MI))
NeedFlat = true;
switch (MI.getOpcode()) {
@@ -541,7 +539,11 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
Branch(MI);
break;
- case AMDGPU::SI_INDIRECT_SRC:
+ case AMDGPU::SI_INDIRECT_SRC_V1:
+ case AMDGPU::SI_INDIRECT_SRC_V2:
+ case AMDGPU::SI_INDIRECT_SRC_V4:
+ case AMDGPU::SI_INDIRECT_SRC_V8:
+ case AMDGPU::SI_INDIRECT_SRC_V16:
IndirectSrc(MI);
break;
diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 67421e231d8d..a2fa5fd93aad 100644
--- a/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -48,6 +48,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 587ea63d6796..935aad427198 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -29,10 +29,114 @@ void SIMachineFunctionInfo::anchor() {}
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
: AMDGPUMachineFunction(MF),
TIDReg(AMDGPU::NoRegister),
- HasSpilledVGPRs(false),
+ ScratchRSrcReg(AMDGPU::NoRegister),
+ ScratchWaveOffsetReg(AMDGPU::NoRegister),
+ PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister),
+ DispatchPtrUserSGPR(AMDGPU::NoRegister),
+ QueuePtrUserSGPR(AMDGPU::NoRegister),
+ KernargSegmentPtrUserSGPR(AMDGPU::NoRegister),
+ DispatchIDUserSGPR(AMDGPU::NoRegister),
+ FlatScratchInitUserSGPR(AMDGPU::NoRegister),
+ PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister),
+ GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister),
+ GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister),
+ GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister),
+ WorkGroupIDXSystemSGPR(AMDGPU::NoRegister),
+ WorkGroupIDYSystemSGPR(AMDGPU::NoRegister),
+ WorkGroupIDZSystemSGPR(AMDGPU::NoRegister),
+ WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),
+ PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
+ LDSWaveSpillSize(0),
PSInputAddr(0),
NumUserSGPRs(0),
- LDSWaveSpillSize(0) { }
+ NumSystemSGPRs(0),
+ HasSpilledSGPRs(false),
+ HasSpilledVGPRs(false),
+ PrivateSegmentBuffer(false),
+ DispatchPtr(false),
+ QueuePtr(false),
+ DispatchID(false),
+ KernargSegmentPtr(false),
+ FlatScratchInit(false),
+ GridWorkgroupCountX(false),
+ GridWorkgroupCountY(false),
+ GridWorkgroupCountZ(false),
+ WorkGroupIDX(true),
+ WorkGroupIDY(false),
+ WorkGroupIDZ(false),
+ WorkGroupInfo(false),
+ PrivateSegmentWaveByteOffset(false),
+ WorkItemIDX(true),
+ WorkItemIDY(false),
+ WorkItemIDZ(false) {
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+ const Function *F = MF.getFunction();
+
+ const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
+
+ if (getShaderType() == ShaderType::COMPUTE)
+ KernargSegmentPtr = true;
+
+ if (F->hasFnAttribute("amdgpu-work-group-id-y"))
+ WorkGroupIDY = true;
+
+ if (F->hasFnAttribute("amdgpu-work-group-id-z"))
+ WorkGroupIDZ = true;
+
+ if (F->hasFnAttribute("amdgpu-work-item-id-y"))
+ WorkItemIDY = true;
+
+ if (F->hasFnAttribute("amdgpu-work-item-id-z"))
+ WorkItemIDZ = true;
+
+ bool MaySpill = ST.isVGPRSpillingEnabled(this);
+ bool HasStackObjects = FrameInfo->hasStackObjects();
+
+ if (HasStackObjects || MaySpill)
+ PrivateSegmentWaveByteOffset = true;
+
+ if (ST.isAmdHsaOS()) {
+ if (HasStackObjects || MaySpill)
+ PrivateSegmentBuffer = true;
+
+ if (F->hasFnAttribute("amdgpu-dispatch-ptr"))
+ DispatchPtr = true;
+ }
+
+ // X, XY, and XYZ are the only supported combinations, so make sure Y is
+ // enabled if Z is.
+ if (WorkItemIDZ)
+ WorkItemIDY = true;
+}
+
+unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
+ const SIRegisterInfo &TRI) {
+ PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
+ NumUserSGPRs += 4;
+ return PrivateSegmentBufferUserSGPR;
+}
+
+unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
+ DispatchPtrUserSGPR = TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ NumUserSGPRs += 2;
+ return DispatchPtrUserSGPR;
+}
+
+unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
+ QueuePtrUserSGPR = TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ NumUserSGPRs += 2;
+ return QueuePtrUserSGPR;
+}
+
+unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
+ KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ NumUserSGPRs += 2;
+ return KernargSegmentPtrUserSGPR;
+}
SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
MachineFunction *MF,
@@ -53,7 +157,6 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
if (!LaneVGPRs.count(LaneVGPRIdx)) {
unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass);
LaneVGPRs[LaneVGPRIdx] = LaneVGPR;
- MRI.setPhysRegUsed(LaneVGPR);
// Add this register as live-in to all blocks to avoid machine verifier
// complaining about use of an undefined physical register.
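
The constructor and add*() helpers above hand out user SGPRs contiguously: getNextUserSGPR() returns SGPR0 + NumUserSGPRs and each helper advances the counter by the width of its input. A rough standalone sketch of that numbering, assuming a kernel for which all four user-SGPR helpers end up being called; the concrete SGPR assignments below are illustrative only, not taken from the patch.

// Standalone illustration of the user-SGPR layout implied by getNextUserSGPR();
// widths mirror the "NumUserSGPRs += N" increments in the helpers above.
#include <cstdio>

int main() {
  unsigned NumUserSGPRs = 0;
  struct Input { const char *Name; unsigned Width; } Inputs[] = {
      {"PrivateSegmentBuffer", 4}, // addPrivateSegmentBuffer: SReg_128
      {"DispatchPtr", 2},          // addDispatchPtr:          SReg_64
      {"QueuePtr", 2},             // addQueuePtr:             SReg_64
      {"KernargSegmentPtr", 2},    // addKernargSegmentPtr:    SReg_64
  };
  for (const Input &In : Inputs) {
    std::printf("%-20s -> SGPR%u .. SGPR%u\n", In.Name, NumUserSGPRs,
                NumUserSGPRs + In.Width - 1);
    NumUserSGPRs += In.Width;
  }
  std::printf("NumUserSGPRs = %u\n", NumUserSGPRs); // 10 in this example
  return 0;
}
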
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 667da4c8af61..9c528d63bd0e 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -26,13 +26,83 @@ class MachineRegisterInfo;
/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
/// tells the hardware which interpolation parameters to load.
class SIMachineFunctionInfo : public AMDGPUMachineFunction {
+ // FIXME: This should be removed and getPreloadedValue moved here.
+ friend struct SIRegisterInfo;
void anchor() override;
unsigned TIDReg;
- bool HasSpilledVGPRs;
+
+ // Registers that may be reserved for spilling purposes. These may be the same
+ // as the input registers.
+ unsigned ScratchRSrcReg;
+ unsigned ScratchWaveOffsetReg;
+
+ // Input registers setup for the HSA ABI.
+ // User SGPRs in allocation order.
+ unsigned PrivateSegmentBufferUserSGPR;
+ unsigned DispatchPtrUserSGPR;
+ unsigned QueuePtrUserSGPR;
+ unsigned KernargSegmentPtrUserSGPR;
+ unsigned DispatchIDUserSGPR;
+ unsigned FlatScratchInitUserSGPR;
+ unsigned PrivateSegmentSizeUserSGPR;
+ unsigned GridWorkGroupCountXUserSGPR;
+ unsigned GridWorkGroupCountYUserSGPR;
+ unsigned GridWorkGroupCountZUserSGPR;
+
+ // System SGPRs in allocation order.
+ unsigned WorkGroupIDXSystemSGPR;
+ unsigned WorkGroupIDYSystemSGPR;
+ unsigned WorkGroupIDZSystemSGPR;
+ unsigned WorkGroupInfoSystemSGPR;
+ unsigned PrivateSegmentWaveByteOffsetSystemSGPR;
public:
+ // FIXME: Make private
+ unsigned LDSWaveSpillSize;
+ unsigned PSInputAddr;
+ std::map<unsigned, unsigned> LaneVGPRs;
+ unsigned ScratchOffsetReg;
+ unsigned NumUserSGPRs;
+ unsigned NumSystemSGPRs;
+
+private:
+ bool HasSpilledSGPRs;
+ bool HasSpilledVGPRs;
+
+ // Feature bits required for inputs passed in user SGPRs.
+ bool PrivateSegmentBuffer : 1;
+ bool DispatchPtr : 1;
+ bool QueuePtr : 1;
+ bool DispatchID : 1;
+ bool KernargSegmentPtr : 1;
+ bool FlatScratchInit : 1;
+ bool GridWorkgroupCountX : 1;
+ bool GridWorkgroupCountY : 1;
+ bool GridWorkgroupCountZ : 1;
+
+ // Feature bits required for inputs passed in system SGPRs.
+ bool WorkGroupIDX : 1; // Always initialized.
+ bool WorkGroupIDY : 1;
+ bool WorkGroupIDZ : 1;
+ bool WorkGroupInfo : 1;
+ bool PrivateSegmentWaveByteOffset : 1;
+
+ bool WorkItemIDX : 1; // Always initialized.
+ bool WorkItemIDY : 1;
+ bool WorkItemIDZ : 1;
+
+ MCPhysReg getNextUserSGPR() const {
+ assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
+ return AMDGPU::SGPR0 + NumUserSGPRs;
+ }
+
+ MCPhysReg getNextSystemSGPR() const {
+ return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
+ }
+
+public:
struct SpilledReg {
unsigned VGPR;
int Lane;
@@ -46,16 +116,162 @@ public:
SIMachineFunctionInfo(const MachineFunction &MF);
SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex,
unsigned SubIdx);
- unsigned PSInputAddr;
- unsigned NumUserSGPRs;
- std::map<unsigned, unsigned> LaneVGPRs;
- unsigned LDSWaveSpillSize;
- unsigned ScratchOffsetReg;
bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; };
unsigned getTIDReg() const { return TIDReg; };
void setTIDReg(unsigned Reg) { TIDReg = Reg; }
- bool hasSpilledVGPRs() const { return HasSpilledVGPRs; }
- void setHasSpilledVGPRs(bool Spill = true) { HasSpilledVGPRs = Spill; }
+
+ // Add user SGPRs.
+ unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI);
+ unsigned addDispatchPtr(const SIRegisterInfo &TRI);
+ unsigned addQueuePtr(const SIRegisterInfo &TRI);
+ unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
+
+ // Add system SGPRs.
+ unsigned addWorkGroupIDX() {
+ WorkGroupIDXSystemSGPR = getNextSystemSGPR();
+ NumSystemSGPRs += 1;
+ return WorkGroupIDXSystemSGPR;
+ }
+
+ unsigned addWorkGroupIDY() {
+ WorkGroupIDYSystemSGPR = getNextSystemSGPR();
+ NumSystemSGPRs += 1;
+ return WorkGroupIDYSystemSGPR;
+ }
+
+ unsigned addWorkGroupIDZ() {
+ WorkGroupIDZSystemSGPR = getNextSystemSGPR();
+ NumSystemSGPRs += 1;
+ return WorkGroupIDZSystemSGPR;
+ }
+
+ unsigned addWorkGroupInfo() {
+ WorkGroupInfoSystemSGPR = getNextSystemSGPR();
+ NumSystemSGPRs += 1;
+ return WorkGroupInfoSystemSGPR;
+ }
+
+ unsigned addPrivateSegmentWaveByteOffset() {
+ PrivateSegmentWaveByteOffsetSystemSGPR = getNextSystemSGPR();
+ NumSystemSGPRs += 1;
+ return PrivateSegmentWaveByteOffsetSystemSGPR;
+ }
+
+ bool hasPrivateSegmentBuffer() const {
+ return PrivateSegmentBuffer;
+ }
+
+ bool hasDispatchPtr() const {
+ return DispatchPtr;
+ }
+
+ bool hasQueuePtr() const {
+ return QueuePtr;
+ }
+
+ bool hasDispatchID() const {
+ return DispatchID;
+ }
+
+ bool hasKernargSegmentPtr() const {
+ return KernargSegmentPtr;
+ }
+
+ bool hasFlatScratchInit() const {
+ return FlatScratchInit;
+ }
+
+ bool hasGridWorkgroupCountX() const {
+ return GridWorkgroupCountX;
+ }
+
+ bool hasGridWorkgroupCountY() const {
+ return GridWorkgroupCountY;
+ }
+
+ bool hasGridWorkgroupCountZ() const {
+ return GridWorkgroupCountZ;
+ }
+
+ bool hasWorkGroupIDX() const {
+ return WorkGroupIDX;
+ }
+
+ bool hasWorkGroupIDY() const {
+ return WorkGroupIDY;
+ }
+
+ bool hasWorkGroupIDZ() const {
+ return WorkGroupIDZ;
+ }
+
+ bool hasWorkGroupInfo() const {
+ return WorkGroupInfo;
+ }
+
+ bool hasPrivateSegmentWaveByteOffset() const {
+ return PrivateSegmentWaveByteOffset;
+ }
+
+ bool hasWorkItemIDX() const {
+ return WorkItemIDX;
+ }
+
+ bool hasWorkItemIDY() const {
+ return WorkItemIDY;
+ }
+
+ bool hasWorkItemIDZ() const {
+ return WorkItemIDZ;
+ }
+
+ unsigned getNumUserSGPRs() const {
+ return NumUserSGPRs;
+ }
+
+ unsigned getNumPreloadedSGPRs() const {
+ return NumUserSGPRs + NumSystemSGPRs;
+ }
+
+ unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const {
+ return PrivateSegmentWaveByteOffsetSystemSGPR;
+ }
+
+ /// \brief Returns the physical register reserved for use as the resource
+ /// descriptor for scratch accesses.
+ unsigned getScratchRSrcReg() const {
+ return ScratchRSrcReg;
+ }
+
+ void setScratchRSrcReg(unsigned Reg) {
+ assert(Reg != AMDGPU::NoRegister && "Should never be unset");
+ ScratchRSrcReg = Reg;
+ }
+
+ unsigned getScratchWaveOffsetReg() const {
+ return ScratchWaveOffsetReg;
+ }
+
+ void setScratchWaveOffsetReg(unsigned Reg) {
+ assert(Reg != AMDGPU::NoRegister && "Should never be unset");
+ ScratchWaveOffsetReg = Reg;
+ }
+
+ bool hasSpilledSGPRs() const {
+ return HasSpilledSGPRs;
+ }
+
+ void setHasSpilledSGPRs(bool Spill = true) {
+ HasSpilledSGPRs = Spill;
+ }
+
+ bool hasSpilledVGPRs() const {
+ return HasSpilledVGPRs;
+ }
+
+ void setHasSpilledVGPRs(bool Spill = true) {
+ HasSpilledVGPRs = Spill;
+ }
unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const;
};
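
The assert in getNextUserSGPR() encodes an ordering rule: every user SGPR has to be added before the first system SGPR, because system SGPRs are laid out directly after the user ones. A hedged sketch of how a caller is expected to drive this interface; only the SIMachineFunctionInfo methods come from the header above, while Info and TRI stand in for whatever the calling pass already has in scope.

// Sketch only: user SGPRs first, then system SGPRs, as required by the
// assert in getNextUserSGPR().
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
if (Info->hasPrivateSegmentBuffer())
  Info->addPrivateSegmentBuffer(*TRI);       // 4 user SGPRs
if (Info->hasDispatchPtr())
  Info->addDispatchPtr(*TRI);                // 2 user SGPRs
if (Info->hasKernargSegmentPtr())
  Info->addKernargSegmentPtr(*TRI);          // 2 user SGPRs
// Only after the last user SGPR may system SGPRs be appended.
if (Info->hasWorkGroupIDX())
  Info->addWorkGroupIDX();                   // 1 system SGPR
if (Info->hasPrivateSegmentWaveByteOffset())
  Info->addPrivateSegmentWaveByteOffset();   // 1 system SGPR
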
diff --git a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp b/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
deleted file mode 100644
index 2cd600df2268..000000000000
--- a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
+++ /dev/null
@@ -1,193 +0,0 @@
-//===-- SIPrepareScratchRegs.cpp - Use predicates for control flow --------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-///
-/// This pass loads scratch pointer and scratch offset into a register or a
-/// frame index which can be used anywhere in the program. These values will
-/// be used for spilling VGPRs.
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIDefines.h"
-#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
-
-using namespace llvm;
-
-namespace {
-
-class SIPrepareScratchRegs : public MachineFunctionPass {
-
-private:
- static char ID;
-
-public:
- SIPrepareScratchRegs() : MachineFunctionPass(ID) { }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- const char *getPassName() const override {
- return "SI prepare scratch registers";
- }
-
-};
-
-} // End anonymous namespace
-
-char SIPrepareScratchRegs::ID = 0;
-
-FunctionPass *llvm::createSIPrepareScratchRegs() {
- return new SIPrepareScratchRegs();
-}
-
-bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {
- SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
- const SIRegisterInfo *TRI = &TII->getRegisterInfo();
- MachineRegisterInfo &MRI = MF.getRegInfo();
- MachineFrameInfo *FrameInfo = MF.getFrameInfo();
- MachineBasicBlock *Entry = MF.begin();
- MachineBasicBlock::iterator I = Entry->begin();
- DebugLoc DL = I->getDebugLoc();
-
- // FIXME: If we don't have enough VGPRs for SGPR spilling we will need to
- // run this pass.
- if (!MFI->hasSpilledVGPRs())
- return false;
-
- unsigned ScratchPtrPreloadReg =
- TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
- unsigned ScratchOffsetPreloadReg =
- TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
-
- if (!Entry->isLiveIn(ScratchPtrPreloadReg))
- Entry->addLiveIn(ScratchPtrPreloadReg);
-
- if (!Entry->isLiveIn(ScratchOffsetPreloadReg))
- Entry->addLiveIn(ScratchOffsetPreloadReg);
-
- // Load the scratch offset.
- unsigned ScratchOffsetReg =
- TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass);
- int ScratchOffsetFI = -1;
-
- if (ScratchOffsetReg != AMDGPU::NoRegister) {
- // Found an SGPR to use
- MRI.setPhysRegUsed(ScratchOffsetReg);
- BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg)
- .addReg(ScratchOffsetPreloadReg);
- } else {
- // No SGPR is available, we must spill.
- ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4,4);
- BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE))
- .addReg(ScratchOffsetPreloadReg)
- .addFrameIndex(ScratchOffsetFI)
- .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
- .addReg(AMDGPU::SGPR0, RegState::Undef);
- }
-
-
- // Now that we have the scratch pointer and offset values, we need to
- // add them to all the SI_SPILL_V* instructions.
-
- RegScavenger RS;
- unsigned ScratchRsrcFI = FrameInfo->CreateSpillStackObject(16, 4);
- RS.addScavengingFrameIndex(ScratchRsrcFI);
-
- for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
- BI != BE; ++BI) {
-
- MachineBasicBlock &MBB = *BI;
- // Add the scratch offset reg as a live-in so that the register scavenger
- // doesn't re-use it.
- if (!MBB.isLiveIn(ScratchOffsetReg) &&
- ScratchOffsetReg != AMDGPU::NoRegister)
- MBB.addLiveIn(ScratchOffsetReg);
- RS.enterBasicBlock(&MBB);
-
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
- I != E; ++I) {
- MachineInstr &MI = *I;
- RS.forward(I);
- DebugLoc DL = MI.getDebugLoc();
- if (!TII->isVGPRSpill(MI.getOpcode()))
- continue;
-
- // Scratch resource
- unsigned ScratchRsrcReg =
- RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0);
-
- uint64_t Rsrc23 = TII->getScratchRsrcWords23();
-
- unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
- unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
- unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
- unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
-
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0)
- .addExternalSymbol("SCRATCH_RSRC_DWORD0")
- .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1)
- .addExternalSymbol("SCRATCH_RSRC_DWORD1")
- .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2)
- .addImm(Rsrc23 & 0xffffffff)
- .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3)
- .addImm(Rsrc23 >> 32)
- .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-
- // Scratch Offset
- if (ScratchOffsetReg == AMDGPU::NoRegister) {
- ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
- BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE),
- ScratchOffsetReg)
- .addFrameIndex(ScratchOffsetFI)
- .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
- .addReg(AMDGPU::SGPR0, RegState::Undef);
- } else if (!MBB.isLiveIn(ScratchOffsetReg)) {
- MBB.addLiveIn(ScratchOffsetReg);
- }
-
- if (ScratchRsrcReg == AMDGPU::NoRegister ||
- ScratchOffsetReg == AMDGPU::NoRegister) {
- LLVMContext &Ctx = MF.getFunction()->getContext();
- Ctx.emitError("ran out of SGPRs for spilling VGPRs");
- ScratchRsrcReg = AMDGPU::SGPR0;
- ScratchOffsetReg = AMDGPU::SGPR0;
- }
- MI.getOperand(2).setReg(ScratchRsrcReg);
- MI.getOperand(2).setIsKill(true);
- MI.getOperand(2).setIsUndef(false);
- MI.getOperand(3).setReg(ScratchOffsetReg);
- MI.getOperand(3).setIsUndef(false);
- MI.getOperand(3).setIsKill(false);
- MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true));
- MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true));
- MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true));
- MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true));
- }
- }
- return true;
-}
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index e9e8412e263d..3cdffef05583 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -12,7 +12,6 @@
//
//===----------------------------------------------------------------------===//
-
#include "SIRegisterInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
@@ -33,6 +32,40 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co
Reserved.set(*R);
}
+unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
+ const MachineFunction &MF) const {
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+ if (ST.hasSGPRInitBug()) {
+ unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 4;
+ unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
+ return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
+ }
+
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ // 98/99 need to be reserved for flat_scr, and 100/101 for vcc. This is the
+ // next sgpr128 down.
+ return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95;
+ }
+
+ return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99;
+}
+
+unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
+ const MachineFunction &MF) const {
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+ if (ST.hasSGPRInitBug()) {
+ unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 5;
+ return AMDGPU::SGPR_32RegClass.getRegister(Idx);
+ }
+
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ // Next register before reservations for flat_scr and vcc.
+ return AMDGPU::SGPR97;
+ }
+
+ return AMDGPU::SGPR95;
+}
+
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
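
On subtargets with the SGPR init bug, the two reserved*Reg helpers above pin the spill resource registers just below the fixed SGPR limit. Assuming AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG is 80 (its value in the subtarget at this revision; the constant itself is not shown in this patch), the indices work out as in this short sketch.

// Illustrative arithmetic only; the value 80 is an assumption about
// FIXED_SGPR_COUNT_FOR_INIT_BUG and does not come from this patch.
const unsigned FixedCount = 80;
unsigned BufferBaseIdx = FixedCount - 4 - 4; // 72 -> SGPR72_SGPR73_SGPR74_SGPR75
unsigned WaveOffsetIdx = FixedCount - 4 - 5; // 71 -> SGPR71
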
@@ -42,13 +75,22 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::EXEC);
reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
- // Reserve some VGPRs to use as temp registers in case we have to spill VGPRs
- reserveRegisterTuples(Reserved, AMDGPU::VGPR254);
- reserveRegisterTuples(Reserved, AMDGPU::VGPR255);
+ // Reserve the last 2 registers so we will always have at least 2 more that
+ // will physically contain VCC.
+ reserveRegisterTuples(Reserved, AMDGPU::SGPR102_SGPR103);
+
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ // SI/CI have 104 SGPRs. VI has 102. We need to shift down the reservation
+ // for VCC/FLAT_SCR.
+ reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99);
+ reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101);
+ }
// Tonga and Iceland can only allocate a fixed number of SGPRs due
// to a hw bug.
- if (MF.getSubtarget<AMDGPUSubtarget>().hasSGPRInitBug()) {
+ if (ST.hasSGPRInitBug()) {
unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
// Reserve some SGPRs for FLAT_SCRATCH and VCC (4 SGPRs).
// Assume XNACK_MASK is unused.
@@ -60,34 +102,57 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
}
}
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
+ if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
+ // Reserve 1 SGPR for scratch wave offset in case we need to spill.
+ reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);
+ }
+
+ unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
+ if (ScratchRSrcReg != AMDGPU::NoRegister) {
+ // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
+ // to spill.
+ // TODO: May need to reserve a VGPR if doing LDS spilling.
+ reserveRegisterTuples(Reserved, ScratchRSrcReg);
+ assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
+ }
+
return Reserved;
}
unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
unsigned Idx) const {
-
const AMDGPUSubtarget &STI = MF.getSubtarget<AMDGPUSubtarget>();
// FIXME: We should adjust the max number of waves based on LDS size.
unsigned SGPRLimit = getNumSGPRsAllowed(STI.getGeneration(),
STI.getMaxWavesPerCU());
unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU());
+ unsigned VSLimit = SGPRLimit + VGPRLimit;
+
for (regclass_iterator I = regclass_begin(), E = regclass_end();
I != E; ++I) {
+ const TargetRegisterClass *RC = *I;
- unsigned NumSubRegs = std::max((int)(*I)->getSize() / 4, 1);
+ unsigned NumSubRegs = std::max((int)RC->getSize() / 4, 1);
unsigned Limit;
- if (isSGPRClass(*I)) {
+ if (isPseudoRegClass(RC)) {
+ // FIXME: This is a hack. We should never be considering the pressure of
+ // these since no virtual register should ever have this class.
+ Limit = VSLimit;
+ } else if (isSGPRClass(RC)) {
Limit = SGPRLimit / NumSubRegs;
} else {
Limit = VGPRLimit / NumSubRegs;
}
- const int *Sets = getRegClassPressureSets(*I);
+ const int *Sets = getRegClassPressureSets(RC);
assert(Sets);
for (unsigned i = 0; Sets[i] != -1; ++i) {
- if (Sets[i] == (int)Idx)
+ if (Sets[i] == (int)Idx)
return Limit;
}
}
@@ -174,17 +239,17 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
unsigned SubReg = NumSubRegs > 1 ?
getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) :
Value;
- bool IsKill = (i == e - 1);
BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
- .addReg(SubReg, getDefRegState(IsLoad))
- .addReg(ScratchRsrcReg, getKillRegState(IsKill))
- .addReg(SOffset)
- .addImm(Offset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // tfe
- .addReg(Value, RegState::Implicit | getDefRegState(IsLoad));
+ .addReg(SubReg, getDefRegState(IsLoad))
+ .addReg(ScratchRsrcReg)
+ .addReg(SOffset)
+ .addImm(Offset)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addReg(Value, RegState::Implicit | getDefRegState(IsLoad))
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
}
}
@@ -228,6 +293,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
.addReg(SubReg)
.addImm(Spill.Lane);
+ // FIXME: Since this spills to another register instead of an actual
+ // frame index, we should delete the frame index when all references to
+ // it are fixed.
}
MI->eraseFromParent();
break;
@@ -263,16 +331,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// TODO: only do this when it is needed
switch (MF->getSubtarget<AMDGPUSubtarget>().getGeneration()) {
case AMDGPUSubtarget::SOUTHERN_ISLANDS:
- // "VALU writes SGPR" -> "SMRD reads that SGPR" needs "S_NOP 3" on SI
- TII->insertNOPs(MI, 3);
+ // "VALU writes SGPR" -> "SMRD reads that SGPR" needs 4 wait states
+ // ("S_NOP 3") on SI
+ TII->insertWaitStates(MI, 4);
break;
case AMDGPUSubtarget::SEA_ISLANDS:
break;
default: // VOLCANIC_ISLANDS and later
- // "VALU writes SGPR -> VMEM reads that SGPR" needs "S_NOP 4" on VI
- // and later. This also applies to VALUs which write VCC, but we're
- // unlikely to see VMEM use VCC.
- TII->insertNOPs(MI, 4);
+ // "VALU writes SGPR -> VMEM reads that SGPR" needs 5 wait states
+ // ("S_NOP 4") on VI and later. This also applies to VALUs which write
+ // VCC, but we're unlikely to see VMEM use VCC.
+ TII->insertWaitStates(MI, 5);
}
MI->eraseFromParent();
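
The reworded comments above rely on the convention that "N wait states" corresponds to an s_nop with immediate N-1, since s_nop K idles for K+1 cycles. A minimal sketch of what an insertWaitStates helper could look like under that convention; this is an illustration, not the actual SIInstrInfo implementation, and it assumes the S_NOP immediate is capped at 7 so larger counts have to be split.

// Sketch: emit enough S_NOPs ahead of MI to cover Count wait states,
// assuming S_NOP K waits K + 1 cycles with K limited to 0..7.
static void insertWaitStatesSketch(const SIInstrInfo &TII,
                                   MachineBasicBlock::iterator MI, int Count) {
  while (Count > 0) {
    int Arg = Count > 8 ? 7 : Count - 1;   // immediate = wait states - 1
    Count -= Arg + 1;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
        .addImm(Arg);
  }
}
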
@@ -322,22 +391,16 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
}
-const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass(
- MVT VT) const {
- switch(VT.SimpleTy) {
- default:
- case MVT::i32: return &AMDGPU::VGPR_32RegClass;
- }
-}
-
unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const {
return getEncodingValue(Reg) & 0xff;
}
+// FIXME: This is very slow. It might be worth creating a map from physreg to
+// register class.
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
assert(!TargetRegisterInfo::isVirtualRegister(Reg));
- static const TargetRegisterClass *BaseClasses[] = {
+ static const TargetRegisterClass *const BaseClasses[] = {
&AMDGPU::VGPR_32RegClass,
&AMDGPU::SReg_32RegClass,
&AMDGPU::VReg_64RegClass,
@@ -359,33 +422,45 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
return nullptr;
}
+// TODO: It might be helpful to have some target specific flags in
+// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
- return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) ||
- getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) ||
- getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) ||
- getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) ||
- getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) ||
- getCommonSubClass(&AMDGPU::VReg_512RegClass, RC);
+ switch (RC->getSize()) {
+ case 4:
+ return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
+ case 8:
+ return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr;
+ case 12:
+ return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr;
+ case 16:
+ return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr;
+ case 32:
+ return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr;
+ case 64:
+ return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
+ default:
+ llvm_unreachable("Invalid register class size");
+ }
}
const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
const TargetRegisterClass *SRC) const {
- if (hasVGPRs(SRC)) {
- return SRC;
- } else if (SRC == &AMDGPU::SCCRegRegClass) {
- return &AMDGPU::VCCRegRegClass;
- } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_32RegClass)) {
- return &AMDGPU::VGPR_32RegClass;
- } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_64RegClass)) {
- return &AMDGPU::VReg_64RegClass;
- } else if (getCommonSubClass(SRC, &AMDGPU::SReg_128RegClass)) {
- return &AMDGPU::VReg_128RegClass;
- } else if (getCommonSubClass(SRC, &AMDGPU::SReg_256RegClass)) {
- return &AMDGPU::VReg_256RegClass;
- } else if (getCommonSubClass(SRC, &AMDGPU::SReg_512RegClass)) {
- return &AMDGPU::VReg_512RegClass;
- }
- return nullptr;
+ switch (SRC->getSize()) {
+ case 4:
+ return &AMDGPU::VGPR_32RegClass;
+ case 8:
+ return &AMDGPU::VReg_64RegClass;
+ case 12:
+ return &AMDGPU::VReg_96RegClass;
+ case 16:
+ return &AMDGPU::VReg_128RegClass;
+ case 32:
+ return &AMDGPU::VReg_256RegClass;
+ case 64:
+ return &AMDGPU::VReg_512RegClass;
+ default:
+ llvm_unreachable("Invalid register class size");
+ }
}
const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
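
The FIXME ahead of getPhysRegClass() notes that the linear scan over base classes is slow and suggests a physreg-to-class map. One possible shape for such a cache, sketched as a free function so nothing in the patch itself has to change; the DenseMap cache and the helper name are hypothetical.

// Sketch only: memoize getPhysRegClass() results in a caller-owned cache.
// Needs llvm/ADT/DenseMap.h; none of this exists in the patch itself.
static const TargetRegisterClass *
getPhysRegClassCached(const SIRegisterInfo &TRI,
                      DenseMap<unsigned, const TargetRegisterClass *> &Cache,
                      unsigned Reg) {
  auto It = Cache.find(Reg);
  if (It != Cache.end())
    return It->second;
  const TargetRegisterClass *RC = TRI.getPhysRegClass(Reg); // slow path above
  Cache[Reg] = RC;
  return RC;
}
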
@@ -402,6 +477,30 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
}
}
+bool SIRegisterInfo::shouldRewriteCopySrc(
+ const TargetRegisterClass *DefRC,
+ unsigned DefSubReg,
+ const TargetRegisterClass *SrcRC,
+ unsigned SrcSubReg) const {
+ // We want to prefer the smallest register class possible, so we don't want to
+ // stop and rewrite on anything that looks like a subregister
+ // extract. Operations mostly don't care about the super register class, so we
+  // only want to stop on the most basic of copies between the same register
+ // class.
+ //
+ // e.g. if we have something like
+ // vreg0 = ...
+ // vreg1 = ...
+ // vreg2 = REG_SEQUENCE vreg0, sub0, vreg1, sub1, vreg2, sub2
+ // vreg3 = COPY vreg2, sub0
+ //
+ // We want to look through the COPY to find:
+ // => vreg3 = COPY vreg0
+
+ // Plain copy.
+ return getCommonSubClass(DefRC, SrcRC) != nullptr;
+}
+
unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
const TargetRegisterClass *SubRC,
unsigned Channel) const {
@@ -462,30 +561,47 @@ bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
return OpType == AMDGPU::OPERAND_REG_INLINE_C;
}
+// FIXME: Most of these are flexible with HSA and we don't need to reserve them
+// as input registers if unused. Whether the dispatch ptr is necessary should be
+// easy to detect from used intrinsics. Scratch setup is harder to know.
unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
enum PreloadedValue Value) const {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+ (void)ST;
switch (Value) {
- case SIRegisterInfo::TGID_X:
- return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0);
- case SIRegisterInfo::TGID_Y:
- return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1);
- case SIRegisterInfo::TGID_Z:
- return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2);
- case SIRegisterInfo::SCRATCH_WAVE_OFFSET:
- if (MFI->getShaderType() != ShaderType::COMPUTE)
- return MFI->ScratchOffsetReg;
- return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4);
- case SIRegisterInfo::SCRATCH_PTR:
- return AMDGPU::SGPR2_SGPR3;
- case SIRegisterInfo::INPUT_PTR:
- return AMDGPU::SGPR0_SGPR1;
- case SIRegisterInfo::TIDIG_X:
+ case SIRegisterInfo::WORKGROUP_ID_X:
+ assert(MFI->hasWorkGroupIDX());
+ return MFI->WorkGroupIDXSystemSGPR;
+ case SIRegisterInfo::WORKGROUP_ID_Y:
+ assert(MFI->hasWorkGroupIDY());
+ return MFI->WorkGroupIDYSystemSGPR;
+ case SIRegisterInfo::WORKGROUP_ID_Z:
+ assert(MFI->hasWorkGroupIDZ());
+ return MFI->WorkGroupIDZSystemSGPR;
+ case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
+ return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
+ case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
+ assert(ST.isAmdHsaOS() && "Non-HSA ABI currently uses relocations");
+ assert(MFI->hasPrivateSegmentBuffer());
+ return MFI->PrivateSegmentBufferUserSGPR;
+ case SIRegisterInfo::KERNARG_SEGMENT_PTR:
+ assert(MFI->hasKernargSegmentPtr());
+ return MFI->KernargSegmentPtrUserSGPR;
+ case SIRegisterInfo::DISPATCH_PTR:
+ assert(MFI->hasDispatchPtr());
+ return MFI->DispatchPtrUserSGPR;
+ case SIRegisterInfo::QUEUE_PTR:
+ llvm_unreachable("not implemented");
+ case SIRegisterInfo::WORKITEM_ID_X:
+ assert(MFI->hasWorkItemIDX());
return AMDGPU::VGPR0;
- case SIRegisterInfo::TIDIG_Y:
+ case SIRegisterInfo::WORKITEM_ID_Y:
+ assert(MFI->hasWorkItemIDY());
return AMDGPU::VGPR1;
- case SIRegisterInfo::TIDIG_Z:
+ case SIRegisterInfo::WORKITEM_ID_Z:
+ assert(MFI->hasWorkItemIDZ());
return AMDGPU::VGPR2;
}
llvm_unreachable("unexpected preloaded value type");
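
Callers now name preloaded inputs by their ABI role instead of the old TGID_*/TIDIG_* spellings. A small hedged example of fetching the work-group ID X register and marking it live into a block; MF, MBB and TRI are assumed to already be in scope at the call site.

// Sketch: query a preloaded system SGPR with the renamed enumerators.
unsigned IDXReg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X);
if (!MBB.isLiveIn(IDXReg))
  MBB.addLiveIn(IDXReg);
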
@@ -496,12 +612,9 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
// AMDGPU::NoRegister.
unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
const TargetRegisterClass *RC) const {
-
- for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
- I != E; ++I) {
- if (!MRI.isPhysRegUsed(*I))
- return *I;
- }
+ for (unsigned Reg : *RC)
+ if (!MRI.isPhysRegUsed(Reg))
+ return Reg;
return AMDGPU::NoRegister;
}
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index 7da6de282c11..1795237c2140 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -18,6 +18,7 @@
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
namespace llvm {
@@ -29,6 +30,15 @@ private:
public:
SIRegisterInfo();
+ /// Return the end register initially reserved for the scratch buffer in case
+ /// spilling is needed.
+ unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
+
+ /// Return the end register initially reserved for the scratch wave offset in
+ /// case spilling is needed.
+ unsigned reservedPrivateSegmentWaveByteOffsetReg(
+ const MachineFunction &MF) const;
+
BitVector getReservedRegs(const MachineFunction &MF) const override;
unsigned getRegPressureSetLimit(const MachineFunction &MF,
@@ -40,10 +50,6 @@ public:
unsigned FIOperandNum,
RegScavenger *RS) const override;
- /// \brief get the register class of the specified type to use in the
- /// CFGStructurizer
- const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override;
-
unsigned getHWRegIndex(unsigned Reg) const override;
/// \brief Return the 'base' register class for this register.
@@ -52,23 +58,30 @@ public:
/// \returns true if this class contains only SGPR registers
bool isSGPRClass(const TargetRegisterClass *RC) const {
- if (!RC)
- return false;
-
return !hasVGPRs(RC);
}
/// \returns true if this class ID contains only SGPR registers
bool isSGPRClassID(unsigned RCID) const {
- if (static_cast<int>(RCID) == -1)
- return false;
-
return isSGPRClass(getRegClass(RCID));
}
+ bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const {
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ return isSGPRClass(MRI.getRegClass(Reg));
+ return getPhysRegClass(Reg);
+ }
+
/// \returns true if this class contains VGPR registers.
bool hasVGPRs(const TargetRegisterClass *RC) const;
+ /// returns true if this is a pseudoregister class combination of VGPRs and
+ /// SGPRs for operand modeling. FIXME: We should set isAllocatable = 0 on
+ /// them.
+ static bool isPseudoRegClass(const TargetRegisterClass *RC) {
+ return RC == &AMDGPU::VS_32RegClass || RC == &AMDGPU::VS_64RegClass;
+ }
+
/// \returns A VGPR reg class with the same width as \p SRC
const TargetRegisterClass *getEquivalentVGPRClass(
const TargetRegisterClass *SRC) const;
@@ -79,6 +92,11 @@ public:
const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC,
unsigned SubIdx) const;
+ bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+ unsigned DefSubReg,
+ const TargetRegisterClass *SrcRC,
+ unsigned SrcSubReg) const override;
+
/// \p Channel This is the register channel (e.g. a value from 0-16), not the
/// SubReg index.
/// \returns The sub-register of Reg that is in Channel.
@@ -91,19 +109,25 @@ public:
/// \returns True if operands defined with this operand type can accept
/// an inline constant. i.e. An integer value in the range (-16, 64) or
- /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f.
+ /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f.
bool opCanUseInlineConstant(unsigned OpType) const;
enum PreloadedValue {
- TGID_X,
- TGID_Y,
- TGID_Z,
- SCRATCH_WAVE_OFFSET,
- SCRATCH_PTR,
- INPUT_PTR,
- TIDIG_X,
- TIDIG_Y,
- TIDIG_Z
+ // SGPRS:
+ PRIVATE_SEGMENT_BUFFER = 0,
+ DISPATCH_PTR = 1,
+ QUEUE_PTR = 2,
+ KERNARG_SEGMENT_PTR = 3,
+ WORKGROUP_ID_X = 10,
+ WORKGROUP_ID_Y = 11,
+ WORKGROUP_ID_Z = 12,
+ PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
+
+ // VGPRS:
+ FIRST_VGPR_VALUE = 15,
+ WORKITEM_ID_X = FIRST_VGPR_VALUE,
+ WORKITEM_ID_Y = 16,
+ WORKITEM_ID_Z = 17
};
/// \brief Returns the physical register that \p Value is stored in.
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td
index 2a9017fa2a98..bfaf93709d8c 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -10,10 +10,13 @@
//===----------------------------------------------------------------------===//
// Declarations that describe the SI registers
//===----------------------------------------------------------------------===//
-
-class SIReg <string n, bits<16> encoding = 0> : Register<n> {
+class SIReg <string n, bits<16> regIdx = 0> : Register<n>,
+ DwarfRegNum<[!cast<int>(HWEncoding)]> {
let Namespace = "AMDGPU";
- let HWEncoding = encoding;
+
+  // This is not yet the complete register encoding. An additional
+ // bit is set for VGPRs.
+ let HWEncoding = regIdx;
}
// Special Registers
@@ -21,7 +24,8 @@ def VCC_LO : SIReg<"vcc_lo", 106>;
def VCC_HI : SIReg<"vcc_hi", 107>;
// VCC for 64-bit instructions
-def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> {
+def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>,
+ DwarfRegAlias<VCC_LO> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
let HWEncoding = 106;
@@ -30,7 +34,8 @@ def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> {
def EXEC_LO : SIReg<"exec_lo", 126>;
def EXEC_HI : SIReg<"exec_hi", 127>;
-def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> {
+def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]>,
+ DwarfRegAlias<EXEC_LO> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
let HWEncoding = 126;
@@ -39,18 +44,29 @@ def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> {
def SCC : SIReg<"scc", 253>;
def M0 : SIReg <"m0", 124>;
-def FLAT_SCR_LO : SIReg<"flat_scr_lo", 104>; // Offset in units of 256-bytes.
-def FLAT_SCR_HI : SIReg<"flat_scr_hi", 105>; // Size is the per-thread scratch size, in bytes.
+multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> {
+ def _ci : SIReg<n, ci_e>;
+ def _vi : SIReg<n, vi_e>;
+ def "" : SIReg<"", 0>;
+}
-// Pair to indicate location of scratch space for flat accesses.
-def FLAT_SCR : RegisterWithSubRegs <"flat_scr", [FLAT_SCR_LO, FLAT_SCR_HI]> {
+class FlatReg <Register lo, Register hi, bits<16> encoding> :
+ RegisterWithSubRegs<"flat_scratch", [lo, hi]>,
+ DwarfRegAlias<lo> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
- let HWEncoding = 104;
+ let HWEncoding = encoding;
}
+defm FLAT_SCR_LO : FLAT_SCR_LOHI_m<"flat_scratch_lo", 104, 102>; // Offset in units of 256-bytes.
+defm FLAT_SCR_HI : FLAT_SCR_LOHI_m<"flat_scratch_hi", 105, 103>; // Size is the per-thread scratch size, in bytes.
+
+def FLAT_SCR_ci : FlatReg<FLAT_SCR_LO_ci, FLAT_SCR_HI_ci, 104>;
+def FLAT_SCR_vi : FlatReg<FLAT_SCR_LO_vi, FLAT_SCR_HI_vi, 102>;
+def FLAT_SCR : FlatReg<FLAT_SCR_LO, FLAT_SCR_HI, 0>;
+
// SGPR registers
-foreach Index = 0-101 in {
+foreach Index = 0-103 in {
def SGPR#Index : SIReg <"SGPR"#Index, Index>;
}
@@ -65,25 +81,27 @@ foreach Index = 0-255 in {
// Groupings using register classes and tuples
//===----------------------------------------------------------------------===//
+// TODO: Do we need to set DwarfRegAlias on register tuples?
+
// SGPR 32-bit registers
def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
- (add (sequence "SGPR%u", 0, 101))>;
+ (add (sequence "SGPR%u", 0, 103))>;
// SGPR 64-bit registers
def SGPR_64Regs : RegisterTuples<[sub0, sub1],
- [(add (decimate (trunc SGPR_32, 101), 2)),
+ [(add (decimate SGPR_32, 2)),
(add (decimate (shl SGPR_32, 1), 2))]>;
// SGPR 128-bit registers
def SGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3],
- [(add (decimate (trunc SGPR_32, 99), 4)),
+ [(add (decimate SGPR_32, 4)),
(add (decimate (shl SGPR_32, 1), 4)),
(add (decimate (shl SGPR_32, 2), 4)),
(add (decimate (shl SGPR_32, 3), 4))]>;
// SGPR 256-bit registers
def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
- [(add (decimate (trunc SGPR_32, 95), 4)),
+ [(add (decimate SGPR_32, 4)),
(add (decimate (shl SGPR_32, 1), 4)),
(add (decimate (shl SGPR_32, 2), 4)),
(add (decimate (shl SGPR_32, 3), 4)),
@@ -95,7 +113,7 @@ def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
// SGPR 512-bit registers
def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15],
- [(add (decimate (trunc SGPR_32, 87), 4)),
+ [(add (decimate SGPR_32, 4)),
(add (decimate (shl SGPR_32, 1), 4)),
(add (decimate (shl SGPR_32, 2), 4)),
(add (decimate (shl SGPR_32, 3), 4)),
@@ -174,44 +192,57 @@ class RegImmMatcher<string name> : AsmOperandClass {
let RenderMethod = "addRegOrImmOperands";
}
-// Special register classes for predicates and the M0 register
-def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)> {
- let CopyCost = -1; // Theoretically it is possible to read from SCC,
- // but it should never be necessary.
-}
-
-def VCCReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add VCC)>;
-def EXECReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add EXEC)>;
-
// Register class for all scalar registers (SGPRs + Special Registers)
def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
(add SGPR_32, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)
>;
-def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 64, (add SGPR_64Regs)>;
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)>;
-def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 64,
- (add SGPR_64, VCCReg, EXECReg, FLAT_SCR)
+def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
+ (add SGPR_64, VCC, EXEC, FLAT_SCR)
>;
-def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)>;
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128)> {
+ // Requires 2 s_mov_b64 to copy
+ let CopyCost = 2;
+}
-def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>;
+def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 32, (add SGPR_256)> {
+ // Requires 4 s_mov_b64 to copy
+ let CopyCost = 4;
+}
-def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 512, (add SGPR_512)>;
+def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> {
+ // Requires 8 s_mov_b64 to copy
+ let CopyCost = 8;
+}
// Register class for all vector registers (VGPRs + Interpolation Registers)
-def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>;
+def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> {
+ // Requires 2 v_mov_b32 to copy
+ let CopyCost = 2;
+}
-def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> {
+def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> {
let Size = 96;
+
+ // Requires 3 v_mov_b32 to copy
+ let CopyCost = 3;
}
-def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>;
+def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> {
+ // Requires 4 v_mov_b32 to copy
+ let CopyCost = 4;
+}
-def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256)>;
+def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 32, (add VGPR_256)> {
+ let CopyCost = 8;
+}
-def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>;
+def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> {
+ let CopyCost = 16;
+}
def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
let Size = 32;
@@ -253,7 +284,9 @@ def SCSrc_32 : RegInlineOperand<SReg_32> {
def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)>;
-def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>;
+def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> {
+ let CopyCost = 2;
+}
def VSrc_32 : RegisterOperand<VS_32> {
let OperandNamespace = "AMDGPU";
@@ -282,3 +315,13 @@ def VCSrc_64 : RegisterOperand<VS_64> {
let OperandType = "OPERAND_REG_INLINE_C";
let ParserMatchClass = RegImmMatcher<"VCSrc64">;
}
+
+//===----------------------------------------------------------------------===//
+// SCSrc_* Operands with an SGPR or an inline constant
+//===----------------------------------------------------------------------===//
+
+def SCSrc_64 : RegisterOperand<SReg_64> {
+ let OperandNamespace = "AMDGPU";
+ let OperandType = "OPERAND_REG_INLINE_C";
+ let ParserMatchClass = RegImmMatcher<"SCSrc64">;
+}
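
The CopyCost values set above follow directly from how the tuples are copied: SGPR tuples move 64 bits per s_mov_b64 and VGPR tuples 32 bits per v_mov_b32. A small standalone check of that arithmetic, for illustration only.

// CopyCost == number of machine moves needed to copy the whole tuple.
#include <cassert>

int main() {
  auto SGPRCopyCost = [](int Bits) { return Bits / 64; }; // copied with s_mov_b64
  auto VGPRCopyCost = [](int Bits) { return Bits / 32; }; // copied with v_mov_b32
  assert(SGPRCopyCost(128) == 2 && SGPRCopyCost(256) == 4 && SGPRCopyCost(512) == 8);
  assert(VGPRCopyCost(64) == 2 && VGPRCopyCost(96) == 3 && VGPRCopyCost(128) == 4);
  assert(VGPRCopyCost(256) == 8 && VGPRCopyCost(512) == 16);
  return 0;
}
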
diff --git a/lib/Target/AMDGPU/SISchedule.td b/lib/Target/AMDGPU/SISchedule.td
index 9b1f676020bf..cd77e519abb2 100644
--- a/lib/Target/AMDGPU/SISchedule.td
+++ b/lib/Target/AMDGPU/SISchedule.td
@@ -17,16 +17,28 @@ def WriteLDS : SchedWrite;
def WriteSALU : SchedWrite;
def WriteSMEM : SchedWrite;
def WriteVMEM : SchedWrite;
+def WriteBarrier : SchedWrite;
// Vector ALU instructions
def Write32Bit : SchedWrite;
def WriteQuarterRate32 : SchedWrite;
+def WriteFullOrQuarterRate32 : SchedWrite;
def WriteFloatFMA : SchedWrite;
-def WriteDouble : SchedWrite;
+// Slow quarter rate f64 instruction.
+def WriteDouble : SchedWrite;
+
+// half rate f64 instruction (same as v_add_f64)
def WriteDoubleAdd : SchedWrite;
+// Half rate 64-bit instructions.
+def Write64Bit : SchedWrite;
+
+// FIXME: Should there be a class for instructions which are VALU
+// instructions and have VALU rates, but write to the SALU (i.e. VOPC
+// instructions)
+
def SIFullSpeedModel : SchedMachineModel;
def SIQuarterSpeedModel : SchedMachineModel;
@@ -53,7 +65,7 @@ class HWVALUWriteRes<SchedWrite write, int latency> :
// The latency numbers are taken from AMD Accelerated Parallel Processing
-// guide. They may not be acurate.
+// guide. They may not be accurate.
// The latency values are 1 / (operations / cycle) / 4.
multiclass SICommonWriteRes {
@@ -64,8 +76,10 @@ multiclass SICommonWriteRes {
def : HWWriteRes<WriteSALU, [HWSALU], 1>;
def : HWWriteRes<WriteSMEM, [HWLGKM], 10>; // XXX: Guessed ???
def : HWWriteRes<WriteVMEM, [HWVMEM], 450>; // 300 - 600
+ def : HWWriteRes<WriteBarrier, [HWBranch], 500>; // XXX: Guessed ???
def : HWVALUWriteRes<Write32Bit, 1>;
+ def : HWVALUWriteRes<Write64Bit, 2>;
def : HWVALUWriteRes<WriteQuarterRate32, 4>;
}
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 5d00bdd6a9bb..4f0913fe62f2 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -141,8 +141,7 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
if (!MRI.isSSA())
return;
- assert(TII->isVOP1(MI.getOpcode()) || TII->isVOP2(MI.getOpcode()) ||
- TII->isVOPC(MI.getOpcode()));
+ assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));
const SIRegisterInfo &TRI = TII->getRegisterInfo();
int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
@@ -187,6 +186,21 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
}
+// Copy a register MachineOperand with all of its flags, but mark the copy as implicit.
+static MachineOperand copyRegOperandAsImplicit(const MachineOperand &Orig) {
+ assert(!Orig.isImplicit());
+ return MachineOperand::CreateReg(Orig.getReg(),
+ Orig.isDef(),
+ true,
+ Orig.isKill(),
+ Orig.isDead(),
+ Orig.isUndef(),
+ Orig.isEarlyClobber(),
+ Orig.getSubReg(),
+ Orig.isDebug(),
+ Orig.isInternalRead());
+}
+
bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
const SIInstrInfo *TII =
@@ -236,14 +250,10 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
if (TII->isVOPC(Op32)) {
unsigned DstReg = MI.getOperand(0).getReg();
if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
- // VOPC instructions can only write to the VCC register. We can't
- // force them to use VCC here, because the register allocator has
- // trouble with sequences like this, which cause the allocator to run
- // out of registers if vreg0 and vreg1 belong to the VCCReg register
- // class:
- // vreg0 = VOPC;
- // vreg1 = VOPC;
- // S_AND_B64 vreg0, vreg1
+ // VOPC instructions can only write to the VCC register. We can't
+      // force them to use VCC here, because it is only one register and the
+      // register allocator cannot deal with sequences which would require
+      // multiple copies of VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
//
// So, instead of forcing the instruction to write to VCC, we provide
// a hint to the register allocator to use VCC and then we will run
@@ -272,13 +282,22 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
}
// We can shrink this instruction
- DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << '\n';);
+ DEBUG(dbgs() << "Shrinking " << MI);
MachineInstrBuilder Inst32 =
BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));
- // dst
- Inst32.addOperand(MI.getOperand(0));
+ // Add the dst operand if the 32-bit encoding also has an explicit $dst.
+ // For VOPC instructions, this is replaced by an implicit def of vcc.
+ int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::dst);
+ if (Op32DstIdx != -1) {
+ // dst
+ Inst32.addOperand(MI.getOperand(0));
+ } else {
+ assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
+ "Unexpected case");
+ }
+
Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
@@ -288,9 +307,19 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
Inst32.addOperand(*Src1);
const MachineOperand *Src2 =
- TII->getNamedOperand(MI, AMDGPU::OpName::src2);
- if (Src2)
- Inst32.addOperand(*Src2);
+ TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+ if (Src2) {
+ int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
+ if (Op32Src2Idx != -1) {
+ Inst32.addOperand(*Src2);
+ } else {
+ // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
+ // replaced with an implicit read of vcc.
+ assert(Src2->getReg() == AMDGPU::VCC &&
+ "Unexpected missing register operand");
+ Inst32.addOperand(copyRegOperandAsImplicit(*Src2));
+ }
+ }
++NumInstructionsShrunk;
MI.eraseFromParent();
diff --git a/lib/Target/AMDGPU/SITypeRewriter.cpp b/lib/Target/AMDGPU/SITypeRewriter.cpp
index 591ce857cc7d..dbdc76b917f3 100644
--- a/lib/Target/AMDGPU/SITypeRewriter.cpp
+++ b/lib/Target/AMDGPU/SITypeRewriter.cpp
@@ -22,6 +22,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
@@ -61,14 +62,7 @@ bool SITypeRewriter::doInitialization(Module &M) {
}
bool SITypeRewriter::runOnFunction(Function &F) {
- Attribute A = F.getFnAttribute("ShaderType");
-
- unsigned ShaderType = ShaderType::COMPUTE;
- if (A.isStringAttribute()) {
- StringRef Str = A.getValueAsString();
- Str.getAsInteger(0, ShaderType);
- }
- if (ShaderType == ShaderType::COMPUTE)
+ if (AMDGPU::getShaderType(F) == ShaderType::COMPUTE)
return false;
visit(F);
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index b76b4007003f..add415e215cf 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -7,12 +7,23 @@
//
//===----------------------------------------------------------------------===//
#include "AMDGPUBaseInfo.h"
+#include "AMDGPU.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/SubtargetFeature.h"
#define GET_SUBTARGETINFO_ENUM
#include "AMDGPUGenSubtargetInfo.inc"
#undef GET_SUBTARGETINFO_ENUM
+#define GET_REGINFO_ENUM
+#include "AMDGPUGenRegisterInfo.inc"
+#undef GET_REGINFO_ENUM
+
namespace llvm {
namespace AMDGPU {
@@ -56,5 +67,91 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
Header.private_segment_alignment = 4;
}
+MCSection *getHSATextSection(MCContext &Ctx) {
+ return Ctx.getELFSection(".hsatext", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE |
+ ELF::SHF_EXECINSTR |
+ ELF::SHF_AMDGPU_HSA_AGENT |
+ ELF::SHF_AMDGPU_HSA_CODE);
+}
+
+MCSection *getHSADataGlobalAgentSection(MCContext &Ctx) {
+ return Ctx.getELFSection(".hsadata_global_agent", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE |
+ ELF::SHF_AMDGPU_HSA_GLOBAL |
+ ELF::SHF_AMDGPU_HSA_AGENT);
+}
+
+MCSection *getHSADataGlobalProgramSection(MCContext &Ctx) {
+ return Ctx.getELFSection(".hsadata_global_program", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE |
+ ELF::SHF_AMDGPU_HSA_GLOBAL);
+}
+
+MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx) {
+ return Ctx.getELFSection(".hsarodata_readonly_agent", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_AMDGPU_HSA_READONLY |
+ ELF::SHF_AMDGPU_HSA_AGENT);
+}
+
+bool isGroupSegment(const GlobalValue *GV) {
+ return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}
+
+bool isGlobalSegment(const GlobalValue *GV) {
+ return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+}
+
+bool isReadOnlySegment(const GlobalValue *GV) {
+ return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
+}
+
+static const char ShaderTypeAttribute[] = "ShaderType";
+
+unsigned getShaderType(const Function &F) {
+ Attribute A = F.getFnAttribute(ShaderTypeAttribute);
+ unsigned ShaderType = ShaderType::COMPUTE;
+
+ if (A.isStringAttribute()) {
+ StringRef Str = A.getValueAsString();
+ if (Str.getAsInteger(0, ShaderType)) {
+ LLVMContext &Ctx = F.getContext();
+ Ctx.emitError("can't parse shader type");
+ }
+ }
+ return ShaderType;
+}
+
+bool isSI(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands];
+}
+
+bool isCI(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureSeaIslands];
+}
+
+bool isVI(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands];
+}
+
+unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
+
+ switch(Reg) {
+ default: break;
+ case AMDGPU::FLAT_SCR:
+ assert(!isSI(STI));
+ return isCI(STI) ? AMDGPU::FLAT_SCR_ci : AMDGPU::FLAT_SCR_vi;
+
+ case AMDGPU::FLAT_SCR_LO:
+ assert(!isSI(STI));
+ return isCI(STI) ? AMDGPU::FLAT_SCR_LO_ci : AMDGPU::FLAT_SCR_LO_vi;
+
+ case AMDGPU::FLAT_SCR_HI:
+ assert(!isSI(STI));
+ return isCI(STI) ? AMDGPU::FLAT_SCR_HI_ci : AMDGPU::FLAT_SCR_HI_vi;
+ }
+ return Reg;
+}
+
} // End namespace AMDGPU
} // End namespace llvm
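
getMCReg() lets MC-level code keep using the generation-independent FLAT_SCR pseudo-registers and resolve them only once a concrete subtarget is known. A hedged usage sketch; STI stands for whatever MCSubtargetInfo the call site already has.

// Sketch: resolve the FLAT_SCR pseudo to the subtarget-specific register.
unsigned HWReg = AMDGPU::getMCReg(AMDGPU::FLAT_SCR, STI);
// CI resolves to FLAT_SCR_ci (SGPRs 104/105), VI to FLAT_SCR_vi (102/103).
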
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index f57028cc5bfd..19419a29f5e0 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -15,6 +15,11 @@
namespace llvm {
class FeatureBitset;
+class Function;
+class GlobalValue;
+class MCContext;
+class MCSection;
+class MCSubtargetInfo;
namespace AMDGPU {
@@ -27,6 +32,27 @@ struct IsaVersion {
IsaVersion getIsaVersion(const FeatureBitset &Features);
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
const FeatureBitset &Features);
+MCSection *getHSATextSection(MCContext &Ctx);
+
+MCSection *getHSADataGlobalAgentSection(MCContext &Ctx);
+
+MCSection *getHSADataGlobalProgramSection(MCContext &Ctx);
+
+MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx);
+
+bool isGroupSegment(const GlobalValue *GV);
+bool isGlobalSegment(const GlobalValue *GV);
+bool isReadOnlySegment(const GlobalValue *GV);
+
+unsigned getShaderType(const Function &F);
+
+bool isSI(const MCSubtargetInfo &STI);
+bool isCI(const MCSubtargetInfo &STI);
+bool isVI(const MCSubtargetInfo &STI);
+
+/// If \p Reg is a pseudo reg, return the correct hardware register given
+/// \p STI, otherwise return \p Reg.
+unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI);
} // end namespace AMDGPU
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/LLVMBuild.txt b/lib/Target/AMDGPU/Utils/LLVMBuild.txt
index dec5360e3bc7..2453bc546b99 100644
--- a/lib/Target/AMDGPU/Utils/LLVMBuild.txt
+++ b/lib/Target/AMDGPU/Utils/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = AMDGPUUtils
parent = AMDGPU
-required_libraries = Support
+required_libraries = Core MC Support
add_to_library_groups = AMDGPU
diff --git a/lib/Target/AMDGPU/VIInstructions.td b/lib/Target/AMDGPU/VIInstructions.td
index aca46732adb9..20a026a822e2 100644
--- a/lib/Target/AMDGPU/VIInstructions.td
+++ b/lib/Target/AMDGPU/VIInstructions.td
@@ -73,8 +73,8 @@ defm V_MIN_I16 : VOP2Inst <vop2<0,0x32>, "v_min_i16", VOP_I16_I16_I16>;
} // End isCommutable = 1
defm V_LDEXP_F16 : VOP2Inst <vop2<0,0x33>, "v_ldexp_f16", VOP_F16_F16_I16>;
-// Aliases to simplify matching of floating-pint instructions that are VOP2 on
-// SI and VOP3 on VI.
+// Aliases to simplify matching of floating-point instructions that
+// are VOP2 on SI and VOP3 on VI.
class SI2_VI3Alias <string name, Instruction inst> : InstAlias <
name#" $dst, $src0, $src1",
@@ -89,60 +89,15 @@ def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>;
def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>;
def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>;
-} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI
-
//===----------------------------------------------------------------------===//
-// SMEM Patterns
+// SMEM Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [isVI] in {
+def S_DCACHE_WB : SMEM_Inval <0x21,
+ "s_dcache_wb", int_amdgcn_s_dcache_wb>;
-// 1. Offset as 20bit DWORD immediate
-def : Pat <
- (SIload_constant v4i32:$sbase, IMM20bit:$offset),
- (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
->;
-
-// Patterns for global loads with no offset
-class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
- (vt (node i64:$addr)),
- (inst $addr, 0, 0, 0)
->;
-
-def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORD, global_load, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX2, global_load, v2i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX4, global_load, v4i32>;
-
-class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
- (node vt:$data, i64:$addr),
- (inst $data, $addr, 0, 0, 0)
->;
-
-def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_global, i32>;
-def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_global, i32>;
-def : FlatStorePat <FLAT_STORE_DWORD, global_store, i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX2, global_store, v2i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX4, global_store, v4i32>;
-
-class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
- (vt (node i64:$addr, vt:$data)),
- (inst $addr, $data, 0, 0)
->;
-
-def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
+def S_DCACHE_WB_VOL : SMEM_Inval <0x23,
+ "s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>;
+} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI
-} // End Predicates = [isVI]
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
index 9550a3a3cad1..cd7540e52410 100644
--- a/lib/Target/ARM/ARM.h
+++ b/lib/Target/ARM/ARM.h
@@ -35,7 +35,6 @@ FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM,
FunctionPass *createA15SDOptimizerPass();
FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false);
FunctionPass *createARMExpandPseudoPass();
-FunctionPass *createARMGlobalBaseRegPass();
FunctionPass *createARMConstantIslandPass();
FunctionPass *createMLxExpansionPass();
FunctionPass *createThumb2ITBlockPass();
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index ef609a66d032..a44dc830a673 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -17,6 +17,17 @@
include "llvm/Target/Target.td"
//===----------------------------------------------------------------------===//
+// ARM Helper classes.
+//
+
+class ProcNoItin<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+class Architecture<string fname, string aname, list<SubtargetFeature> features >
+ : SubtargetFeature<fname, "ARMArch", aname,
+ !strconcat(aname, " architecture"), features>;
+
+//===----------------------------------------------------------------------===//
// ARM Subtarget state.
//
@@ -51,8 +62,11 @@ def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true",
def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8",
"true", "Enable ARMv8 FP",
[FeatureVFP4]>;
+def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true",
+ "Enable full half-precision floating point",
+ [FeatureFPARMv8]>;
def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true",
- "Restrict VFP3 to 16 double registers">;
+ "Restrict FP to 16 double registers">;
def FeatureHWDiv : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true",
"Enable divide instructions">;
def FeatureHWDivARM : SubtargetFeature<"hwdiv-arm",
@@ -119,9 +133,9 @@ def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop",
def FeatureHasRAS : SubtargetFeature<"ras", "HasRAS", "true",
"Has return address stack">;
-/// Some M architectures don't have the DSP extension (v7E-M vs. v7M)
-def FeatureDSPThumb2 : SubtargetFeature<"t2dsp", "Thumb2DSP", "true",
- "Supports v7 DSP instructions in Thumb2">;
+/// DSP extension.
+def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true",
+ "Supports DSP instructions in ARM and/or Thumb2">;
// Multiprocessing extension.
def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true",
@@ -150,11 +164,28 @@ def FeatureAClass : SubtargetFeature<"aclass", "ARMProcClass", "AClass",
def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true",
"NaCl trap">;
+def FeatureStrictAlign : SubtargetFeature<"strict-align",
+ "StrictAlign", "true",
+ "Disallow all unaligned memory "
+ "access">;
+
def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true",
"Generate calls via indirect call "
"instructions">;
-// ARM ISAs.
+def FeatureReserveR9 : SubtargetFeature<"reserve-r9", "ReserveR9", "true",
+ "Reserve R9, making it unavailable as "
+ "GPR">;
+
+def FeatureNoMovt : SubtargetFeature<"no-movt", "NoMovt", "true",
+ "Don't use movt/movw pairs for 32-bit "
+ "imms">;
+
+
+//===----------------------------------------------------------------------===//
+// ARM ISAs.
+//
+
def HasV4TOps : SubtargetFeature<"v4t", "HasV4TOps", "true",
"Support ARM v4T instructions">;
def HasV5TOps : SubtargetFeature<"v5t", "HasV5TOps", "true",
@@ -180,302 +211,444 @@ def HasV7Ops : SubtargetFeature<"v7", "HasV7Ops", "true",
[HasV6T2Ops, FeaturePerfMon]>;
def HasV8Ops : SubtargetFeature<"v8", "HasV8Ops", "true",
"Support ARM v8 instructions",
- [HasV7Ops, FeatureVirtualization,
- FeatureMP]>;
+ [HasV7Ops]>;
def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
"Support ARM v8.1a instructions",
- [HasV8Ops, FeatureAClass, FeatureCRC]>;
+ [HasV8Ops]>;
+def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
+ "Support ARM v8.2a instructions",
+ [HasV8_1aOps]>;
+
//===----------------------------------------------------------------------===//
-// ARM Processors supported.
+// ARM Processor subtarget features.
//
-include "ARMSchedule.td"
-
-// ARM processor families.
def ProcA5 : SubtargetFeature<"a5", "ARMProcFamily", "CortexA5",
- "Cortex-A5 ARM processors",
- [FeatureSlowFPBrcc, FeatureHasSlowFPVMLx,
- FeatureVMLxForwarding, FeatureT2XtPk,
- FeatureTrustZone, FeatureMP]>;
+ "Cortex-A5 ARM processors", []>;
def ProcA7 : SubtargetFeature<"a7", "ARMProcFamily", "CortexA7",
- "Cortex-A7 ARM processors",
- [FeatureSlowFPBrcc, FeatureHasSlowFPVMLx,
- FeatureVMLxForwarding, FeatureT2XtPk,
- FeatureVFP4, FeatureMP,
- FeatureHWDiv, FeatureHWDivARM,
- FeatureTrustZone, FeatureVirtualization]>;
+ "Cortex-A7 ARM processors", []>;
def ProcA8 : SubtargetFeature<"a8", "ARMProcFamily", "CortexA8",
- "Cortex-A8 ARM processors",
- [FeatureSlowFPBrcc, FeatureHasSlowFPVMLx,
- FeatureVMLxForwarding, FeatureT2XtPk,
- FeatureTrustZone]>;
+ "Cortex-A8 ARM processors", []>;
def ProcA9 : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9",
- "Cortex-A9 ARM processors",
- [FeatureVMLxForwarding,
- FeatureT2XtPk, FeatureFP16,
- FeatureAvoidPartialCPSR,
- FeatureTrustZone]>;
-def ProcSwift : SubtargetFeature<"swift", "ARMProcFamily", "Swift",
- "Swift ARM processors",
- [FeatureNEONForFP, FeatureT2XtPk,
- FeatureVFP4, FeatureMP, FeatureHWDiv,
- FeatureHWDivARM, FeatureAvoidPartialCPSR,
- FeatureAvoidMOVsShOp,
- FeatureHasSlowFPVMLx, FeatureTrustZone]>;
+ "Cortex-A9 ARM processors", []>;
def ProcA12 : SubtargetFeature<"a12", "ARMProcFamily", "CortexA12",
- "Cortex-A12 ARM processors",
- [FeatureVMLxForwarding,
- FeatureT2XtPk, FeatureVFP4,
- FeatureHWDiv, FeatureHWDivARM,
- FeatureAvoidPartialCPSR,
- FeatureVirtualization,
- FeatureTrustZone]>;
-
-
-// FIXME: It has not been determined if A15 has these features.
-def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15",
- "Cortex-A15 ARM processors",
- [FeatureT2XtPk, FeatureVFP4,
- FeatureMP, FeatureHWDiv, FeatureHWDivARM,
- FeatureAvoidPartialCPSR,
- FeatureTrustZone, FeatureVirtualization]>;
-
+ "Cortex-A12 ARM processors", []>;
+def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15",
+ "Cortex-A15 ARM processors", []>;
def ProcA17 : SubtargetFeature<"a17", "ARMProcFamily", "CortexA17",
- "Cortex-A17 ARM processors",
- [FeatureVMLxForwarding,
- FeatureT2XtPk, FeatureVFP4,
- FeatureHWDiv, FeatureHWDivARM,
- FeatureAvoidPartialCPSR,
- FeatureVirtualization,
- FeatureTrustZone]>;
-
+ "Cortex-A17 ARM processors", []>;
+def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
+ "Cortex-A35 ARM processors", []>;
def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
- "Cortex-A53 ARM processors",
- [FeatureHWDiv, FeatureHWDivARM,
- FeatureTrustZone, FeatureT2XtPk,
- FeatureCrypto, FeatureCRC]>;
-
+ "Cortex-A53 ARM processors", []>;
def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
- "Cortex-A57 ARM processors",
- [FeatureHWDiv, FeatureHWDivARM,
- FeatureTrustZone, FeatureT2XtPk,
- FeatureCrypto, FeatureCRC]>;
+ "Cortex-A57 ARM processors", []>;
+def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
+ "Cortex-A72 ARM processors", []>;
-def ProcR4 : SubtargetFeature<"r4", "ARMProcFamily", "CortexR4",
- "Cortex-R4 ARM processors",
- [FeatureHWDiv,
- FeatureAvoidPartialCPSR,
- FeatureDSPThumb2, FeatureT2XtPk,
- HasV7Ops, FeatureDB, FeatureHasRAS,
- FeatureRClass]>;
+def ProcKrait : SubtargetFeature<"krait", "ARMProcFamily", "Krait",
+ "Qualcomm ARM processors", []>;
+def ProcSwift : SubtargetFeature<"swift", "ARMProcFamily", "Swift",
+ "Swift ARM processors", []>;
+
+def ProcR4 : SubtargetFeature<"r4", "ARMProcFamily", "CortexR4",
+ "Cortex-R4 ARM processors", []>;
def ProcR5 : SubtargetFeature<"r5", "ARMProcFamily", "CortexR5",
- "Cortex-R5 ARM processors",
- [FeatureSlowFPBrcc,
- FeatureHWDiv, FeatureHWDivARM,
- FeatureHasSlowFPVMLx,
- FeatureAvoidPartialCPSR,
- FeatureT2XtPk]>;
-
-// FIXME: krait has currently the same features as A9
-// plus VFP4 and hardware division features.
-def ProcKrait : SubtargetFeature<"krait", "ARMProcFamily", "Krait",
- "Qualcomm ARM processors",
- [FeatureVMLxForwarding,
- FeatureT2XtPk, FeatureFP16,
- FeatureAvoidPartialCPSR,
- FeatureTrustZone,
- FeatureVFP4,
- FeatureHWDiv,
- FeatureHWDivARM]>;
+ "Cortex-R5 ARM processors", []>;
+def ProcR7 : SubtargetFeature<"r7", "ARMProcFamily", "CortexR7",
+ "Cortex-R7 ARM processors", []>;
-class ProcNoItin<string Name, list<SubtargetFeature> Features>
- : Processor<Name, NoItineraries, Features>;
+//===----------------------------------------------------------------------===//
+// ARM schedules.
+//
+
+include "ARMSchedule.td"
+
+
+//===----------------------------------------------------------------------===//
+// ARM architectures
+//
+
+def ARMv2 : Architecture<"armv2", "ARMv2", []>;
+
+def ARMv2a : Architecture<"armv2a", "ARMv2a", []>;
+
+def ARMv3 : Architecture<"armv3", "ARMv3", []>;
+
+def ARMv3m : Architecture<"armv3m", "ARMv3m", []>;
+
+def ARMv4 : Architecture<"armv4", "ARMv4", []>;
+
+def ARMv4t : Architecture<"armv4t", "ARMv4t", [HasV4TOps]>;
+
+def ARMv5t : Architecture<"armv5t", "ARMv5t", [HasV5TOps]>;
+
+def ARMv5te : Architecture<"armv5te", "ARMv5te", [HasV5TEOps]>;
+
+def ARMv5tej : Architecture<"armv5tej", "ARMv5tej", [HasV5TEOps]>;
+
+def ARMv6 : Architecture<"armv6", "ARMv6", [HasV6Ops]>;
+
+def ARMv6t2 : Architecture<"armv6t2", "ARMv6t2", [HasV6T2Ops,
+ FeatureDSP]>;
+
+def ARMv6k : Architecture<"armv6k", "ARMv6k", [HasV6KOps]>;
+
+def ARMv6kz : Architecture<"armv6kz", "ARMv6kz", [HasV6KOps,
+ FeatureTrustZone]>;
+
+def ARMv6m : Architecture<"armv6-m", "ARMv6m", [HasV6MOps,
+ FeatureNoARM,
+ FeatureDB,
+ FeatureMClass]>;
+
+def ARMv6sm : Architecture<"armv6s-m", "ARMv6sm", [HasV6MOps,
+ FeatureNoARM,
+ FeatureDB,
+ FeatureMClass]>;
+
+def ARMv7a : Architecture<"armv7-a", "ARMv7a", [HasV7Ops,
+ FeatureNEON,
+ FeatureDB,
+ FeatureDSP,
+ FeatureAClass]>;
+
+def ARMv7r : Architecture<"armv7-r", "ARMv7r", [HasV7Ops,
+ FeatureDB,
+ FeatureDSP,
+ FeatureHWDiv,
+ FeatureRClass]>;
+
+def ARMv7m : Architecture<"armv7-m", "ARMv7m", [HasV7Ops,
+ FeatureThumb2,
+ FeatureNoARM,
+ FeatureDB,
+ FeatureHWDiv,
+ FeatureMClass]>;
+
+def ARMv7em : Architecture<"armv7e-m", "ARMv7em", [HasV7Ops,
+ FeatureThumb2,
+ FeatureNoARM,
+ FeatureDB,
+ FeatureHWDiv,
+ FeatureMClass,
+ FeatureDSP,
+ FeatureT2XtPk]>;
+
+def ARMv8a : Architecture<"armv8-a", "ARMv8a", [HasV8Ops,
+ FeatureAClass,
+ FeatureDB,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeatureDSP,
+ FeatureTrustZone,
+ FeatureMP,
+ FeatureVirtualization,
+ FeatureCrypto,
+ FeatureCRC]>;
+
+def ARMv81a : Architecture<"armv8.1-a", "ARMv81a", [HasV8_1aOps,
+ FeatureAClass,
+ FeatureDB,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeatureDSP,
+ FeatureTrustZone,
+ FeatureMP,
+ FeatureVirtualization,
+ FeatureCrypto,
+ FeatureCRC]>;
+
+def ARMv82a : Architecture<"armv8.2-a", "ARMv82a", [HasV8_2aOps,
+ FeatureAClass,
+ FeatureDB,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeatureDSP,
+ FeatureTrustZone,
+ FeatureMP,
+ FeatureVirtualization,
+ FeatureCrypto,
+ FeatureCRC]>;
+
+// Aliases
+def IWMMXT : Architecture<"iwmmxt", "ARMv5te", [ARMv5te]>;
+def IWMMXT2 : Architecture<"iwmmxt2", "ARMv5te", [ARMv5te]>;
+def XScale : Architecture<"xscale", "ARMv5te", [ARMv5te]>;
+def ARMv6j : Architecture<"armv6j", "ARMv7a", [ARMv6]>;
+def ARMv7k : Architecture<"armv7k", "ARMv7a", [ARMv7a]>;
+def ARMv7s : Architecture<"armv7s", "ARMv7a", [ARMv7a]>;
+
+
+//===----------------------------------------------------------------------===//
+// ARM processors
+//
+
+// Dummy CPU, used to target architectures
+def : ProcNoItin<"generic", []>;
+
+def : ProcNoItin<"arm8", [ARMv4]>;
+def : ProcNoItin<"arm810", [ARMv4]>;
+def : ProcNoItin<"strongarm", [ARMv4]>;
+def : ProcNoItin<"strongarm110", [ARMv4]>;
+def : ProcNoItin<"strongarm1100", [ARMv4]>;
+def : ProcNoItin<"strongarm1110", [ARMv4]>;
+
+def : ProcNoItin<"arm7tdmi", [ARMv4t]>;
+def : ProcNoItin<"arm7tdmi-s", [ARMv4t]>;
+def : ProcNoItin<"arm710t", [ARMv4t]>;
+def : ProcNoItin<"arm720t", [ARMv4t]>;
+def : ProcNoItin<"arm9", [ARMv4t]>;
+def : ProcNoItin<"arm9tdmi", [ARMv4t]>;
+def : ProcNoItin<"arm920", [ARMv4t]>;
+def : ProcNoItin<"arm920t", [ARMv4t]>;
+def : ProcNoItin<"arm922t", [ARMv4t]>;
+def : ProcNoItin<"arm940t", [ARMv4t]>;
+def : ProcNoItin<"ep9312", [ARMv4t]>;
+
+def : ProcNoItin<"arm10tdmi", [ARMv5t]>;
+def : ProcNoItin<"arm1020t", [ARMv5t]>;
+
+def : ProcNoItin<"arm9e", [ARMv5te]>;
+def : ProcNoItin<"arm926ej-s", [ARMv5te]>;
+def : ProcNoItin<"arm946e-s", [ARMv5te]>;
+def : ProcNoItin<"arm966e-s", [ARMv5te]>;
+def : ProcNoItin<"arm968e-s", [ARMv5te]>;
+def : ProcNoItin<"arm10e", [ARMv5te]>;
+def : ProcNoItin<"arm1020e", [ARMv5te]>;
+def : ProcNoItin<"arm1022e", [ARMv5te]>;
+def : ProcNoItin<"xscale", [ARMv5te]>;
+def : ProcNoItin<"iwmmxt", [ARMv5te]>;
+
+def : Processor<"arm1136j-s", ARMV6Itineraries, [ARMv6]>;
+def : Processor<"arm1136jf-s", ARMV6Itineraries, [ARMv6,
+ FeatureVFP2,
+ FeatureHasSlowFPVMLx]>;
+
+def : Processor<"cortex-m0", ARMV6Itineraries, [ARMv6m]>;
+def : Processor<"cortex-m0plus", ARMV6Itineraries, [ARMv6m]>;
+def : Processor<"cortex-m1", ARMV6Itineraries, [ARMv6m]>;
+def : Processor<"sc000", ARMV6Itineraries, [ARMv6m]>;
+
+def : Processor<"arm1176jz-s", ARMV6Itineraries, [ARMv6kz]>;
+def : Processor<"arm1176jzf-s", ARMV6Itineraries, [ARMv6kz,
+ FeatureVFP2,
+ FeatureHasSlowFPVMLx]>;
+
+def : Processor<"mpcorenovfp", ARMV6Itineraries, [ARMv6k]>;
+def : Processor<"mpcore", ARMV6Itineraries, [ARMv6k,
+ FeatureVFP2,
+ FeatureHasSlowFPVMLx]>;
+
+def : Processor<"arm1156t2-s", ARMV6Itineraries, [ARMv6t2]>;
+def : Processor<"arm1156t2f-s", ARMV6Itineraries, [ARMv6t2,
+ FeatureVFP2,
+ FeatureHasSlowFPVMLx]>;
-// V4 Processors.
-def : ProcNoItin<"generic", []>;
-def : ProcNoItin<"arm8", []>;
-def : ProcNoItin<"arm810", []>;
-def : ProcNoItin<"strongarm", []>;
-def : ProcNoItin<"strongarm110", []>;
-def : ProcNoItin<"strongarm1100", []>;
-def : ProcNoItin<"strongarm1110", []>;
-
-// V4T Processors.
-def : ProcNoItin<"arm7tdmi", [HasV4TOps]>;
-def : ProcNoItin<"arm7tdmi-s", [HasV4TOps]>;
-def : ProcNoItin<"arm710t", [HasV4TOps]>;
-def : ProcNoItin<"arm720t", [HasV4TOps]>;
-def : ProcNoItin<"arm9", [HasV4TOps]>;
-def : ProcNoItin<"arm9tdmi", [HasV4TOps]>;
-def : ProcNoItin<"arm920", [HasV4TOps]>;
-def : ProcNoItin<"arm920t", [HasV4TOps]>;
-def : ProcNoItin<"arm922t", [HasV4TOps]>;
-def : ProcNoItin<"arm940t", [HasV4TOps]>;
-def : ProcNoItin<"ep9312", [HasV4TOps]>;
-
-// V5T Processors.
-def : ProcNoItin<"arm10tdmi", [HasV5TOps]>;
-def : ProcNoItin<"arm1020t", [HasV5TOps]>;
-
-// V5TE Processors.
-def : ProcNoItin<"arm9e", [HasV5TEOps]>;
-def : ProcNoItin<"arm926ej-s", [HasV5TEOps]>;
-def : ProcNoItin<"arm946e-s", [HasV5TEOps]>;
-def : ProcNoItin<"arm966e-s", [HasV5TEOps]>;
-def : ProcNoItin<"arm968e-s", [HasV5TEOps]>;
-def : ProcNoItin<"arm10e", [HasV5TEOps]>;
-def : ProcNoItin<"arm1020e", [HasV5TEOps]>;
-def : ProcNoItin<"arm1022e", [HasV5TEOps]>;
-def : ProcNoItin<"xscale", [HasV5TEOps]>;
-def : ProcNoItin<"iwmmxt", [HasV5TEOps]>;
-
-// V6 Processors.
-def : Processor<"arm1136j-s", ARMV6Itineraries, [HasV6Ops]>;
-def : Processor<"arm1136jf-s", ARMV6Itineraries, [HasV6Ops, FeatureVFP2,
- FeatureHasSlowFPVMLx]>;
-
-// V6M Processors.
-def : Processor<"cortex-m0", ARMV6Itineraries, [HasV6MOps, FeatureNoARM,
- FeatureDB, FeatureMClass]>;
-def : Processor<"cortex-m0plus", ARMV6Itineraries, [HasV6MOps, FeatureNoARM,
- FeatureDB, FeatureMClass]>;
-def : Processor<"cortex-m1", ARMV6Itineraries, [HasV6MOps, FeatureNoARM,
- FeatureDB, FeatureMClass]>;
-def : Processor<"sc000", ARMV6Itineraries, [HasV6MOps, FeatureNoARM,
- FeatureDB, FeatureMClass]>;
-
-// V6K Processors.
-def : Processor<"arm1176jz-s", ARMV6Itineraries, [HasV6KOps]>;
-def : Processor<"arm1176jzf-s", ARMV6Itineraries, [HasV6KOps, FeatureVFP2,
- FeatureHasSlowFPVMLx]>;
-def : Processor<"mpcorenovfp", ARMV6Itineraries, [HasV6KOps]>;
-def : Processor<"mpcore", ARMV6Itineraries, [HasV6KOps, FeatureVFP2,
- FeatureHasSlowFPVMLx]>;
-
-// V6T2 Processors.
-def : Processor<"arm1156t2-s", ARMV6Itineraries, [HasV6T2Ops,
- FeatureDSPThumb2]>;
-def : Processor<"arm1156t2f-s", ARMV6Itineraries, [HasV6T2Ops, FeatureVFP2,
- FeatureHasSlowFPVMLx,
- FeatureDSPThumb2]>;
-
-// V7a Processors.
// FIXME: A5 has currently the same Schedule model as A8
-def : ProcessorModel<"cortex-a5", CortexA8Model,
- [ProcA5, HasV7Ops, FeatureNEON, FeatureDB,
- FeatureVFP4, FeatureDSPThumb2,
- FeatureHasRAS, FeatureAClass]>;
-def : ProcessorModel<"cortex-a7", CortexA8Model,
- [ProcA7, HasV7Ops, FeatureNEON, FeatureDB,
- FeatureDSPThumb2, FeatureHasRAS,
- FeatureAClass]>;
-def : ProcessorModel<"cortex-a8", CortexA8Model,
- [ProcA8, HasV7Ops, FeatureNEON, FeatureDB,
- FeatureDSPThumb2, FeatureHasRAS,
- FeatureAClass]>;
-def : ProcessorModel<"cortex-a9", CortexA9Model,
- [ProcA9, HasV7Ops, FeatureNEON, FeatureDB,
- FeatureDSPThumb2, FeatureHasRAS, FeatureMP,
- FeatureAClass]>;
+def : ProcessorModel<"cortex-a5", CortexA8Model, [ARMv7a, ProcA5,
+ FeatureHasRAS,
+ FeatureTrustZone,
+ FeatureSlowFPBrcc,
+ FeatureHasSlowFPVMLx,
+ FeatureVMLxForwarding,
+ FeatureT2XtPk,
+ FeatureMP,
+ FeatureVFP4]>;
+
+def : ProcessorModel<"cortex-a7", CortexA8Model, [ARMv7a, ProcA7,
+ FeatureHasRAS,
+ FeatureTrustZone,
+ FeatureSlowFPBrcc,
+ FeatureHasSlowFPVMLx,
+ FeatureVMLxForwarding,
+ FeatureT2XtPk,
+ FeatureMP,
+ FeatureVFP4,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureVirtualization]>;
+
+def : ProcessorModel<"cortex-a8", CortexA8Model, [ARMv7a, ProcA8,
+ FeatureHasRAS,
+ FeatureTrustZone,
+ FeatureSlowFPBrcc,
+ FeatureHasSlowFPVMLx,
+ FeatureVMLxForwarding,
+ FeatureT2XtPk]>;
+
+def : ProcessorModel<"cortex-a9", CortexA9Model, [ARMv7a, ProcA9,
+ FeatureHasRAS,
+ FeatureTrustZone,
+ FeatureVMLxForwarding,
+ FeatureT2XtPk,
+ FeatureFP16,
+ FeatureAvoidPartialCPSR,
+ FeatureMP]>;
// FIXME: A12 has currently the same Schedule model as A9
-def : ProcessorModel<"cortex-a12", CortexA9Model,
- [ProcA12, HasV7Ops, FeatureNEON, FeatureDB,
- FeatureDSPThumb2, FeatureMP,
- FeatureHasRAS, FeatureAClass]>;
-
-// FIXME: A15 has currently the same ProcessorModel as A9.
-def : ProcessorModel<"cortex-a15", CortexA9Model,
- [ProcA15, HasV7Ops, FeatureNEON, FeatureDB,
- FeatureDSPThumb2, FeatureHasRAS,
- FeatureAClass]>;
+def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12,
+ FeatureHasRAS,
+ FeatureTrustZone,
+ FeatureVMLxForwarding,
+ FeatureT2XtPk,
+ FeatureVFP4,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureAvoidPartialCPSR,
+ FeatureVirtualization,
+ FeatureMP]>;
+
+// FIXME: A15 has currently the same Schedule model as A9.
+def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15,
+ FeatureHasRAS,
+ FeatureTrustZone,
+ FeatureT2XtPk,
+ FeatureVFP4,
+ FeatureMP,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureAvoidPartialCPSR,
+ FeatureVirtualization]>;
// FIXME: A17 has currently the same Schedule model as A9
-def : ProcessorModel<"cortex-a17", CortexA9Model,
- [ProcA17, HasV7Ops, FeatureNEON, FeatureDB,
- FeatureDSPThumb2, FeatureMP,
- FeatureHasRAS, FeatureAClass]>;
+def : ProcessorModel<"cortex-a17", CortexA9Model, [ARMv7a, ProcA17,
+ FeatureHasRAS,
+ FeatureTrustZone,
+ FeatureMP,
+ FeatureVMLxForwarding,
+ FeatureT2XtPk,
+ FeatureVFP4,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureAvoidPartialCPSR,
+ FeatureVirtualization]>;
// FIXME: krait has currently the same Schedule model as A9
-def : ProcessorModel<"krait", CortexA9Model,
- [ProcKrait, HasV7Ops,
- FeatureNEON, FeatureDB,
- FeatureDSPThumb2, FeatureHasRAS,
- FeatureAClass]>;
+// FIXME: krait has currently the same features as A9 plus VFP4 and hardware
+// division features.
+def : ProcessorModel<"krait", CortexA9Model, [ARMv7a, ProcKrait,
+ FeatureHasRAS,
+ FeatureVMLxForwarding,
+ FeatureT2XtPk,
+ FeatureFP16,
+ FeatureAvoidPartialCPSR,
+ FeatureVFP4,
+ FeatureHWDiv,
+ FeatureHWDivARM]>;
+
+def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift,
+ FeatureHasRAS,
+ FeatureNEONForFP,
+ FeatureT2XtPk,
+ FeatureVFP4,
+ FeatureMP,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureAvoidPartialCPSR,
+ FeatureAvoidMOVsShOp,
+ FeatureHasSlowFPVMLx]>;
// FIXME: R4 has currently the same ProcessorModel as A8.
-def : ProcessorModel<"cortex-r4", CortexA8Model,
- [ProcR4]>;
+def : ProcessorModel<"cortex-r4", CortexA8Model, [ARMv7r, ProcR4,
+ FeatureHasRAS,
+ FeatureAvoidPartialCPSR,
+ FeatureT2XtPk]>;
// FIXME: R4F has currently the same ProcessorModel as A8.
-def : ProcessorModel<"cortex-r4f", CortexA8Model,
- [ProcR4,
- FeatureSlowFPBrcc, FeatureHasSlowFPVMLx,
- FeatureVFP3, FeatureD16]>;
+def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4,
+ FeatureHasRAS,
+ FeatureSlowFPBrcc,
+ FeatureHasSlowFPVMLx,
+ FeatureVFP3,
+ FeatureD16,
+ FeatureAvoidPartialCPSR,
+ FeatureT2XtPk]>;
// FIXME: R5 has currently the same ProcessorModel as A8.
-def : ProcessorModel<"cortex-r5", CortexA8Model,
- [ProcR5, HasV7Ops, FeatureDB,
- FeatureVFP3, FeatureDSPThumb2,
- FeatureHasRAS,
- FeatureD16, FeatureRClass]>;
+def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5,
+ FeatureHasRAS,
+ FeatureVFP3,
+ FeatureD16,
+ FeatureSlowFPBrcc,
+ FeatureHWDivARM,
+ FeatureHasSlowFPVMLx,
+ FeatureAvoidPartialCPSR,
+ FeatureT2XtPk]>;
// FIXME: R7 has currently the same ProcessorModel as A8 and is modelled as R5.
-def : ProcessorModel<"cortex-r7", CortexA8Model,
- [ProcR5, HasV7Ops, FeatureDB,
- FeatureVFP3, FeatureDSPThumb2,
- FeatureHasRAS, FeatureVFPOnlySP,
- FeatureD16, FeatureMP, FeatureRClass]>;
-
-// V7M Processors.
-def : ProcNoItin<"cortex-m3", [HasV7Ops,
- FeatureThumb2, FeatureNoARM, FeatureDB,
- FeatureHWDiv, FeatureMClass]>;
-def : ProcNoItin<"sc300", [HasV7Ops,
- FeatureThumb2, FeatureNoARM, FeatureDB,
- FeatureHWDiv, FeatureMClass]>;
-
-// V7EM Processors.
-def : ProcNoItin<"cortex-m4", [HasV7Ops,
- FeatureThumb2, FeatureNoARM, FeatureDB,
- FeatureHWDiv, FeatureDSPThumb2,
- FeatureT2XtPk, FeatureVFP4,
- FeatureVFPOnlySP, FeatureD16,
- FeatureMClass]>;
-def : ProcNoItin<"cortex-m7", [HasV7Ops,
- FeatureThumb2, FeatureNoARM, FeatureDB,
- FeatureHWDiv, FeatureDSPThumb2,
- FeatureT2XtPk, FeatureFPARMv8,
- FeatureD16, FeatureMClass]>;
-
-
-// Swift uArch Processors.
-def : ProcessorModel<"swift", SwiftModel,
- [ProcSwift, HasV7Ops, FeatureNEON,
- FeatureDB, FeatureDSPThumb2,
- FeatureHasRAS, FeatureAClass]>;
-
-// V8 Processors
-def : ProcNoItin<"cortex-a53", [ProcA53, HasV8Ops, FeatureAClass,
- FeatureDB, FeatureFPARMv8,
- FeatureNEON, FeatureDSPThumb2]>;
-def : ProcNoItin<"cortex-a57", [ProcA57, HasV8Ops, FeatureAClass,
- FeatureDB, FeatureFPARMv8,
- FeatureNEON, FeatureDSPThumb2]>;
-// FIXME: Cortex-A72 is currently modelled as an Cortex-A57.
-def : ProcNoItin<"cortex-a72", [ProcA57, HasV8Ops, FeatureAClass,
- FeatureDB, FeatureFPARMv8,
- FeatureNEON, FeatureDSPThumb2]>;
+def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7,
+ FeatureHasRAS,
+ FeatureVFP3,
+ FeatureVFPOnlySP,
+ FeatureD16,
+ FeatureFP16,
+ FeatureMP,
+ FeatureSlowFPBrcc,
+ FeatureHWDivARM,
+ FeatureHasSlowFPVMLx,
+ FeatureAvoidPartialCPSR,
+ FeatureT2XtPk]>;
+
+def : ProcNoItin<"cortex-m3", [ARMv7m]>;
+def : ProcNoItin<"sc300", [ARMv7m]>;
+
+def : ProcNoItin<"cortex-m4", [ARMv7em,
+ FeatureVFP4,
+ FeatureVFPOnlySP,
+ FeatureD16]>;
+
+def : ProcNoItin<"cortex-m7", [ARMv7em,
+ FeatureFPARMv8,
+ FeatureD16]>;
+
+
+def : ProcNoItin<"cortex-a35", [ARMv8a, ProcA35,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureT2XtPk,
+ FeatureCrypto,
+ FeatureCRC]>;
+
+def : ProcNoItin<"cortex-a53", [ARMv8a, ProcA53,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureT2XtPk,
+ FeatureCrypto,
+ FeatureCRC]>;
+
+def : ProcNoItin<"cortex-a57", [ARMv8a, ProcA57,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureT2XtPk,
+ FeatureCrypto,
+ FeatureCRC]>;
+
+def : ProcNoItin<"cortex-a72", [ARMv8a, ProcA72,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureT2XtPk,
+ FeatureCrypto,
+ FeatureCRC]>;
// Cyclone is very similar to swift
-def : ProcessorModel<"cyclone", SwiftModel,
- [ProcSwift, HasV8Ops, HasV7Ops,
- FeatureCrypto, FeatureFPARMv8,
- FeatureDB,FeatureDSPThumb2,
- FeatureHasRAS, FeatureZCZeroing]>;
+def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift,
+ FeatureHasRAS,
+ FeatureNEONForFP,
+ FeatureT2XtPk,
+ FeatureVFP4,
+ FeatureMP,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureAvoidPartialCPSR,
+ FeatureAvoidMOVsShOp,
+ FeatureHasSlowFPVMLx,
+ FeatureCrypto,
+ FeatureZCZeroing]>;
+
//===----------------------------------------------------------------------===//
// Register File Description
@@ -504,8 +677,15 @@ def ARMAsmWriter : AsmWriter {
bit isMCAsmWriter = 1;
}
+def ARMAsmParserVariant : AsmParserVariant {
+ int Variant = 0;
+ string Name = "ARM";
+ string BreakCharacters = ".";
+}
+
def ARM : Target {
// Pull in Instruction Info:
let InstructionSet = ARMInstrInfo;
let AssemblyWriters = [ARMAsmWriter];
+ let AssemblyParserVariants = [ARMAsmParserVariant];
}
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index 738ddedccdac..206db9619a2f 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -60,7 +60,7 @@ using namespace llvm;
ARMAsmPrinter::ARMAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer)
: AsmPrinter(TM, std::move(Streamer)), AFI(nullptr), MCP(nullptr),
- InConstantPool(false) {}
+ InConstantPool(false), OptimizationGoals(-1) {}
void ARMAsmPrinter::EmitFunctionBodyEnd() {
// Make sure to terminate any constant pools that were at the end
@@ -80,8 +80,8 @@ void ARMAsmPrinter::EmitFunctionEntryLabel() {
OutStreamer->EmitLabel(CurrentFnSym);
}
-void ARMAsmPrinter::EmitXXStructor(const Constant *CV) {
- uint64_t Size = TM.getDataLayout()->getTypeAllocSize(CV->getType());
+void ARMAsmPrinter::EmitXXStructor(const DataLayout &DL, const Constant *CV) {
+ uint64_t Size = getDataLayout().getTypeAllocSize(CV->getType());
assert(Size && "C++ constructor pointer had zero size!");
const GlobalValue *GV = dyn_cast<GlobalValue>(CV->stripPointerCasts());
@@ -106,9 +106,38 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
Subtarget = &MF.getSubtarget<ARMSubtarget>();
SetupMachineFunction(MF);
+ const Function* F = MF.getFunction();
+ const TargetMachine& TM = MF.getTarget();
+
+ // Calculate this function's optimization goal.
+ unsigned OptimizationGoal;
+ if (F->hasFnAttribute(Attribute::OptimizeNone))
+ // For best debugging illusion, speed and small size sacrificed
+ OptimizationGoal = 6;
+ else if (F->optForMinSize())
+ // Aggressively for small size, speed and debug illusion sacrificed
+ OptimizationGoal = 4;
+ else if (F->optForSize())
+ // For small size, but speed and debugging illusion preserved
+ OptimizationGoal = 3;
+ else if (TM.getOptLevel() == CodeGenOpt::Aggressive)
+ // Aggressively for speed, small size and debug illusion sacrificed
+ OptimizationGoal = 2;
+ else if (TM.getOptLevel() > CodeGenOpt::None)
+ // For speed, but small size and good debug illusion preserved
+ OptimizationGoal = 1;
+ else // TM.getOptLevel() == CodeGenOpt::None
+ // For good debugging, but speed and small size preserved
+ OptimizationGoal = 5;
+
+ // Combine a new optimization goal with existing ones.
+ if (OptimizationGoals == -1) // uninitialized goals
+ OptimizationGoals = OptimizationGoal;
+ else if (OptimizationGoals != (int)OptimizationGoal) // conflicting goals
+ OptimizationGoals = 0;
if (Subtarget->isTargetCOFF()) {
- bool Internal = MF.getFunction()->hasInternalLinkage();
+ bool Internal = F->hasInternalLinkage();
COFF::SymbolStorageClass Scl = Internal ? COFF::IMAGE_SYM_CLASS_STATIC
: COFF::IMAGE_SYM_CLASS_EXTERNAL;
int Type = COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT;
@@ -198,22 +227,13 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
MCSymbol *ARMAsmPrinter::
GetARMJTIPICJumpTableLabel(unsigned uid) const {
- const DataLayout *DL = TM.getDataLayout();
+ const DataLayout &DL = getDataLayout();
SmallString<60> Name;
- raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "JTI"
+ raw_svector_ostream(Name) << DL.getPrivateGlobalPrefix() << "JTI"
<< getFunctionNumber() << '_' << uid;
return OutContext.getOrCreateSymbol(Name);
}
-
-MCSymbol *ARMAsmPrinter::GetARMSJLJEHLabel() const {
- const DataLayout *DL = TM.getDataLayout();
- SmallString<60> Name;
- raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "SJLJEH"
- << getFunctionNumber();
- return OutContext.getOrCreateSymbol(Name);
-}
-
bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
unsigned AsmVariant, const char *ExtraCode,
raw_ostream &O) {
@@ -515,6 +535,17 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
// generates code that does this, it is always safe to set.
OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
}
+
+ // The last attribute to be emitted is ABI_optimization_goals
+ MCTargetStreamer &TS = *OutStreamer->getTargetStreamer();
+ ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
+
+ if (OptimizationGoals > 0 &&
+ (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI()))
+ ATS.emitAttribute(ARMBuildAttrs::ABI_optimization_goals, OptimizationGoals);
+ OptimizationGoals = -1;
+
+ ATS.finishAttributeSection();
}
//===----------------------------------------------------------------------===//
@@ -532,7 +563,7 @@ static ARMBuildAttrs::CPUArch getArchForCPU(StringRef CPU,
if (Subtarget->hasV8Ops())
return ARMBuildAttrs::v8;
else if (Subtarget->hasV7Ops()) {
- if (Subtarget->isMClass() && Subtarget->hasThumb2DSP())
+ if (Subtarget->isMClass() && Subtarget->hasDSP())
return ARMBuildAttrs::v7E_M;
return ARMBuildAttrs::v7;
} else if (Subtarget->hasV6T2Ops())
@@ -587,7 +618,7 @@ void ARMAsmPrinter::emitAttributes() {
// We consider krait as a "cortex-a9" + hwdiv CPU
// Enable hwdiv through ".arch_extension idiv"
if (STI.hasDivide() || STI.hasDivideInARMMode())
- ATS.emitArchExtension(ARM::AEK_HWDIV);
+ ATS.emitArchExtension(ARM::AEK_HWDIV | ARM::AEK_HWDIVARM);
} else
ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, CPUString);
}
@@ -807,8 +838,6 @@ void ARMAsmPrinter::emitAttributes() {
else if (STI.hasVirtualization())
ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
ARMBuildAttrs::AllowVirtualization);
-
- ATS.finishAttributeSection();
}
//===----------------------------------------------------------------------===//
@@ -828,8 +857,7 @@ getModifierVariantKind(ARMCP::ARMCPModifier Modifier) {
case ARMCP::TLSGD: return MCSymbolRefExpr::VK_TLSGD;
case ARMCP::TPOFF: return MCSymbolRefExpr::VK_TPOFF;
case ARMCP::GOTTPOFF: return MCSymbolRefExpr::VK_GOTTPOFF;
- case ARMCP::GOT: return MCSymbolRefExpr::VK_GOT;
- case ARMCP::GOTOFF: return MCSymbolRefExpr::VK_GOTOFF;
+ case ARMCP::GOT_PREL: return MCSymbolRefExpr::VK_ARM_GOT_PREL;
}
llvm_unreachable("Invalid ARMCPModifier!");
}
@@ -875,8 +903,8 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV,
void ARMAsmPrinter::
EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
- const DataLayout *DL = TM.getDataLayout();
- int Size = TM.getDataLayout()->getTypeAllocSize(MCPV->getType());
+ const DataLayout &DL = getDataLayout();
+ int Size = DL.getTypeAllocSize(MCPV->getType());
ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPV);
@@ -909,10 +937,9 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
OutContext);
if (ACPV->getPCAdjustment()) {
- MCSymbol *PCLabel = getPICLabel(DL->getPrivateGlobalPrefix(),
- getFunctionNumber(),
- ACPV->getLabelId(),
- OutContext);
+ MCSymbol *PCLabel =
+ getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(),
+ ACPV->getLabelId(), OutContext);
const MCExpr *PCRelExpr = MCSymbolRefExpr::create(PCLabel, OutContext);
PCRelExpr =
MCBinaryExpr::createAdd(PCRelExpr,
@@ -1136,6 +1163,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
Offset = 0;
break;
case ARM::ADDri:
+ case ARM::t2ADDri:
Offset = -MI->getOperand(2).getImm();
break;
case ARM::SUBri:
@@ -1198,7 +1226,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
#include "ARMGenMCPseudoLowering.inc"
void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
- const DataLayout *DL = TM.getDataLayout();
+ const DataLayout &DL = getDataLayout();
// If we just ended a constant pool, mark it as such.
if (InConstantPool && MI->getOpcode() != ARM::CONSTPOOL_ENTRY) {
@@ -1355,9 +1383,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCSymbol *GVSym = GetARMGVSymbol(GV, TF);
const MCExpr *GVSymExpr = MCSymbolRefExpr::create(GVSym, OutContext);
- MCSymbol *LabelSym = getPICLabel(DL->getPrivateGlobalPrefix(),
- getFunctionNumber(),
- MI->getOperand(2).getImm(), OutContext);
+ MCSymbol *LabelSym =
+ getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(),
+ MI->getOperand(2).getImm(), OutContext);
const MCExpr *LabelSymExpr= MCSymbolRefExpr::create(LabelSym, OutContext);
unsigned PCAdj = (Opc == ARM::MOVi16_ga_pcrel) ? 8 : 4;
const MCExpr *PCRelExpr =
@@ -1388,9 +1416,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCSymbol *GVSym = GetARMGVSymbol(GV, TF);
const MCExpr *GVSymExpr = MCSymbolRefExpr::create(GVSym, OutContext);
- MCSymbol *LabelSym = getPICLabel(DL->getPrivateGlobalPrefix(),
- getFunctionNumber(),
- MI->getOperand(3).getImm(), OutContext);
+ MCSymbol *LabelSym =
+ getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(),
+ MI->getOperand(3).getImm(), OutContext);
const MCExpr *LabelSymExpr= MCSymbolRefExpr::create(LabelSym, OutContext);
unsigned PCAdj = (Opc == ARM::MOVTi16_ga_pcrel) ? 8 : 4;
const MCExpr *PCRelExpr =
@@ -1414,10 +1442,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// This adds the address of LPC0 to r0.
// Emit the label.
- OutStreamer->EmitLabel(getPICLabel(DL->getPrivateGlobalPrefix(),
+ OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(),
getFunctionNumber(),
- MI->getOperand(2).getImm(),
- OutContext));
+ MI->getOperand(2).getImm(), OutContext));
// Form and emit the add.
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDhirr)
@@ -1436,10 +1463,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// This adds the address of LPC0 to r0.
// Emit the label.
- OutStreamer->EmitLabel(getPICLabel(DL->getPrivateGlobalPrefix(),
+ OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(),
getFunctionNumber(),
- MI->getOperand(2).getImm(),
- OutContext));
+ MI->getOperand(2).getImm(), OutContext));
// Form and emit the add.
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::ADDrr)
@@ -1468,10 +1494,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// a PC-relative address at the ldr instruction.
// Emit the label.
- OutStreamer->EmitLabel(getPICLabel(DL->getPrivateGlobalPrefix(),
+ OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(),
getFunctionNumber(),
- MI->getOperand(2).getImm(),
- OutContext));
+ MI->getOperand(2).getImm(), OutContext));
// Form and emit the load
unsigned Opcode;
@@ -1519,7 +1544,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
if (MCPE.isMachineConstantPoolEntry())
EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
else
- EmitGlobalConstant(MCPE.Val.ConstVal);
+ EmitGlobalConstant(DL, MCPE.Val.ConstVal);
return;
}
case ARM::JUMPTABLE_ADDRS:
@@ -1653,12 +1678,12 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// adds $val, #7
// str $val, [$src, #4]
// movs r0, #0
- // b 1f
+ // b LSJLJEH
// movs r0, #1
- // 1:
+ // LSJLJEH:
unsigned SrcReg = MI->getOperand(0).getReg();
unsigned ValReg = MI->getOperand(1).getReg();
- MCSymbol *Label = GetARMSJLJEHLabel();
+ MCSymbol *Label = OutContext.createTempSymbol("SJLJEH", false, true);
OutStreamer->AddComment("eh_setjmp begin");
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr)
.addReg(ValReg)
diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h
index 3d251213f5bf..ed7be2de51ca 100644
--- a/lib/Target/ARM/ARMAsmPrinter.h
+++ b/lib/Target/ARM/ARMAsmPrinter.h
@@ -51,6 +51,11 @@ class LLVM_LIBRARY_VISIBILITY ARMAsmPrinter : public AsmPrinter {
/// labels used for ARMv4t thumb code to make register indirect calls.
SmallVector<std::pair<unsigned, MCSymbol*>, 4> ThumbIndirectPads;
+ /// OptimizationGoals - Maintain a combined optimization goal for all
+ /// functions in a module: one of Tag_ABI_optimization_goals values,
+ /// -1 if uninitialized, 0 if conflicting goals
+ int OptimizationGoals;
+
public:
explicit ARMAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer);
@@ -84,7 +89,7 @@ public:
void EmitFunctionEntryLabel() override;
void EmitStartOfAsmFile(Module &M) override;
void EmitEndOfAsmFile(Module &M) override;
- void EmitXXStructor(const Constant *CV) override;
+ void EmitXXStructor(const DataLayout &DL, const Constant *CV) override;
// lowerOperand - Convert a MachineOperand into the equivalent MCOperand.
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
@@ -119,8 +124,6 @@ private:
MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol);
MCSymbol *GetARMJTIPICJumpTableLabel(unsigned uid) const;
- MCSymbol *GetARMSJLJEHLabel() const;
-
MCSymbol *GetARMGVSymbol(const GlobalValue *GV, unsigned char TargetFlags);
public:
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 9f43e732bd73..49f328852667 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -97,7 +97,7 @@ ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget& STI)
Subtarget(STI) {
for (unsigned i = 0, e = array_lengthof(ARM_MLxTable); i != e; ++i) {
if (!MLxEntryMap.insert(std::make_pair(ARM_MLxTable[i].MLxOpc, i)).second)
- assert(false && "Duplicated entries?");
+ llvm_unreachable("Duplicated entries?");
MLxHazardOpcodes.insert(ARM_MLxTable[i].AddSubOpc);
MLxHazardOpcodes.insert(ARM_MLxTable[i].MulOpc);
}
@@ -440,7 +440,7 @@ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
bool ARMBaseInstrInfo::isPredicated(const MachineInstr *MI) const {
if (MI->isBundle()) {
- MachineBasicBlock::const_instr_iterator I = MI;
+ MachineBasicBlock::const_instr_iterator I = MI->getIterator();
MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
while (++I != E && I->isInsideBundle()) {
int PIdx = I->findFirstPredOperandIdx();
@@ -518,7 +518,7 @@ bool ARMBaseInstrInfo::DefinesPredicate(MachineInstr *MI,
static bool isCPSRDefined(const MachineInstr *MI) {
for (const auto &MO : MI->operands())
- if (MO.isReg() && MO.getReg() == ARM::CPSR && MO.isDef())
+ if (MO.isReg() && MO.getReg() == ARM::CPSR && MO.isDef() && !MO.isDead())
return true;
return false;
}
@@ -647,7 +647,7 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
unsigned ARMBaseInstrInfo::getInstBundleLength(const MachineInstr *MI) const {
unsigned Size = 0;
- MachineBasicBlock::const_instr_iterator I = MI;
+ MachineBasicBlock::const_instr_iterator I = MI->getIterator();
MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
while (++I != E && I->isInsideBundle()) {
assert(!I->isBundle() && "No nested bundle!");
@@ -853,11 +853,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachineFrameInfo &MFI = *MF.getFrameInfo();
unsigned Align = MFI.getObjectAlignment(FI);
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
- MachineMemOperand::MOStore,
- MFI.getObjectSize(FI),
- Align);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
+ MFI.getObjectSize(FI), Align);
switch (RC->getSize()) {
case 4:
@@ -1043,12 +1041,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = *MF.getFrameInfo();
unsigned Align = MFI.getObjectAlignment(FI);
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(
- MachinePointerInfo::getFixedStack(FI),
- MachineMemOperand::MOLoad,
- MFI.getObjectSize(FI),
- Align);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
+ MFI.getObjectSize(FI), Align);
switch (RC->getSize()) {
case 4:
@@ -1224,6 +1219,60 @@ unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
return MI->mayLoad() && hasLoadFromStackSlot(MI, Dummy, FrameIndex);
}
+/// \brief Expands MEMCPY to either LDMIA/STMIA or LDMIA_UPD/STMIA_UPD
+/// depending on whether the result is used.
+void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MBBI) const {
+ bool isThumb1 = Subtarget.isThumb1Only();
+ bool isThumb2 = Subtarget.isThumb2();
+ const ARMBaseInstrInfo *TII = Subtarget.getInstrInfo();
+
+ MachineInstr *MI = MBBI;
+ DebugLoc dl = MI->getDebugLoc();
+ MachineBasicBlock *BB = MI->getParent();
+
+ MachineInstrBuilder LDM, STM;
+ if (isThumb1 || !MI->getOperand(1).isDead()) {
+ LDM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA_UPD
+ : isThumb1 ? ARM::tLDMIA_UPD
+ : ARM::LDMIA_UPD))
+ .addOperand(MI->getOperand(1));
+ } else {
+ LDM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA : ARM::LDMIA));
+ }
+
+ if (isThumb1 || !MI->getOperand(0).isDead()) {
+ STM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA_UPD
+ : isThumb1 ? ARM::tSTMIA_UPD
+ : ARM::STMIA_UPD))
+ .addOperand(MI->getOperand(0));
+ } else {
+ STM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA : ARM::STMIA));
+ }
+
+ AddDefaultPred(LDM.addOperand(MI->getOperand(3)));
+ AddDefaultPred(STM.addOperand(MI->getOperand(2)));
+
+ // Sort the scratch registers into ascending order.
+ const TargetRegisterInfo &TRI = getRegisterInfo();
+ llvm::SmallVector<unsigned, 6> ScratchRegs;
+ for(unsigned I = 5; I < MI->getNumOperands(); ++I)
+ ScratchRegs.push_back(MI->getOperand(I).getReg());
+ std::sort(ScratchRegs.begin(), ScratchRegs.end(),
+ [&TRI](const unsigned &Reg1,
+ const unsigned &Reg2) -> bool {
+ return TRI.getEncodingValue(Reg1) <
+ TRI.getEncodingValue(Reg2);
+ });
+
+ for (const auto &Reg : ScratchRegs) {
+ LDM.addReg(Reg, RegState::Define);
+ STM.addReg(Reg, RegState::Kill);
+ }
+
+ BB->erase(MBBI);
+}
+
+
bool
ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
MachineFunction &MF = *MI->getParent()->getParent();
@@ -1237,6 +1286,11 @@ ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
return true;
}
+ if (MI->getOpcode() == ARM::MEMCPY) {
+ expandMEMCPY(MI);
+ return true;
+ }
+
// This hook gets to expand COPY instructions before they become
// copyPhysReg() calls. Look for VMOVS instructions that can legally be
// widened to VMOVD. We prefer the VMOVD when possible because it may be
@@ -1325,9 +1379,9 @@ static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) {
// instructions, so that's probably OK, but is PIC always correct when
// we get here?
if (ACPV->isGlobalValue())
- NewCPV = ARMConstantPoolConstant::
- Create(cast<ARMConstantPoolConstant>(ACPV)->getGV(), PCLabelId,
- ARMCP::CPValue, 4);
+ NewCPV = ARMConstantPoolConstant::Create(
+ cast<ARMConstantPoolConstant>(ACPV)->getGV(), PCLabelId, ARMCP::CPValue,
+ 4, ACPV->getModifier(), ACPV->mustAddCurrentAddress());
else if (ACPV->isExtSymbol())
NewCPV = ARMConstantPoolSymbol::
Create(MF.getFunction()->getContext(),
@@ -1645,16 +1699,14 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
bool ARMBaseInstrInfo::
isProfitableToIfCvt(MachineBasicBlock &MBB,
unsigned NumCycles, unsigned ExtraPredCycles,
- const BranchProbability &Probability) const {
+ BranchProbability Probability) const {
if (!NumCycles)
return false;
// If we are optimizing for size, see if the branch in the predecessor can be
// lowered to cbn?z by the constant island lowering pass, and return false if
// so. This results in a shorter instruction sequence.
- const Function *F = MBB.getParent()->getFunction();
- if (F->hasFnAttribute(Attribute::OptimizeForSize) ||
- F->hasFnAttribute(Attribute::MinSize)) {
+ if (MBB.getParent()->getFunction()->optForSize()) {
MachineBasicBlock *Pred = *MBB.pred_begin();
if (!Pred->empty()) {
MachineInstr *LastMI = &*Pred->rbegin();
@@ -1677,12 +1729,14 @@ isProfitableToIfCvt(MachineBasicBlock &MBB,
}
// Attempt to estimate the relative costs of predication versus branching.
- unsigned UnpredCost = Probability.getNumerator() * NumCycles;
- UnpredCost /= Probability.getDenominator();
- UnpredCost += 1; // The branch itself
- UnpredCost += Subtarget.getMispredictionPenalty() / 10;
-
- return (NumCycles + ExtraPredCycles) <= UnpredCost;
+ // Here we scale up each component of UnpredCost to avoid precision issues when
+ // scaling NumCycles by Probability.
+ const unsigned ScalingUpFactor = 1024;
+ unsigned UnpredCost = Probability.scale(NumCycles * ScalingUpFactor);
+ UnpredCost += ScalingUpFactor; // The branch itself
+ UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
+
+ return (NumCycles + ExtraPredCycles) * ScalingUpFactor <= UnpredCost;
}
bool ARMBaseInstrInfo::
@@ -1690,23 +1744,22 @@ isProfitableToIfCvt(MachineBasicBlock &TMBB,
unsigned TCycles, unsigned TExtra,
MachineBasicBlock &FMBB,
unsigned FCycles, unsigned FExtra,
- const BranchProbability &Probability) const {
+ BranchProbability Probability) const {
if (!TCycles || !FCycles)
return false;
// Attempt to estimate the relative costs of predication versus branching.
- unsigned TUnpredCost = Probability.getNumerator() * TCycles;
- TUnpredCost /= Probability.getDenominator();
-
- uint32_t Comp = Probability.getDenominator() - Probability.getNumerator();
- unsigned FUnpredCost = Comp * FCycles;
- FUnpredCost /= Probability.getDenominator();
-
+ // Here we scale up each component of UnpredCost to avoid precision issues when
+ // scaling TCycles/FCycles by Probability.
+ const unsigned ScalingUpFactor = 1024;
+ unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor);
+ unsigned FUnpredCost =
+ Probability.getCompl().scale(FCycles * ScalingUpFactor);
unsigned UnpredCost = TUnpredCost + FUnpredCost;
- UnpredCost += 1; // The branch itself
- UnpredCost += Subtarget.getMispredictionPenalty() / 10;
+ UnpredCost += 1 * ScalingUpFactor; // The branch itself
+ UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
- return (TCycles + FCycles + TExtra + FExtra) <= UnpredCost;
+ return (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor <= UnpredCost;
}
bool
@@ -1744,9 +1797,10 @@ unsigned llvm::getMatchingCondBranchOpcode(unsigned Opc) {
llvm_unreachable("Unknown unconditional branch opcode!");
}
-/// commuteInstruction - Handle commutable instructions.
-MachineInstr *
-ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
+MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr *MI,
+ bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const {
switch (MI->getOpcode()) {
case ARM::MOVCCr:
case ARM::t2MOVCCr: {
@@ -1756,7 +1810,7 @@ ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
// MOVCC AL can't be inverted. Shouldn't happen.
if (CC == ARMCC::AL || PredReg != ARM::CPSR)
return nullptr;
- MI = TargetInstrInfo::commuteInstruction(MI, NewMI);
+ MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
if (!MI)
return nullptr;
// After swapping the MOVCC operands, also invert the condition.
@@ -1765,7 +1819,7 @@ ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
return MI;
}
}
- return TargetInstrInfo::commuteInstruction(MI, NewMI);
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
}
/// Identify instructions that can be folded into a MOVCC instruction, and
@@ -1975,21 +2029,12 @@ void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
}
}
-static bool isAnySubRegLive(unsigned Reg, const TargetRegisterInfo *TRI,
- MachineInstr *MI) {
- for (MCSubRegIterator Subreg(Reg, TRI, /* IncludeSelf */ true);
- Subreg.isValid(); ++Subreg)
- if (MI->getParent()->computeRegisterLiveness(TRI, *Subreg, MI) !=
- MachineBasicBlock::LQR_Dead)
- return true;
- return false;
-}
bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
MachineFunction &MF, MachineInstr *MI,
unsigned NumBytes) {
// This optimisation potentially adds lots of load and store
// micro-operations, it's only really a great benefit to code-size.
- if (!MF.getFunction()->hasFnAttribute(Attribute::MinSize))
+ if (!MF.getFunction()->optForMinSize())
return false;
// If only one register is pushed/popped, LLVM can use an LDR/STR
@@ -2058,11 +2103,9 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
// registers live within the function we might clobber a return value
// register; the other way a register can be live here is if it's
// callee-saved.
- // TODO: Currently, computeRegisterLiveness() does not report "live" if a
- // sub reg is live. When computeRegisterLiveness() works for sub reg, it
- // can replace isAnySubRegLive().
if (isCalleeSavedRegister(CurReg, CSRegs) ||
- isAnySubRegLive(CurReg, TRI, MI)) {
+ MI->getParent()->computeRegisterLiveness(TRI, CurReg, MI) !=
+ MachineBasicBlock::LQR_Dead) {
// VFP pops don't allow holes in the register list, so any skip is fatal
// for our transformation. GPR pops do, so we should just keep looking.
if (IsVFPPushPop)
@@ -3381,7 +3424,7 @@ static const MachineInstr *getBundledDefMI(const TargetRegisterInfo *TRI,
assert(Idx != -1 && "Cannot find bundled definition!");
DefIdx = Idx;
- return II;
+ return &*II;
}
static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI,
@@ -3389,7 +3432,7 @@ static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI,
unsigned &UseIdx, unsigned &Dist) {
Dist = 0;
- MachineBasicBlock::const_instr_iterator II = MI; ++II;
+ MachineBasicBlock::const_instr_iterator II = ++MI->getIterator();
assert(II->isInsideBundle() && "Empty bundle?");
MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
@@ -3410,7 +3453,7 @@ static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI,
}
UseIdx = Idx;
- return II;
+ return &*II;
}
/// Return the number of cycles to add to (or subtract from) the static
@@ -3652,6 +3695,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
// instructions).
if (Latency > 0 && Subtarget.isThumb2()) {
const MachineFunction *MF = DefMI->getParent()->getParent();
+ // FIXME: Use Function::optForSize().
if (MF->getFunction()->hasFnAttribute(Attribute::OptimizeForSize))
--Latency;
}
@@ -3931,11 +3975,11 @@ unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
// other passes may query the latency of a bundled instruction.
if (MI->isBundle()) {
unsigned Latency = 0;
- MachineBasicBlock::const_instr_iterator I = MI;
+ MachineBasicBlock::const_instr_iterator I = MI->getIterator();
MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
while (++I != E && I->isInsideBundle()) {
if (I->getOpcode() != ARM::t2IT)
- Latency += getInstrLatency(ItinData, I, PredCost);
+ Latency += getInstrLatency(ItinData, &*I, PredCost);
}
return Latency;
}
@@ -4054,8 +4098,8 @@ void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI,
MIB = BuildMI(MBB, MI, DL, get(LoadOpc), Reg);
MIB.addReg(Reg, RegState::Kill).addImm(0);
unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
- MachineMemOperand *MMO = MBB.getParent()->
- getMachineMemOperand(MachinePointerInfo::getGOT(), Flag, 4, 4);
+ MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
+ MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 4, 4);
MIB.addMemOperand(MMO);
AddDefaultPred(MIB);
}
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index b4706e348933..d80c49494c77 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -86,6 +86,18 @@ protected:
RegSubRegPair &BaseReg,
RegSubRegPairAndIdx &InsertedReg) const override;
+ /// Commutes the operands in the given instruction.
+ /// The commutable operands are specified by their indices OpIdx1 and OpIdx2.
+ ///
+ /// Do not call this method for a non-commutable instruction or for
+ /// non-commutable pair of operand indices OpIdx1 and OpIdx2.
+ /// Even though the instruction is commutable, the method may still
+ /// fail to commute the operands; a null pointer is returned in such cases.
+ MachineInstr *commuteInstructionImpl(MachineInstr *MI,
+ bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const override;
+
public:
// Return whether the target has an explicit NOP encoding.
bool hasNOP() const;
@@ -188,9 +200,6 @@ public:
MachineInstr *duplicate(MachineInstr *Orig,
MachineFunction &MF) const override;
- MachineInstr *commuteInstruction(MachineInstr*,
- bool=false) const override;
-
const MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
unsigned SubIdx, unsigned State,
const TargetRegisterInfo *TRI) const;
@@ -224,15 +233,15 @@ public:
bool isProfitableToIfCvt(MachineBasicBlock &MBB,
unsigned NumCycles, unsigned ExtraPredCycles,
- const BranchProbability &Probability) const override;
+ BranchProbability Probability) const override;
bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT,
unsigned ExtraT, MachineBasicBlock &FMBB,
unsigned NumF, unsigned ExtraF,
- const BranchProbability &Probability) const override;
+ BranchProbability Probability) const override;
bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
- const BranchProbability &Probability) const override {
+ BranchProbability Probability) const override {
return NumCycles == 1;
}
@@ -343,6 +352,8 @@ private:
virtual void expandLoadStackGuard(MachineBasicBlock::iterator MI,
Reloc::Model RM) const = 0;
+ void expandMEMCPY(MachineBasicBlock::iterator) const;
+
private:
/// Modeling special VFP / NEON fp MLA / MLS hazards.
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index e7d5be7753e4..419717c85a79 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -225,7 +225,8 @@ ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg,
ArrayRef<MCPhysReg> Order,
SmallVectorImpl<MCPhysReg> &Hints,
const MachineFunction &MF,
- const VirtRegMap *VRM) const {
+ const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
std::pair<unsigned, unsigned> Hint = MRI.getRegAllocationHint(VirtReg);
@@ -338,7 +339,7 @@ bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const {
// 1. Dynamic stack realignment is explicitly disabled,
// 2. This is a Thumb1 function (it's not useful, so we don't bother), or
// 3. There are VLAs in the function and the base pointer is disabled.
- if (MF.getFunction()->hasFnAttribute("no-realign-stack"))
+ if (!TargetRegisterInfo::canRealignStack(MF))
return false;
if (AFI->isThumb1OnlyFunction())
return false;
@@ -356,18 +357,6 @@ bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const {
}
bool ARMBaseRegisterInfo::
-needsStackRealignment(const MachineFunction &MF) const {
- const MachineFrameInfo *MFI = MF.getFrameInfo();
- const ARMFrameLowering *TFI = getFrameLowering(MF);
- const Function *F = MF.getFunction();
- unsigned StackAlign = TFI->getStackAlignment();
- bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) ||
- F->hasFnAttribute(Attribute::StackAlignment));
-
- return requiresRealignment && canRealignStack(MF);
-}
-
-bool ARMBaseRegisterInfo::
cannotEliminateFrame(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
if (MF.getTarget().Options.DisableFramePointerElim(MF) && MFI->adjustsStack())
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index fdc1ef9432c8..cea8b80c7821 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -94,7 +94,7 @@ public:
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID) const override;
- const uint32_t *getNoPreservedMask() const;
+ const uint32_t *getNoPreservedMask() const override;
/// getThisReturnPreservedMask - Returns a call preserved mask specific to the
/// case that 'returned' is on an i32 first argument if the calling convention
@@ -126,15 +126,15 @@ public:
ArrayRef<MCPhysReg> Order,
SmallVectorImpl<MCPhysReg> &Hints,
const MachineFunction &MF,
- const VirtRegMap *VRM) const override;
+ const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const override;
void updateRegAllocHint(unsigned Reg, unsigned NewReg,
MachineFunction &MF) const override;
bool hasBasePointer(const MachineFunction &MF) const;
- bool canRealignStack(const MachineFunction &MF) const;
- bool needsStackRealignment(const MachineFunction &MF) const override;
+ bool canRealignStack(const MachineFunction &MF) const override;
int64_t getFrameIndexInstrOffset(const MachineInstr *MI,
int Idx) const override;
bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
diff --git a/lib/Target/ARM/ARMCallingConv.h b/lib/Target/ARM/ARMCallingConv.h
index d687568d7eb9..a731d00883a1 100644
--- a/lib/Target/ARM/ARMCallingConv.h
+++ b/lib/Target/ARM/ARMCallingConv.h
@@ -160,15 +160,15 @@ static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
State);
}
-static const uint16_t RRegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
+static const MCPhysReg RRegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
-static const uint16_t SRegList[] = { ARM::S0, ARM::S1, ARM::S2, ARM::S3,
- ARM::S4, ARM::S5, ARM::S6, ARM::S7,
- ARM::S8, ARM::S9, ARM::S10, ARM::S11,
- ARM::S12, ARM::S13, ARM::S14, ARM::S15 };
-static const uint16_t DRegList[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3,
- ARM::D4, ARM::D5, ARM::D6, ARM::D7 };
-static const uint16_t QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 };
+static const MCPhysReg SRegList[] = { ARM::S0, ARM::S1, ARM::S2, ARM::S3,
+ ARM::S4, ARM::S5, ARM::S6, ARM::S7,
+ ARM::S8, ARM::S9, ARM::S10, ARM::S11,
+ ARM::S12, ARM::S13, ARM::S14, ARM::S15 };
+static const MCPhysReg DRegList[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3,
+ ARM::D4, ARM::D5, ARM::D6, ARM::D7 };
+static const MCPhysReg QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 };
// Allocate part of an AAPCS HFA or HVA. We assume that each member of the HA
@@ -199,9 +199,11 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT,
// Try to allocate a contiguous block of registers, each of the correct
// size to hold one member.
- unsigned Align = std::min(PendingMembers[0].getExtraInfo(), 8U);
+ auto &DL = State.getMachineFunction().getDataLayout();
+ unsigned StackAlign = DL.getStackAlignment();
+ unsigned Align = std::min(PendingMembers[0].getExtraInfo(), StackAlign);
- ArrayRef<uint16_t> RegList;
+ ArrayRef<MCPhysReg> RegList;
switch (LocVT.SimpleTy) {
case MVT::i32: {
RegList = RRegList;
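
The CC_ARM_AAPCS_Custom_Aggregate hunk above stops hard-coding 8 as the upper bound on a homogeneous aggregate member's alignment and instead caps it at the stack alignment reported by the DataLayout. A standalone sketch of that capping with made-up alignment values; this only illustrates the arithmetic, not the ARM calling-convention code itself:

#include <algorithm>
#include <cassert>

// Cap a homogeneous aggregate member's alignment at the stack alignment.
unsigned cappedAlign(unsigned MemberAlign, unsigned StackAlign) {
  return std::min(MemberAlign, StackAlign);
}

int main() {
  assert(cappedAlign(16, 8) == 8);   // AAPCS targets with 8-byte stack alignment
  assert(cappedAlign(16, 16) == 16); // targets whose DataLayout reports 16 bytes
  assert(cappedAlign(4, 8) == 4);    // small members keep their own alignment
  return 0;
}
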
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
index 27cf06b995a0..233516415149 100644
--- a/lib/Target/ARM/ARMCallingConv.td
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -125,6 +125,8 @@ def CC_ARM_AAPCS_Common : CallingConv<[
CCIfType<[i32], CCAssignToStackWithShadow<4, 4, [R0, R1, R2, R3]>>,
CCIfType<[f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>,
CCIfType<[f64], CCAssignToStackWithShadow<8, 8, [Q0, Q1, Q2, Q3]>>,
+ CCIfType<[v2f64], CCIfAlign<"16",
+ CCAssignToStackWithShadow<16, 16, [Q0, Q1, Q2, Q3]>>>,
CCIfType<[v2f64], CCAssignToStackWithShadow<16, 8, [Q0, Q1, Q2, Q3]>>
]>;
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index f4ec8c67c977..e89757c19ecc 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -342,7 +342,7 @@ void ARMConstantIslands::verify() {
#ifndef NDEBUG
for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
MBBI != E; ++MBBI) {
- MachineBasicBlock *MBB = MBBI;
+ MachineBasicBlock *MBB = &*MBBI;
unsigned MBBId = MBB->getNumber();
assert(!MBBId || BBInfo[MBBId - 1].postOffset() <= BBInfo[MBBId].Offset);
}
@@ -542,7 +542,7 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs)
// identity mapping of CPI's to CPE's.
const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants();
- const DataLayout &TD = *MF->getTarget().getDataLayout();
+ const DataLayout &TD = MF->getDataLayout();
for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
assert(Size >= 4 && "Too small constant pool entry");
@@ -589,6 +589,8 @@ void ARMConstantIslands::doInitialJumpTablePlacement(
MachineBasicBlock *LastCorrectlyNumberedBB = nullptr;
for (MachineBasicBlock &MBB : *MF) {
auto MI = MBB.getLastNonDebugInstr();
+ if (MI == MBB.end())
+ continue;
unsigned JTOpcode;
switch (MI->getOpcode()) {
@@ -639,12 +641,12 @@ void ARMConstantIslands::doInitialJumpTablePlacement(
/// into the block immediately after it.
bool ARMConstantIslands::BBHasFallthrough(MachineBasicBlock *MBB) {
// Get the next machine basic block in the function.
- MachineFunction::iterator MBBI = MBB;
+ MachineFunction::iterator MBBI = MBB->getIterator();
// Can't fall off end of function.
if (std::next(MBBI) == MBB->getParent()->end())
return false;
- MachineBasicBlock *NextBB = std::next(MBBI);
+ MachineBasicBlock *NextBB = &*std::next(MBBI);
if (std::find(MBB->succ_begin(), MBB->succ_end(), NextBB) == MBB->succ_end())
return false;
@@ -722,15 +724,15 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
// has any inline assembly in it. If so, we have to be conservative about
// alignment assumptions, as we don't know for sure the size of any
// instructions in the inline assembly.
- for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I)
- computeBlockSize(I);
+ for (MachineBasicBlock &MBB : *MF)
+ computeBlockSize(&MBB);
// The known bits of the entry block offset are determined by the function
// alignment.
BBInfo.front().KnownBits = MF->getAlignment();
// Compute block offsets and known bits.
- adjustBBOffsetsAfter(MF->begin());
+ adjustBBOffsetsAfter(&MF->front());
// Now go back through the instructions and build up our data structures.
for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
@@ -968,7 +970,7 @@ MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) {
// Create a new MBB for the code after the OrigBB.
MachineBasicBlock *NewBB =
MF->CreateMachineBasicBlock(OrigBB->getBasicBlock());
- MachineFunction::iterator MBBI = OrigBB; ++MBBI;
+ MachineFunction::iterator MBBI = ++OrigBB->getIterator();
MF->insert(MBBI, NewBB);
// Splice the instructions starting with MI over to NewBB.
@@ -1088,7 +1090,7 @@ bool ARMConstantIslands::isWaterInRange(unsigned UserOffset,
unsigned CPELogAlign = getCPELogAlign(U.CPEMI);
unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign);
unsigned NextBlockOffset, NextBlockAlignment;
- MachineFunction::const_iterator NextBlock = Water;
+ MachineFunction::const_iterator NextBlock = Water->getIterator();
if (++NextBlock == MF->end()) {
NextBlockOffset = BBInfo[Water->getNumber()].postOffset();
NextBlockAlignment = 0;
@@ -1350,7 +1352,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
if (isOffsetInRange(UserOffset, CPEOffset, U)) {
DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber()
<< format(", expected CPE offset %#x\n", CPEOffset));
- NewMBB = std::next(MachineFunction::iterator(UserMBB));
+ NewMBB = &*++UserMBB->getIterator();
// Add an unconditional branch from UserMBB to fallthrough block. Record
// it for branch lengthening; this new branch will not get out of range,
// but if the preceding conditional branch is out of range, the targets
@@ -1503,8 +1505,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
NewWaterList.insert(NewIsland);
// The new CPE goes before the following block (NewMBB).
- NewMBB = std::next(MachineFunction::iterator(WaterBB));
-
+ NewMBB = &*++WaterBB->getIterator();
} else {
// No water found.
DEBUG(dbgs() << "No water found\n");
@@ -1515,7 +1516,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
// next iteration for constant pools, but in this context, we don't want
// it. Check for this so it will be removed from the WaterList.
// Also remove any entry from NewWaterList.
- MachineBasicBlock *WaterBB = std::prev(MachineFunction::iterator(NewMBB));
+ MachineBasicBlock *WaterBB = &*--NewMBB->getIterator();
IP = std::find(WaterList.begin(), WaterList.end(), WaterBB);
if (IP != WaterList.end())
NewWaterList.erase(WaterBB);
@@ -1532,7 +1533,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
WaterList.erase(IP);
// Okay, we know we can put an island before NewMBB now, do it!
- MF->insert(NewMBB, NewIsland);
+ MF->insert(NewMBB->getIterator(), NewIsland);
// Update internal data structures to account for the newly inserted MBB.
updateForInsertedWaterBlock(NewIsland);
@@ -1553,7 +1554,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
// Increase the size of the island block to account for the new entry.
BBInfo[NewIsland->getNumber()].Size += Size;
- adjustBBOffsetsAfter(std::prev(MachineFunction::iterator(NewIsland)));
+ adjustBBOffsetsAfter(&*--NewIsland->getIterator());
// Finally, change the CPI in the instruction operand to be ID.
for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i)
@@ -1732,7 +1733,7 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
MBB->back().eraseFromParent();
// BBInfo[SplitBB].Offset is wrong temporarily, fixed below
}
- MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB));
+ MachineBasicBlock *NextBB = &*++MBB->getIterator();
DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber()
<< " also invert condition and change dest. to BB#"
@@ -2058,9 +2059,9 @@ bool ARMConstantIslands::preserveBaseRegister(MachineInstr *JumpMI,
/// \brief Returns whether CPEMI is the first instruction in the block
/// immediately following JTMI (assumed to be a TBB or TBH terminator). If so,
/// we can switch the first register to PC and usually remove the address
-/// calculation that preceeded it.
+/// calculation that preceded it.
static bool jumpTableFollowsTB(MachineInstr *JTMI, MachineInstr *CPEMI) {
- MachineFunction::iterator MBB = JTMI->getParent();
+ MachineFunction::iterator MBB = JTMI->getParent()->getIterator();
MachineFunction *MF = MBB->getParent();
++MBB;
@@ -2235,7 +2236,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
SmallVector<MachineOperand, 4> CondPrior;
- MachineFunction::iterator BBi = BB;
+ MachineFunction::iterator BBi = BB->getIterator();
MachineFunction::iterator OldPrior = std::prev(BBi);
// If the block terminator isn't analyzable, don't try to move the block
@@ -2258,7 +2259,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
// Create a new MBB for the code after the jump BB.
MachineBasicBlock *NewBB =
MF->CreateMachineBasicBlock(JTBB->getBasicBlock());
- MachineFunction::iterator MBBI = JTBB; ++MBBI;
+ MachineFunction::iterator MBBI = ++JTBB->getIterator();
MF->insert(MBBI, NewBB);
// Add an unconditional branch from NewBB to BB.
@@ -2273,8 +2274,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
// Update the CFG.
NewBB->addSuccessor(BB);
- JTBB->removeSuccessor(BB);
- JTBB->addSuccessor(NewBB);
+ JTBB->replaceSuccessor(BB, NewBB);
++NumJTInserted;
return NewBB;
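
One behavioural fix in the ARMConstantIslandPass.cpp section above is the new guard in doInitialJumpTablePlacement(): getLastNonDebugInstr() returns end() for a block with no non-debug instructions, so it must be compared against end() before being dereferenced. The guard in isolation, as a hypothetical helper built from the same calls:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;

// Safely ask whether the last real (non-debug) instruction of MBB is a return.
static bool lastInstrIsReturnSketch(MachineBasicBlock &MBB) {
  MachineBasicBlock::iterator MI = MBB.getLastNonDebugInstr();
  if (MI == MBB.end()) // empty or debug-only block: nothing to inspect
    return false;
  return MI->isReturn();
}
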
diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp
index 7d41c69f08b8..c9849b2605ea 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.cpp
+++ b/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -52,8 +52,7 @@ const char *ARMConstantPoolValue::getModifierText() const {
// strings if that's legal.
case ARMCP::no_modifier: return "none";
case ARMCP::TLSGD: return "tlsgd";
- case ARMCP::GOT: return "GOT";
- case ARMCP::GOTOFF: return "GOTOFF";
+ case ARMCP::GOT_PREL: return "GOT_PREL";
case ARMCP::GOTTPOFF: return "gottpoff";
case ARMCP::TPOFF: return "tpoff";
}
diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h
index 36f63e239a9e..6b18a4e52878 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.h
+++ b/lib/Target/ARM/ARMConstantPoolValue.h
@@ -39,8 +39,7 @@ namespace ARMCP {
enum ARMCPModifier {
no_modifier,
TLSGD,
- GOT,
- GOTOFF,
+ GOT_PREL,
GOTTPOFF,
TPOFF
};
@@ -103,8 +102,6 @@ public:
bool isLSDA() const { return Kind == ARMCP::CPLSDA; }
bool isMachineBasicBlock() const{ return Kind == ARMCP::CPMachineBasicBlock; }
- unsigned getRelocationInfo() const override { return 2; }
-
int getExistingMachineCPValue(MachineConstantPool *CP,
unsigned Alignment) override;
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 4438f50758dc..56f3498e1204 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -330,22 +330,19 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
/// LookupNEONLdSt - Search the NEONLdStTable for information about a NEON
/// load or store pseudo instruction.
static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) {
- const unsigned NumEntries = array_lengthof(NEONLdStTable);
-
#ifndef NDEBUG
// Make sure the table is sorted.
static bool TableChecked = false;
if (!TableChecked) {
- for (unsigned i = 0; i != NumEntries-1; ++i)
- assert(NEONLdStTable[i] < NEONLdStTable[i+1] &&
- "NEONLdStTable is not sorted!");
+ assert(std::is_sorted(std::begin(NEONLdStTable), std::end(NEONLdStTable)) &&
+ "NEONLdStTable is not sorted!");
TableChecked = true;
}
#endif
- const NEONLdStTableEntry *I =
- std::lower_bound(NEONLdStTable, NEONLdStTable + NumEntries, Opcode);
- if (I != NEONLdStTable + NumEntries && I->PseudoOpc == Opcode)
+ auto I = std::lower_bound(std::begin(NEONLdStTable),
+ std::end(NEONLdStTable), Opcode);
+ if (I != std::end(NEONLdStTable) && I->PseudoOpc == Opcode)
return I;
return nullptr;
}
@@ -734,7 +731,7 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
HI16.addImm(Pred).addReg(PredReg);
if (RequiresBundling)
- finalizeBundle(MBB, &*LO16, &*MBBI);
+ finalizeBundle(MBB, LO16->getIterator(), MBBI->getIterator());
TransferImpOps(MI, LO16, HI16);
MI.eraseFromParent();
@@ -747,6 +744,55 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
switch (Opcode) {
default:
return false;
+
+ case ARM::TCRETURNdi:
+ case ARM::TCRETURNri: {
+ MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+ assert(MBBI->isReturn() &&
+ "Can only insert epilog into returning blocks");
+ unsigned RetOpcode = MBBI->getOpcode();
+ DebugLoc dl = MBBI->getDebugLoc();
+ const ARMBaseInstrInfo &TII = *static_cast<const ARMBaseInstrInfo *>(
+ MBB.getParent()->getSubtarget().getInstrInfo());
+
+ // Tail call return: adjust the stack pointer and jump to callee.
+ MBBI = MBB.getLastNonDebugInstr();
+ MachineOperand &JumpTarget = MBBI->getOperand(0);
+
+ // Jump to label or value in register.
+ if (RetOpcode == ARM::TCRETURNdi) {
+ unsigned TCOpcode =
+ STI->isThumb()
+ ? (STI->isTargetMachO() ? ARM::tTAILJMPd : ARM::tTAILJMPdND)
+ : ARM::TAILJMPd;
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode));
+ if (JumpTarget.isGlobal())
+ MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
+ JumpTarget.getTargetFlags());
+ else {
+ assert(JumpTarget.isSymbol());
+ MIB.addExternalSymbol(JumpTarget.getSymbolName(),
+ JumpTarget.getTargetFlags());
+ }
+
+ // Add the default predicate in Thumb mode.
+ if (STI->isThumb())
+ MIB.addImm(ARMCC::AL).addReg(0);
+ } else if (RetOpcode == ARM::TCRETURNri) {
+ BuildMI(MBB, MBBI, dl,
+ TII.get(STI->isThumb() ? ARM::tTAILJMPr : ARM::TAILJMPr))
+ .addReg(JumpTarget.getReg(), RegState::Kill);
+ }
+
+ MachineInstr *NewMI = std::prev(MBBI);
+ for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i)
+ NewMI->addOperand(MBBI->getOperand(i));
+
+ // Delete the pseudo instruction TCRETURN.
+ MBB.erase(MBBI);
+ MBBI = NewMI;
+ return true;
+ }
case ARM::VMOVScc:
case ARM::VMOVDcc: {
unsigned newOpc = Opcode == ARM::VMOVScc ? ARM::VMOVS : ARM::VMOVD;
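
The LookupNEONLdSt() hunk above replaces a hand-rolled sortedness check and pointer-arithmetic lower_bound with std::is_sorted and std::begin/std::end over the static table. The same idiom in a standalone, runnable form; the Entry type and its fields are invented for the example, only the lookup pattern mirrors the hunk:

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <iterator>

struct Entry {
  unsigned PseudoOpc; // key the table is sorted by
  unsigned RealOpc;
  bool operator<(const Entry &RHS) const { return PseudoOpc < RHS.PseudoOpc; }
  bool operator<(unsigned Opc) const { return PseudoOpc < Opc; }
};

static const Entry Table[] = {{10, 100}, {20, 200}, {30, 300}};

const Entry *lookup(unsigned Opcode) {
  // Check sortedness once (the real code does this under #ifndef NDEBUG).
  assert(std::is_sorted(std::begin(Table), std::end(Table)) &&
         "table is not sorted");
  auto I = std::lower_bound(std::begin(Table), std::end(Table), Opcode);
  if (I != std::end(Table) && I->PseudoOpc == Opcode)
    return I;
  return nullptr;
}

int main() {
  std::printf("%u\n", lookup(20)->RealOpc);   // prints 200
  std::printf("%d\n", lookup(25) == nullptr); // prints 1 (not found)
}
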
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index fdd0763ea608..9bdf823c85bd 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -922,12 +922,9 @@ void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr,
if (Addr.BaseType == Address::FrameIndexBase) {
int FI = Addr.Base.FI;
int Offset = Addr.Offset;
- MachineMemOperand *MMO =
- FuncInfo.MF->getMachineMemOperand(
- MachinePointerInfo::getFixedStack(FI, Offset),
- Flags,
- MFI.getObjectSize(FI),
- MFI.getObjectAlignment(FI));
+ MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*FuncInfo.MF, FI, Offset), Flags,
+ MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
// Now add the rest of the operands.
MIB.addFrameIndex(FI);
@@ -1278,8 +1275,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc))
.addMBB(TBB).addImm(ARMPred).addReg(ARM::CPSR);
- fastEmitBranch(FBB, DbgLoc);
- FuncInfo.MBB->addSuccessor(TBB);
+ finishCondBranch(BI->getParent(), TBB, FBB);
return true;
}
} else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
@@ -1303,8 +1299,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc))
.addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR);
- fastEmitBranch(FBB, DbgLoc);
- FuncInfo.MBB->addSuccessor(TBB);
+ finishCondBranch(BI->getParent(), TBB, FBB);
return true;
}
} else if (const ConstantInt *CI =
@@ -1341,8 +1336,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc))
.addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR);
- fastEmitBranch(FBB, DbgLoc);
- FuncInfo.MBB->addSuccessor(TBB);
+ finishCondBranch(BI->getParent(), TBB, FBB);
return true;
}
@@ -1355,8 +1349,8 @@ bool ARMFastISel::SelectIndirectBr(const Instruction *I) {
TII.get(Opc)).addReg(AddrReg));
const IndirectBrInst *IB = cast<IndirectBrInst>(I);
- for (unsigned i = 0, e = IB->getNumSuccessors(); i != e; ++i)
- FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[IB->getSuccessor(i)]);
+ for (const BasicBlock *SuccBB : IB->successors())
+ FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[SuccBB]);
return true;
}
@@ -1860,8 +1854,9 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC,
return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
else
return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
- } else
- return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
+ } else {
+ return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
+ }
case CallingConv::ARM_AAPCS_VFP:
if (!isVarArg)
return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
@@ -2944,48 +2939,51 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
unsigned Align, MVT VT) {
- bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
- ARMConstantPoolConstant *CPV =
- ARMConstantPoolConstant::Create(GV, UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT);
- unsigned Idx = MCP.getConstantPoolIndex(CPV, Align);
+ bool UseGOT_PREL =
+ !(GV->hasHiddenVisibility() || GV->hasLocalLinkage());
+
+ LLVMContext *Context = &MF->getFunction()->getContext();
+ unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+ unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
+ ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
+ GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj,
+ UseGOT_PREL ? ARMCP::GOT_PREL : ARMCP::no_modifier,
+ /*AddCurrentAddress=*/UseGOT_PREL);
+
+ unsigned ConstAlign =
+ MF->getDataLayout().getPrefTypeAlignment(Type::getInt32PtrTy(*Context));
+ unsigned Idx = MF->getConstantPool()->getConstantPoolIndex(CPV, ConstAlign);
+
+ unsigned TempReg = MF->getRegInfo().createVirtualRegister(&ARM::rGPRRegClass);
+ unsigned Opc = isThumb2 ? ARM::t2LDRpci : ARM::LDRcp;
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), TempReg)
+ .addConstantPoolIndex(Idx);
+ if (Opc == ARM::LDRcp)
+ MIB.addImm(0);
+ AddDefaultPred(MIB);
- unsigned Opc;
- unsigned DestReg1 = createResultReg(TLI.getRegClassFor(VT));
- // Load value.
- if (isThumb2) {
- DestReg1 = constrainOperandRegClass(TII.get(ARM::t2LDRpci), DestReg1, 0);
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(ARM::t2LDRpci), DestReg1)
- .addConstantPoolIndex(Idx));
- Opc = UseGOTOFF ? ARM::t2ADDrr : ARM::t2LDRs;
- } else {
- // The extra immediate is for addrmode2.
- DestReg1 = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg1, 0);
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
- DbgLoc, TII.get(ARM::LDRcp), DestReg1)
- .addConstantPoolIndex(Idx).addImm(0));
- Opc = UseGOTOFF ? ARM::ADDrr : ARM::LDRrs;
- }
+ // Fix the address by adding pc.
+ unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+ Opc = Subtarget->isThumb() ? ARM::tPICADD : UseGOT_PREL ? ARM::PICLDR
+ : ARM::PICADD;
+ DestReg = constrainOperandRegClass(TII.get(Opc), DestReg, 0);
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
+ .addReg(TempReg)
+ .addImm(ARMPCLabelIndex);
+ if (!Subtarget->isThumb())
+ AddDefaultPred(MIB);
- unsigned GlobalBaseReg = AFI->getGlobalBaseReg();
- if (GlobalBaseReg == 0) {
- GlobalBaseReg = MRI.createVirtualRegister(TLI.getRegClassFor(VT));
- AFI->setGlobalBaseReg(GlobalBaseReg);
+ if (UseGOT_PREL && Subtarget->isThumb()) {
+ unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(ARM::t2LDRi12), NewDestReg)
+ .addReg(DestReg)
+ .addImm(0);
+ DestReg = NewDestReg;
+ AddOptionalDefs(MIB);
}
-
- unsigned DestReg2 = createResultReg(TLI.getRegClassFor(VT));
- DestReg2 = constrainOperandRegClass(TII.get(Opc), DestReg2, 0);
- DestReg1 = constrainOperandRegClass(TII.get(Opc), DestReg1, 1);
- GlobalBaseReg = constrainOperandRegClass(TII.get(Opc), GlobalBaseReg, 2);
- MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
- DbgLoc, TII.get(Opc), DestReg2)
- .addReg(DestReg1)
- .addReg(GlobalBaseReg);
- if (!UseGOTOFF)
- MIB.addImm(0);
- AddOptionalDefs(MIB);
-
- return DestReg2;
+ return DestReg;
}
bool ARMFastISel::fastLowerArguments() {
@@ -3038,7 +3036,7 @@ bool ARMFastISel::fastLowerArguments() {
}
- static const uint16_t GPRArgRegs[] = {
+ static const MCPhysReg GPRArgRegs[] = {
ARM::R0, ARM::R1, ARM::R2, ARM::R3
};
@@ -3055,7 +3053,7 @@ bool ARMFastISel::fastLowerArguments() {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY),
ResultReg).addReg(DstReg, getKillRegState(true));
- updateValueMap(I, ResultReg);
+ updateValueMap(&*I, ResultReg);
}
return true;
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index 6744000afe2b..c5990bb7d1fb 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCContext.h"
@@ -58,7 +59,7 @@ bool ARMFrameLowering::hasFP(const MachineFunction &MF) const {
const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
// iOS requires FP not to be clobbered for backtracing purpose.
- if (STI.isTargetIOS())
+ if (STI.isTargetIOS() || STI.isTargetWatchOS())
return true;
const MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -288,7 +289,6 @@ static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI,
void ARMFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- assert(&MBB == &MF.front() && "Shrink-wrapping not yet implemented");
MachineBasicBlock::iterator MBBI = MBB.begin();
MachineFrameInfo *MFI = MF.getFrameInfo();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
@@ -305,7 +305,11 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
unsigned NumBytes = MFI->getStackSize();
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
- DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+ // Debug location must be unknown since the first debug location is used
+ // to determine the end of the prologue.
+ DebugLoc dl;
+
unsigned FramePtr = RegInfo->getFrameRegister(MF);
// Determine the sizes of each callee-save spill areas and record which frame
@@ -489,7 +493,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
if (NumBytes) {
// Adjust SP after all the callee-save spills.
- if (tryFoldSPUpdateIntoPushPop(STI, MF, LastPush, NumBytes))
+ if (AFI->getNumAlignedDPRCS2Regs() == 0 &&
+ tryFoldSPUpdateIntoPushPop(STI, MF, LastPush, NumBytes))
DefCFAOffsetCandidates.addExtraBytes(LastPush, NumBytes);
else {
emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes,
@@ -689,60 +694,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
AFI->setShouldRestoreSPFromFP(true);
}
-// Resolve TCReturn pseudo-instruction
-void ARMFrameLowering::fixTCReturn(MachineFunction &MF,
- MachineBasicBlock &MBB) const {
- MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
- assert(MBBI->isReturn() && "Can only insert epilog into returning blocks");
- unsigned RetOpcode = MBBI->getOpcode();
- DebugLoc dl = MBBI->getDebugLoc();
- const ARMBaseInstrInfo &TII =
- *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
-
- if (!(RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNri))
- return;
-
- // Tail call return: adjust the stack pointer and jump to callee.
- MBBI = MBB.getLastNonDebugInstr();
- MachineOperand &JumpTarget = MBBI->getOperand(0);
-
- // Jump to label or value in register.
- if (RetOpcode == ARM::TCRETURNdi) {
- unsigned TCOpcode = STI.isThumb() ?
- (STI.isTargetMachO() ? ARM::tTAILJMPd : ARM::tTAILJMPdND) :
- ARM::TAILJMPd;
- MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode));
- if (JumpTarget.isGlobal())
- MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
- JumpTarget.getTargetFlags());
- else {
- assert(JumpTarget.isSymbol());
- MIB.addExternalSymbol(JumpTarget.getSymbolName(),
- JumpTarget.getTargetFlags());
- }
-
- // Add the default predicate in Thumb mode.
- if (STI.isThumb()) MIB.addImm(ARMCC::AL).addReg(0);
- } else if (RetOpcode == ARM::TCRETURNri) {
- BuildMI(MBB, MBBI, dl,
- TII.get(STI.isThumb() ? ARM::tTAILJMPr : ARM::TAILJMPr)).
- addReg(JumpTarget.getReg(), RegState::Kill);
- }
-
- MachineInstr *NewMI = std::prev(MBBI);
- for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i)
- NewMI->addOperand(MBBI->getOperand(i));
-
- // Delete the pseudo instruction TCRETURN.
- MBB.erase(MBBI);
- MBBI = NewMI;
-}
-
void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
- assert(MBBI->isReturn() && "Can only insert epilog into returning blocks");
- DebugLoc dl = MBBI->getDebugLoc();
MachineFrameInfo *MFI = MF.getFrameInfo();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
@@ -758,10 +711,12 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
// All calls are tail calls in GHC calling conv, and functions have no
// prologue/epilogue.
- if (MF.getFunction()->getCallingConv() == CallingConv::GHC) {
- fixTCReturn(MF, MBB);
+ if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
return;
- }
+
+ // First, position ourselves at the first (from the top) terminator instruction.
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+ DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
if (!AFI->hasStackFrame()) {
if (NumBytes - ArgRegsSaveSize != 0)
@@ -840,8 +795,6 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
if (AFI->getGPRCalleeSavedArea1Size()) MBBI++;
}
- fixTCReturn(MF, MBB);
-
if (ArgRegsSaveSize)
emitSPUpdate(isARM, MBB, MBBI, dl, TII, ArgRegsSaveSize);
}
@@ -932,12 +885,6 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
return Offset;
}
-int ARMFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
- int FI) const {
- unsigned FrameReg;
- return getFrameIndexReference(MF, FI, FrameReg);
-}
-
void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI,
@@ -950,7 +897,6 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
DebugLoc DL;
- if (MI != MBB.end()) DL = MI->getDebugLoc();
SmallVector<std::pair<unsigned,bool>, 4> Regs;
unsigned i = CSI.size();
@@ -1008,7 +954,8 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
// Put any subsequent vpush instructions before this one: they will refer to
// higher register numbers so need to be pushed first in order to preserve
// monotonicity.
- --MI;
+ if (MI != MBB.begin())
+ --MI;
}
}
@@ -1022,12 +969,20 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- DebugLoc DL = MI->getDebugLoc();
- unsigned RetOpcode = MI->getOpcode();
- bool isTailCall = (RetOpcode == ARM::TCRETURNdi ||
- RetOpcode == ARM::TCRETURNri);
- bool isInterrupt =
- RetOpcode == ARM::SUBS_PC_LR || RetOpcode == ARM::t2SUBS_PC_LR;
+ DebugLoc DL;
+ bool isTailCall = false;
+ bool isInterrupt = false;
+ bool isTrap = false;
+ if (MBB.end() != MI) {
+ DL = MI->getDebugLoc();
+ unsigned RetOpcode = MI->getOpcode();
+ isTailCall = (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNri);
+ isInterrupt =
+ RetOpcode == ARM::SUBS_PC_LR || RetOpcode == ARM::t2SUBS_PC_LR;
+ isTrap =
+ RetOpcode == ARM::TRAP || RetOpcode == ARM::TRAPNaCl ||
+ RetOpcode == ARM::tTRAP;
+ }
SmallVector<unsigned, 4> Regs;
unsigned i = CSI.size();
@@ -1043,11 +998,14 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
continue;
if (Reg == ARM::LR && !isTailCall && !isVarArg && !isInterrupt &&
- STI.hasV5TOps()) {
- Reg = ARM::PC;
- LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET;
+ !isTrap && STI.hasV5TOps()) {
+ if (MBB.succ_empty()) {
+ Reg = ARM::PC;
+ DeleteRet = true;
+ LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET;
+ } else
+ LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD;
// Fold the return instruction into the LDM.
- DeleteRet = true;
}
// If NoGap is true, pop consecutive registers and then leave the rest
@@ -1068,7 +1026,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
.addReg(ARM::SP));
for (unsigned i = 0, e = Regs.size(); i < e; ++i)
MIB.addReg(Regs[i], getDefRegState(true));
- if (DeleteRet) {
+ if (DeleteRet && MI != MBB.end()) {
MIB.copyImplicitOps(&*MI);
MI->eraseFromParent();
}
@@ -1095,7 +1053,8 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
// Put any subsequent vpop instructions after this one: they will refer to
// higher register numbers so need to be popped afterwards.
- ++MI;
+ if (MI != MBB.end())
+ ++MI;
}
}
@@ -1109,7 +1068,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
const TargetRegisterInfo *TRI) {
MachineFunction &MF = *MBB.getParent();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
MachineFrameInfo &MFI = *MF.getFrameInfo();
@@ -1118,7 +1077,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
// slot offsets can be wrong. The offset for d8 will always be correct.
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
unsigned DNum = CSI[i].getReg() - ARM::D8;
- if (DNum >= 8)
+ if (DNum > NumAlignedDPRCS2Regs - 1)
continue;
int FI = CSI[i].getFrameIdx();
// The even-numbered registers will be 16-byte aligned, the odd-numbered
@@ -1269,7 +1228,7 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB,
const TargetRegisterInfo *TRI) {
MachineFunction &MF = *MBB.getParent();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
// Find the frame index assigned to d8.
@@ -1654,13 +1613,11 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
// FIXME: We could add logic to be more precise about negative offsets
// and which instructions will need a scratch register for them. Is it
// worth the effort and added fragility?
- bool BigStack =
- (RS &&
- (MFI->estimateStackSize(MF) +
- ((hasFP(MF) && AFI->hasStackFrame()) ? 4:0) >=
- estimateRSStackSizeLimit(MF, this)))
- || MFI->hasVarSizedObjects()
- || (MFI->adjustsStack() && !canSimplifyCallFramePseudos(MF));
+ bool BigStack = (RS && (MFI->estimateStackSize(MF) +
+ ((hasFP(MF) && AFI->hasStackFrame()) ? 4 : 0) >=
+ estimateRSStackSizeLimit(MF, this))) ||
+ MFI->hasVarSizedObjects() ||
+ (MFI->adjustsStack() && !canSimplifyCallFramePseudos(MF));
bool ExtraCSSpill = false;
if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) {
@@ -1698,8 +1655,10 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (CS1Spilled && !UnspilledCS1GPRs.empty()) {
for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) {
unsigned Reg = UnspilledCS1GPRs[i];
- // Don't spill high register if the function is thumb
+ // Don't spill high register if the function is thumb. In the case of
+ // Windows on ARM, accept R11 (frame pointer)
if (!AFI->isThumbFunction() ||
+ (STI.isTargetWindows() && Reg == ARM::R11) ||
isARMLowRegister(Reg) || Reg == ARM::LR) {
SavedRegs.set(Reg);
if (!MRI.isReserved(Reg))
@@ -1784,8 +1743,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
// alignment boundary.
- unsigned Align = getStackAlignment();
- Amount = (Amount+Align-1)/Align*Align;
+ Amount = alignSPAdjust(Amount);
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
assert(!AFI->isThumb1OnlyFunction() &&
@@ -1885,7 +1843,6 @@ void ARMFrameLowering::adjustForSegmentedStacks(
if (!ST->isTargetAndroid() && !ST->isTargetLinux())
report_fatal_error("Segmented stacks not supported on this platform.");
- assert(&PrologueMBB == &MF.front() && "Shrink-wrapping not yet implemented");
MachineFrameInfo *MFI = MF.getFrameInfo();
MachineModuleInfo &MMI = MF.getMMI();
MCContext &Context = MMI.getContext();
@@ -1913,21 +1870,48 @@ void ARMFrameLowering::adjustForSegmentedStacks(
MachineBasicBlock *GetMBB = MF.CreateMachineBasicBlock();
MachineBasicBlock *McrMBB = MF.CreateMachineBasicBlock();
- for (MachineBasicBlock::livein_iterator i = PrologueMBB.livein_begin(),
- e = PrologueMBB.livein_end();
- i != e; ++i) {
- AllocMBB->addLiveIn(*i);
- GetMBB->addLiveIn(*i);
- McrMBB->addLiveIn(*i);
- PrevStackMBB->addLiveIn(*i);
- PostStackMBB->addLiveIn(*i);
+ // Grab every block that reaches PrologueMBB so we can update its liveness as well.
+ SmallPtrSet<MachineBasicBlock *, 8> BeforePrologueRegion;
+ SmallVector<MachineBasicBlock *, 2> WalkList;
+ WalkList.push_back(&PrologueMBB);
+
+ do {
+ MachineBasicBlock *CurMBB = WalkList.pop_back_val();
+ for (MachineBasicBlock *PredBB : CurMBB->predecessors()) {
+ if (BeforePrologueRegion.insert(PredBB).second)
+ WalkList.push_back(PredBB);
+ }
+ } while (!WalkList.empty());
+
+ // The order in that list is important.
+ // The blocks will all be inserted before PrologueMBB using that order.
+ // Therefore the block that should appear first in the CFG should appear
+ // first in the list.
+ MachineBasicBlock *AddedBlocks[] = {PrevStackMBB, McrMBB, GetMBB, AllocMBB,
+ PostStackMBB};
+
+ for (MachineBasicBlock *B : AddedBlocks)
+ BeforePrologueRegion.insert(B);
+
+ for (const auto &LI : PrologueMBB.liveins()) {
+ for (MachineBasicBlock *PredBB : BeforePrologueRegion)
+ PredBB->addLiveIn(LI);
+ }
+
+ // Remove the newly added blocks from the list, since we know
+ // we do not have to do the following updates for them.
+ for (MachineBasicBlock *B : AddedBlocks) {
+ BeforePrologueRegion.erase(B);
+ MF.insert(PrologueMBB.getIterator(), B);
}
- MF.push_front(PostStackMBB);
- MF.push_front(AllocMBB);
- MF.push_front(GetMBB);
- MF.push_front(McrMBB);
- MF.push_front(PrevStackMBB);
+ for (MachineBasicBlock *MBB : BeforePrologueRegion) {
+ // Make sure the LiveIns are still sorted and unique.
+ MBB->sortUniqueLiveIns();
+ // Replace the edges to PrologueMBB by edges to the sequences
+ // we are about to add.
+ MBB->ReplaceUsesOfBlockWith(&PrologueMBB, AddedBlocks[0]);
+ }
// The required stack size that is aligned to ARM constant criterion.
AlignedStackSize = alignToARMConstant(StackSize);
@@ -1991,7 +1975,7 @@ void ARMFrameLowering::adjustForSegmentedStacks(
ARMConstantPoolValue *NewCPV = ARMConstantPoolSymbol::Create(
MF.getFunction()->getContext(), "__STACK_LIMIT", PCLabelId, 0);
MachineConstantPool *MCP = MF.getConstantPool();
- unsigned CPI = MCP->getConstantPoolIndex(NewCPV, MF.getAlignment());
+ unsigned CPI = MCP->getConstantPoolIndex(NewCPV, 4);
// ldr SR0, [pc, offset(STACK_LIMIT)]
AddDefaultPred(BuildMI(GetMBB, DL, TII.get(ARM::tLDRpci), ScratchReg0)
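
The largest conceptual change in the ARMFrameLowering.cpp section is shrink-wrapping support: the prologue block is no longer necessarily the function entry, so adjustForSegmentedStacks() now walks all predecessors of PrologueMBB to find every block whose live-ins and CFG edges must be updated. A sketch of that worklist walk as a hypothetical standalone helper, using the same containers and calls as the hunk:

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
using namespace llvm;

// Collect every block from which PrologueMBB is reachable so callers can
// patch live-in lists and branch targets for the whole region at once.
static void collectBeforePrologueRegion(
    MachineBasicBlock &PrologueMBB,
    SmallPtrSetImpl<MachineBasicBlock *> &Region) {
  SmallVector<MachineBasicBlock *, 2> WalkList;
  WalkList.push_back(&PrologueMBB);
  do {
    MachineBasicBlock *CurMBB = WalkList.pop_back_val();
    for (MachineBasicBlock *PredBB : CurMBB->predecessors())
      if (Region.insert(PredBB).second) // first time this block is seen
        WalkList.push_back(PredBB);
  } while (!WalkList.empty());
}
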
diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h
index 6fdc5eff5e47..66f4dfb6ef52 100644
--- a/lib/Target/ARM/ARMFrameLowering.h
+++ b/lib/Target/ARM/ARMFrameLowering.h
@@ -31,8 +31,6 @@ public:
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
- void fixTCReturn(MachineFunction &MF, MachineBasicBlock &MBB) const;
-
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI,
@@ -52,7 +50,6 @@ public:
unsigned &FrameReg) const override;
int ResolveFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg, int SPAdj) const;
- int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS) const override;
@@ -60,6 +57,11 @@ public:
void adjustForSegmentedStacks(MachineFunction &MF,
MachineBasicBlock &MBB) const override;
+ /// Returns true if the target will correctly handle shrink wrapping.
+ bool enableShrinkWrapping(const MachineFunction &MF) const override {
+ return true;
+ }
+
private:
void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI, unsigned StmOpc,
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index b110628a0a86..024244092a34 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -160,11 +160,6 @@ public:
// Thumb Addressing Modes:
bool SelectThumbAddrModeRR(SDValue N, SDValue &Base, SDValue &Offset);
- bool SelectThumbAddrModeRI(SDValue N, SDValue &Base, SDValue &Offset,
- unsigned Scale);
- bool SelectThumbAddrModeRI5S1(SDValue N, SDValue &Base, SDValue &Offset);
- bool SelectThumbAddrModeRI5S2(SDValue N, SDValue &Base, SDValue &Offset);
- bool SelectThumbAddrModeRI5S4(SDValue N, SDValue &Base, SDValue &Offset);
bool SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, SDValue &Base,
SDValue &OffImm);
bool SelectThumbAddrModeImm5S1(SDValue N, SDValue &Base,
@@ -176,8 +171,6 @@ public:
bool SelectThumbAddrModeSP(SDValue N, SDValue &Base, SDValue &OffImm);
// Thumb 2 Addressing Modes:
- bool SelectT2ShifterOperandReg(SDValue N,
- SDValue &BaseReg, SDValue &Opc);
bool SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm);
bool SelectT2AddrModeImm8(SDValue N, SDValue &Base,
SDValue &OffImm);
@@ -278,6 +271,22 @@ private:
// Get the alignment operand for a NEON VLD or VST instruction.
SDValue GetVLDSTAlign(SDValue Align, SDLoc dl, unsigned NumVecs,
bool is64BitVector);
+
+ /// Returns the number of instructions required to materialize the given
+ /// constant in a register, or 3 if a literal pool load is needed.
+ unsigned ConstantMaterializationCost(unsigned Val) const;
+
+ /// Checks if N is a multiplication by a constant where we can extract out a
+ /// power of two from the constant so that it can be used in a shift, but only
+ /// if it simplifies the materialization of the constant. Returns true if it
+ /// is, and assigns to PowerOfTwo the power of two that should be extracted
+ /// out and to NewMulConst the new constant to be multiplied by.
+ bool canExtractShiftFromMul(const SDValue &N, unsigned MaxShift,
+ unsigned &PowerOfTwo, SDValue &NewMulConst) const;
+
+ /// Replace N with M in CurDAG, in a way that also ensures that M gets
+ /// selected when N would have been selected.
+ void replaceDAGValue(const SDValue &N, SDValue M);
};
}
@@ -334,7 +343,7 @@ void ARMDAGToDAGISel::PreprocessISelDAG() {
bool isThumb2 = Subtarget->isThumb();
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
E = CurDAG->allnodes_end(); I != E; ) {
- SDNode *N = I++; // Preincrement iterator to avoid invalidation issues.
+ SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
if (N->getOpcode() != ISD::ADD)
continue;
@@ -388,7 +397,7 @@ void ARMDAGToDAGISel::PreprocessISelDAG() {
SDValue CPTmp1;
SDValue CPTmp2;
if (isThumb2) {
- if (SelectT2ShifterOperandReg(N0, CPTmp0, CPTmp1))
+ if (SelectImmShifterOperand(N0, CPTmp0, CPTmp1))
continue;
} else {
if (SelectImmShifterOperand(N0, CPTmp0, CPTmp1) ||
@@ -471,6 +480,61 @@ bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift,
(ShAmt == 2 || (Subtarget->isSwift() && ShAmt == 1));
}
+unsigned ARMDAGToDAGISel::ConstantMaterializationCost(unsigned Val) const {
+ if (Subtarget->isThumb()) {
+ if (Val <= 255) return 1; // MOV
+ if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW
+ if (~Val <= 255) return 2; // MOV + MVN
+ if (ARM_AM::isThumbImmShiftedVal(Val)) return 2; // MOV + LSL
+ } else {
+ if (ARM_AM::getSOImmVal(Val) != -1) return 1; // MOV
+ if (ARM_AM::getSOImmVal(~Val) != -1) return 1; // MVN
+ if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW
+ if (ARM_AM::isSOImmTwoPartVal(Val)) return 2; // two instrs
+ }
+ if (Subtarget->useMovt(*MF)) return 2; // MOVW + MOVT
+ return 3; // Literal pool load
+}
+
+bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N,
+ unsigned MaxShift,
+ unsigned &PowerOfTwo,
+ SDValue &NewMulConst) const {
+ assert(N.getOpcode() == ISD::MUL);
+ assert(MaxShift > 0);
+
+ // If the multiply is used in more than one place then changing the constant
+ // will make other uses incorrect, so don't.
+ if (!N.hasOneUse()) return false;
+ // Check if the multiply is by a constant
+ ConstantSDNode *MulConst = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ if (!MulConst) return false;
+ // If the constant is used in more than one place then modifying it will mean
+ // we need to materialize two constants instead of one, which is a bad idea.
+ if (!MulConst->hasOneUse()) return false;
+ unsigned MulConstVal = MulConst->getZExtValue();
+ if (MulConstVal == 0) return false;
+
+ // Find the largest power of 2 that MulConstVal is a multiple of
+ PowerOfTwo = MaxShift;
+ while ((MulConstVal % (1 << PowerOfTwo)) != 0) {
+ --PowerOfTwo;
+ if (PowerOfTwo == 0) return false;
+ }
+
+ // Only optimise if the new cost is better
+ unsigned NewMulConstVal = MulConstVal / (1 << PowerOfTwo);
+ NewMulConst = CurDAG->getConstant(NewMulConstVal, SDLoc(N), MVT::i32);
+ unsigned OldCost = ConstantMaterializationCost(MulConstVal);
+ unsigned NewCost = ConstantMaterializationCost(NewMulConstVal);
+ return NewCost < OldCost;
+}
+
+void ARMDAGToDAGISel::replaceDAGValue(const SDValue &N, SDValue M) {
+ CurDAG->RepositionNode(N.getNode()->getIterator(), M.getNode());
+ CurDAG->ReplaceAllUsesWith(N, M);
+}
+
bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N,
SDValue &BaseReg,
SDValue &Opc,
@@ -478,6 +542,24 @@ bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N,
if (DisableShifterOp)
return false;
+ // If N is a multiply-by-constant and it's profitable to extract a shift and
+ // use it in a shifted operand do so.
+ if (N.getOpcode() == ISD::MUL) {
+ unsigned PowerOfTwo = 0;
+ SDValue NewMulConst;
+ if (canExtractShiftFromMul(N, 31, PowerOfTwo, NewMulConst)) {
+ BaseReg = SDValue(Select(CurDAG->getNode(ISD::MUL, SDLoc(N), MVT::i32,
+ N.getOperand(0), NewMulConst)
+ .getNode()),
+ 0);
+ replaceDAGValue(N.getOperand(1), NewMulConst);
+ Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ARM_AM::lsl,
+ PowerOfTwo),
+ SDLoc(N), MVT::i32);
+ return true;
+ }
+ }
+
ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());
// Don't match base register only case. That is matched to a separate
@@ -662,6 +744,18 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset,
}
}
+ // If Offset is a multiply-by-constant and it's profitable to extract a shift
+ // and use it in a shifted operand do so.
+ if (Offset.getOpcode() == ISD::MUL) {
+ unsigned PowerOfTwo = 0;
+ SDValue NewMulConst;
+ if (canExtractShiftFromMul(Offset, 31, PowerOfTwo, NewMulConst)) {
+ replaceDAGValue(Offset.getOperand(1), NewMulConst);
+ ShAmt = PowerOfTwo;
+ ShOpcVal = ARM_AM::lsl;
+ }
+ }
+
Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal),
SDLoc(N), MVT::i32);
return true;
@@ -1086,77 +1180,13 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDValue N,
}
bool
-ARMDAGToDAGISel::SelectThumbAddrModeRI(SDValue N, SDValue &Base,
- SDValue &Offset, unsigned Scale) {
- if (Scale == 4) {
- SDValue TmpBase, TmpOffImm;
- if (SelectThumbAddrModeSP(N, TmpBase, TmpOffImm))
- return false; // We want to select tLDRspi / tSTRspi instead.
-
- if (N.getOpcode() == ARMISD::Wrapper &&
- N.getOperand(0).getOpcode() == ISD::TargetConstantPool)
- return false; // We want to select tLDRpci instead.
- }
-
- if (!CurDAG->isBaseWithConstantOffset(N))
- return false;
-
- // Thumb does not have [sp, r] address mode.
- RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0));
- RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(N.getOperand(1));
- if ((LHSR && LHSR->getReg() == ARM::SP) ||
- (RHSR && RHSR->getReg() == ARM::SP))
- return false;
-
- // FIXME: Why do we explicitly check for a match here and then return false?
- // Presumably to allow something else to match, but shouldn't this be
- // documented?
- int RHSC;
- if (isScaledConstantInRange(N.getOperand(1), Scale, 0, 32, RHSC))
- return false;
-
- Base = N.getOperand(0);
- Offset = N.getOperand(1);
- return true;
-}
-
-bool
-ARMDAGToDAGISel::SelectThumbAddrModeRI5S1(SDValue N,
- SDValue &Base,
- SDValue &Offset) {
- return SelectThumbAddrModeRI(N, Base, Offset, 1);
-}
-
-bool
-ARMDAGToDAGISel::SelectThumbAddrModeRI5S2(SDValue N,
- SDValue &Base,
- SDValue &Offset) {
- return SelectThumbAddrModeRI(N, Base, Offset, 2);
-}
-
-bool
-ARMDAGToDAGISel::SelectThumbAddrModeRI5S4(SDValue N,
- SDValue &Base,
- SDValue &Offset) {
- return SelectThumbAddrModeRI(N, Base, Offset, 4);
-}
-
-bool
ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale,
SDValue &Base, SDValue &OffImm) {
- if (Scale == 4) {
- SDValue TmpBase, TmpOffImm;
- if (SelectThumbAddrModeSP(N, TmpBase, TmpOffImm))
- return false; // We want to select tLDRspi / tSTRspi instead.
-
- if (N.getOpcode() == ARMISD::Wrapper &&
- N.getOperand(0).getOpcode() == ISD::TargetConstantPool)
- return false; // We want to select tLDRpci instead.
- }
-
if (!CurDAG->isBaseWithConstantOffset(N)) {
- if (N.getOpcode() == ARMISD::Wrapper &&
- N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) {
+ if (N.getOpcode() == ISD::ADD) {
+ return false; // We want to select register offset instead
+ } else if (N.getOpcode() == ARMISD::Wrapper &&
+ N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) {
Base = N.getOperand(0);
} else {
Base = N;
@@ -1166,23 +1196,6 @@ ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale,
return true;
}
- RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0));
- RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(N.getOperand(1));
- if ((LHSR && LHSR->getReg() == ARM::SP) ||
- (RHSR && RHSR->getReg() == ARM::SP)) {
- ConstantSDNode *LHS = dyn_cast<ConstantSDNode>(N.getOperand(0));
- ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1));
- unsigned LHSC = LHS ? LHS->getZExtValue() : 0;
- unsigned RHSC = RHS ? RHS->getZExtValue() : 0;
-
- // Thumb does not have [sp, #imm5] address mode for non-zero imm5.
- if (LHSC != 0 || RHSC != 0) return false;
-
- Base = N;
- OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
- return true;
- }
-
// If the RHS is + imm5 * scale, fold into addr mode.
int RHSC;
if (isScaledConstantInRange(N.getOperand(1), Scale, 0, 32, RHSC)) {
@@ -1191,9 +1204,8 @@ ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale,
return true;
}
- Base = N.getOperand(0);
- OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
- return true;
+ // Offset is too large, so use register offset instead.
+ return false;
}
bool
@@ -1263,28 +1275,6 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
//===----------------------------------------------------------------------===//
-bool ARMDAGToDAGISel::SelectT2ShifterOperandReg(SDValue N, SDValue &BaseReg,
- SDValue &Opc) {
- if (DisableShifterOp)
- return false;
-
- ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());
-
- // Don't match base register only case. That is matched to a separate
- // lower complexity pattern with explicit register operand.
- if (ShOpcVal == ARM_AM::no_shift) return false;
-
- BaseReg = N.getOperand(0);
- unsigned ShImmVal = 0;
- if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
- ShImmVal = RHS->getZExtValue() & 31;
- Opc = getI32Imm(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal), SDLoc(N));
- return true;
- }
-
- return false;
-}
-
bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N,
SDValue &Base, SDValue &OffImm) {
// Match simple R + imm12 operands.
@@ -1425,6 +1415,17 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N,
}
}
+ // If OffReg is a multiply-by-constant and it's profitable to extract a shift
+ // and use it in a shifted operand do so.
+ if (OffReg.getOpcode() == ISD::MUL) {
+ unsigned PowerOfTwo = 0;
+ SDValue NewMulConst;
+ if (canExtractShiftFromMul(OffReg, 3, PowerOfTwo, NewMulConst)) {
+ replaceDAGValue(OffReg.getOperand(1), NewMulConst);
+ ShAmt = PowerOfTwo;
+ }
+ }
+
ShImm = CurDAG->getTargetConstant(ShAmt, SDLoc(N), MVT::i32);
return true;
@@ -2503,25 +2504,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
}
case ISD::Constant: {
unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
- bool UseCP = true;
- if (Subtarget->useMovt(*MF))
- // Thumb2-aware targets have the MOVT instruction, so all immediates can
- // be done with MOV + MOVT, at worst.
- UseCP = false;
- else {
- if (Subtarget->isThumb()) {
- UseCP = (Val > 255 && // MOV
- ~Val > 255 && // MOV + MVN
- !ARM_AM::isThumbImmShiftedVal(Val) && // MOV + LSL
- !(Subtarget->hasV6T2Ops() && Val <= 0xffff)); // MOVW
- } else
- UseCP = (ARM_AM::getSOImmVal(Val) == -1 && // MOV
- ARM_AM::getSOImmVal(~Val) == -1 && // MVN
- !ARM_AM::isSOImmTwoPartVal(Val) && // two instrs.
- !(Subtarget->hasV6T2Ops() && Val <= 0xffff)); // MOVW
- }
-
- if (UseCP) {
+ // If we can't materialize the constant we need to use a literal pool
+ if (ConstantMaterializationCost(Val) > 2) {
SDValue CPIdx = CurDAG->getTargetConstantPool(
ConstantInt::get(Type::getInt32Ty(*CurDAG->getContext()), Val),
TLI->getPointerTy(CurDAG->getDataLayout()));
@@ -3376,7 +3360,7 @@ static void getIntOperandsFromRegisterString(StringRef RegString,
SelectionDAG *CurDAG, SDLoc DL,
std::vector<SDValue>& Ops) {
SmallVector<StringRef, 5> Fields;
- RegString.split(Fields, ":");
+ RegString.split(Fields, ':');
if (Fields.size() > 1) {
bool AllIntFields = true;
@@ -3461,9 +3445,9 @@ static inline int getMClassRegisterSYSmValueMask(StringRef RegString) {
// The flags here are common to those allowed for apsr in the A class cores and
// those allowed for the special registers in the M class cores. Returns a
// value representing which flags were present, -1 if invalid.
-static inline int getMClassFlagsMask(StringRef Flags) {
+static inline int getMClassFlagsMask(StringRef Flags, bool hasDSP) {
if (Flags.empty())
- return 0x3;
+ return 0x2 | (int)hasDSP;
return StringSwitch<int>(Flags)
.Case("g", 0x1)
@@ -3492,7 +3476,7 @@ static int getMClassRegisterMask(StringRef Reg, StringRef Flags, bool IsRead,
}
// We know we are now handling a write so need to get the mask for the flags.
- int Mask = getMClassFlagsMask(Flags);
+ int Mask = getMClassFlagsMask(Flags, Subtarget->hasDSP());
// Only apsr, iapsr, eapsr, xpsr can have flags. The other register values
// shouldn't have flags present.
@@ -3501,7 +3485,7 @@ static int getMClassRegisterMask(StringRef Reg, StringRef Flags, bool IsRead,
// The _g and _nzcvqg versions are only valid if the DSP extension is
// available.
- if (!Subtarget->hasThumb2DSP() && (Mask & 0x2))
+ if (!Subtarget->hasDSP() && (Mask & 0x1))
return -1;
// The register was valid so need to put the mask in the correct place
@@ -3523,7 +3507,7 @@ static int getARClassRegisterMask(StringRef Reg, StringRef Flags) {
// The flags permitted for apsr are the same flags that are allowed in
// M class registers. We get the flag value and then shift the flags into
// the correct place to combine with the mask.
- Mask = getMClassFlagsMask(Flags);
+ Mask = getMClassFlagsMask(Flags, true);
if (Mask == -1)
return -1;
return Mask << 2;
@@ -3742,7 +3726,7 @@ SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){
}
SmallVector<StringRef, 5> Fields;
- StringRef(SpecialReg).split(Fields, "_", 1, false);
+ StringRef(SpecialReg).split(Fields, '_', 1, false);
std::string Reg = Fields[0].str();
StringRef Flags = Fields.size() == 2 ? Fields[1] : "";
@@ -3943,6 +3927,7 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
// be an immediate and not a memory constraint.
// Fallthrough.
case InlineAsm::Constraint_m:
+ case InlineAsm::Constraint_o:
case InlineAsm::Constraint_Q:
case InlineAsm::Constraint_Um:
case InlineAsm::Constraint_Un:
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 8cc06df71633..9cfb06b00c4b 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -142,6 +142,11 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
+
+ if (!VT.isFloatingPoint() &&
+ VT != MVT::v2i64 && VT != MVT::v1i64)
+ for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
+ setOperationAction(Opcode, VT, Legal);
}
void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
@@ -166,77 +171,78 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
// Uses VFP for Thumb libfuncs if available.
if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
- // Single-precision floating-point arithmetic.
- setLibcallName(RTLIB::ADD_F32, "__addsf3vfp");
- setLibcallName(RTLIB::SUB_F32, "__subsf3vfp");
- setLibcallName(RTLIB::MUL_F32, "__mulsf3vfp");
- setLibcallName(RTLIB::DIV_F32, "__divsf3vfp");
-
- // Double-precision floating-point arithmetic.
- setLibcallName(RTLIB::ADD_F64, "__adddf3vfp");
- setLibcallName(RTLIB::SUB_F64, "__subdf3vfp");
- setLibcallName(RTLIB::MUL_F64, "__muldf3vfp");
- setLibcallName(RTLIB::DIV_F64, "__divdf3vfp");
-
- // Single-precision comparisons.
- setLibcallName(RTLIB::OEQ_F32, "__eqsf2vfp");
- setLibcallName(RTLIB::UNE_F32, "__nesf2vfp");
- setLibcallName(RTLIB::OLT_F32, "__ltsf2vfp");
- setLibcallName(RTLIB::OLE_F32, "__lesf2vfp");
- setLibcallName(RTLIB::OGE_F32, "__gesf2vfp");
- setLibcallName(RTLIB::OGT_F32, "__gtsf2vfp");
- setLibcallName(RTLIB::UO_F32, "__unordsf2vfp");
- setLibcallName(RTLIB::O_F32, "__unordsf2vfp");
-
- setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE);
- setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETNE);
- setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE);
- setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE);
- setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE);
- setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE);
- setCmpLibcallCC(RTLIB::UO_F32, ISD::SETNE);
- setCmpLibcallCC(RTLIB::O_F32, ISD::SETEQ);
-
- // Double-precision comparisons.
- setLibcallName(RTLIB::OEQ_F64, "__eqdf2vfp");
- setLibcallName(RTLIB::UNE_F64, "__nedf2vfp");
- setLibcallName(RTLIB::OLT_F64, "__ltdf2vfp");
- setLibcallName(RTLIB::OLE_F64, "__ledf2vfp");
- setLibcallName(RTLIB::OGE_F64, "__gedf2vfp");
- setLibcallName(RTLIB::OGT_F64, "__gtdf2vfp");
- setLibcallName(RTLIB::UO_F64, "__unorddf2vfp");
- setLibcallName(RTLIB::O_F64, "__unorddf2vfp");
-
- setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE);
- setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETNE);
- setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE);
- setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE);
- setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE);
- setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE);
- setCmpLibcallCC(RTLIB::UO_F64, ISD::SETNE);
- setCmpLibcallCC(RTLIB::O_F64, ISD::SETEQ);
-
- // Floating-point to integer conversions.
- // i64 conversions are done via library routines even when generating VFP
- // instructions, so use the same ones.
- setLibcallName(RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp");
- setLibcallName(RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp");
- setLibcallName(RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp");
- setLibcallName(RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp");
-
- // Conversions between floating types.
- setLibcallName(RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp");
- setLibcallName(RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp");
+ static const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ const ISD::CondCode Cond;
+ } LibraryCalls[] = {
+ // Single-precision floating-point arithmetic.
+ { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
+ { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
+ { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
+ { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },
+
+ // Double-precision floating-point arithmetic.
+ { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
+ { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
+ { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
+ { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },
+
+ // Single-precision comparisons.
+ { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE },
+ { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE },
+ { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE },
+ { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE },
+ { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE },
+ { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE },
+ { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE },
+ { RTLIB::O_F32, "__unordsf2vfp", ISD::SETEQ },
+
+ // Double-precision comparisons.
+ { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE },
+ { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE },
+ { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE },
+ { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE },
+ { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE },
+ { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE },
+ { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE },
+ { RTLIB::O_F64, "__unorddf2vfp", ISD::SETEQ },
+
+ // Floating-point to integer conversions.
+ // i64 conversions are done via library routines even when generating VFP
+ // instructions, so use the same ones.
+ { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID },
+ { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
+ { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID },
+ { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },
+
+ // Conversions between floating types.
+ { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID },
+ { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID },
+
+ // Integer to floating-point conversions.
+ // i64 conversions are done via library routines even when generating VFP
+ // instructions, so use the same ones.
+ // FIXME: There appears to be some naming inconsistency in ARM libgcc:
+ // e.g., __floatunsidf vs. __floatunssidfvfp.
+ { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID },
+ { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
+ { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID },
+ { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
+ };
+
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ if (LC.Cond != ISD::SETCC_INVALID)
+ setCmpLibcallCC(LC.Op, LC.Cond);
+ }
+ }
- // Integer to floating-point conversions.
- // i64 conversions are done via library routines even when generating VFP
- // instructions, so use the same ones.
- // FIXME: There appears to be some naming inconsistency in ARM libgcc:
- // e.g., __floatunsidf vs. __floatunssidfvfp.
- setLibcallName(RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp");
- setLibcallName(RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp");
- setLibcallName(RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp");
- setLibcallName(RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp");
+ // Set the correct calling convention for ARMv7k WatchOS: AAPCS_VFP is
+ // used even for functions as simple as libcalls.
+ if (Subtarget->isTargetWatchOS()) {
+ for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i)
+ setLibcallCallingConv((RTLIB::Libcall)i, CallingConv::ARM_AAPCS_VFP);
}
}
@@ -245,8 +251,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setLibcallName(RTLIB::SRL_I128, nullptr);
setLibcallName(RTLIB::SRA_I128, nullptr);
- if (Subtarget->isAAPCS_ABI() && !Subtarget->isTargetMachO() &&
- !Subtarget->isTargetWindows()) {
+ // RTLIB
+ if (Subtarget->isAAPCS_ABI() &&
+ (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
+ Subtarget->isTargetAndroid())) {
static const struct {
const RTLIB::Libcall Op;
const char * const Name;
@@ -334,12 +342,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
{ RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-
- // Memory operations
- // RTABI chapter 4.3.4
- { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
- { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
- { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
};
for (const auto &LC : LibraryCalls) {
@@ -348,6 +350,30 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (LC.Cond != ISD::SETCC_INVALID)
setCmpLibcallCC(LC.Op, LC.Cond);
}
+
+ // EABI dependent RTLIB
+ if (TM.Options.EABIVersion == EABI::EABI4 ||
+ TM.Options.EABIVersion == EABI::EABI5) {
+ static const struct {
+ const RTLIB::Libcall Op;
+ const char *const Name;
+ const CallingConv::ID CC;
+ const ISD::CondCode Cond;
+ } MemOpsLibraryCalls[] = {
+ // Memory operations
+ // RTABI chapter 4.3.4
+ { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+ };
+
+ for (const auto &LC : MemOpsLibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ setLibcallCallingConv(LC.Op, LC.CC);
+ if (LC.Cond != ISD::SETCC_INVALID)
+ setCmpLibcallCC(LC.Op, LC.Cond);
+ }
+ }
}
if (Subtarget->isTargetWindows()) {
@@ -364,6 +390,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
{ RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
{ RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
{ RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
+ { RTLIB::SDIV_I32, "__rt_sdiv", CallingConv::ARM_AAPCS_VFP },
+ { RTLIB::UDIV_I32, "__rt_udiv", CallingConv::ARM_AAPCS_VFP },
+ { RTLIB::SDIV_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS_VFP },
+ { RTLIB::UDIV_I64, "__rt_udiv64", CallingConv::ARM_AAPCS_VFP },
};
for (const auto &LC : LibraryCalls) {
@@ -373,8 +403,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
}
// Use divmod compiler-rt calls for iOS 5.0 and later.
- if (Subtarget->getTargetTriple().isiOS() &&
- !Subtarget->getTargetTriple().isOSVersionLT(5, 0)) {
+ if (Subtarget->isTargetWatchOS() ||
+ (Subtarget->isTargetIOS() &&
+ !Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
}
@@ -392,6 +423,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
}
+ // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
+ // a __gnu_ prefix (which is the default).
+ if (Subtarget->isTargetAEABI()) {
+ setLibcallName(RTLIB::FPROUND_F32_F16, "__aeabi_f2h");
+ setLibcallName(RTLIB::FPROUND_F64_F16, "__aeabi_d2h");
+ setLibcallName(RTLIB::FPEXT_F16_F32, "__aeabi_h2f");
+ }
+
if (Subtarget->isThumb1Only())
addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
else
@@ -579,7 +618,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);
- setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::BUILD_VECTOR);
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
@@ -605,7 +643,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::ADDC);
if (Subtarget->isFPOnlySP()) {
- // When targetting a floating-point unit with only single-precision
+ // When targeting a floating-point unit with only single-precision
// operations, f64 is legal for the few double-precision instructions which
// are present. However, no double-precision operations other than moves,
// loads and stores are provided by the hardware.
@@ -689,7 +727,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
}
if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
- || (Subtarget->isThumb2() && !Subtarget->hasThumb2DSP()))
+ || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
setOperationAction(ISD::MULHS, MVT::i32, Expand);
setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
@@ -706,8 +744,15 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SUBE, MVT::i32, Custom);
}
+ if (!Subtarget->isThumb1Only())
+ setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+
// ARM does not have ROTL.
- setOperationAction(ISD::ROTL, MVT::i32, Expand);
+ setOperationAction(ISD::ROTL, MVT::i32, Expand);
+ for (MVT VT : MVT::vector_valuetypes()) {
+ setOperationAction(ISD::ROTL, VT, Expand);
+ setOperationAction(ISD::ROTR, VT, Expand);
+ }
setOperationAction(ISD::CTTZ, MVT::i32, Custom);
setOperationAction(ISD::CTPOP, MVT::i32, Expand);
if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only())
@@ -717,7 +762,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i32 , Expand);
setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i32 , Expand);
- setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
+ // @llvm.readcyclecounter requires the Performance Monitors extension.
+ // Default to the 0 expansion on unsupported platforms.
+ // FIXME: Technically there are older ARM CPUs that have
+ // implementation-specific ways of obtaining this information.
+ if (Subtarget->hasPerfMon())
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
// Only ARMv6 has BSWAP.
if (!Subtarget->hasV6Ops())
@@ -726,15 +776,17 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (!(Subtarget->hasDivide() && Subtarget->isThumb2()) &&
!(Subtarget->hasDivideInARMMode() && !Subtarget->isThumb())) {
// These are expanded into libcalls if the cpu doesn't have HW divider.
- setOperationAction(ISD::SDIV, MVT::i32, Expand);
- setOperationAction(ISD::UDIV, MVT::i32, Expand);
+ setOperationAction(ISD::SDIV, MVT::i32, LibCall);
+ setOperationAction(ISD::UDIV, MVT::i32, LibCall);
}
- // FIXME: Also set divmod for SREM on EABI
setOperationAction(ISD::SREM, MVT::i32, Expand);
setOperationAction(ISD::UREM, MVT::i32, Expand);
// Register based DivRem for AEABI (RTABI 4.2)
- if (Subtarget->isTargetAEABI()) {
+ if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid()) {
+ setOperationAction(ISD::SREM, MVT::i64, Custom);
+ setOperationAction(ISD::UREM, MVT::i64, Custom);
+
setLibcallName(RTLIB::SDIVREM_I8, "__aeabi_idivmod");
setLibcallName(RTLIB::SDIVREM_I16, "__aeabi_idivmod");
setLibcallName(RTLIB::SDIVREM_I32, "__aeabi_idivmod");
@@ -762,7 +814,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
- setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom);
setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
@@ -776,13 +827,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- if (!Subtarget->isTargetMachO()) {
- // Non-MachO platforms may return values in these registers via the
- // personality function.
- setExceptionPointerRegister(ARM::R0);
- setExceptionSelectorRegister(ARM::R1);
- }
-
if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
else
@@ -849,11 +893,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
- if (Subtarget->isTargetDarwin()) {
- setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
- setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+ setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+ setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+ setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
+ if (Subtarget->useSjLjEH())
setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
- }
setOperationAction(ISD::SETCC, MVT::i32, Expand);
setOperationAction(ISD::SETCC, MVT::f32, Expand);
@@ -912,7 +956,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (Subtarget->hasSinCos()) {
setLibcallName(RTLIB::SINCOS_F32, "sincosf");
setLibcallName(RTLIB::SINCOS_F64, "sincos");
- if (Subtarget->getTargetTriple().isiOS()) {
+ if (Subtarget->isTargetWatchOS()) {
+ setLibcallCallingConv(RTLIB::SINCOS_F32, CallingConv::ARM_AAPCS_VFP);
+ setLibcallCallingConv(RTLIB::SINCOS_F64, CallingConv::ARM_AAPCS_VFP);
+ }
+ if (Subtarget->isTargetIOS() || Subtarget->isTargetWatchOS()) {
// For iOS, we don't want the normal expansion of a libcall to
// sincos. We want to issue a libcall to __sincos_stret.
setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
@@ -928,6 +976,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
setOperationAction(ISD::FRINT, MVT::f32, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
+
if (!Subtarget->isFPOnlySP()) {
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
@@ -935,8 +990,22 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
setOperationAction(ISD::FRINT, MVT::f64, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
}
}
+
+ if (Subtarget->hasNEON()) {
+ // vmin and vmax aren't available in a scalar form, so we use
+ // a NEON instruction with an undef lane instead.
+ setOperationAction(ISD::FMINNAN, MVT::f32, Legal);
+ setOperationAction(ISD::FMAXNAN, MVT::f32, Legal);
+ setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal);
+ setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal);
+ setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal);
+ }
+
// We have target-specific dag combine patterns for the following nodes:
// ARMISD::VMOVRRD - No need to call setTargetDAGCombine
setTargetDAGCombine(ISD::ADD);
@@ -959,11 +1028,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
//// temporary - rewrite interface to use type
MaxStoresPerMemset = 8;
- MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
+ MaxStoresPerMemsetOptSize = 4;
MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
- MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 4 : 2;
+ MaxStoresPerMemcpyOptSize = 2;
MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
- MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 4 : 2;
+ MaxStoresPerMemmoveOptSize = 2;
// On ARM arguments smaller than 4 bytes are extended, so all arguments
// are at least 4 bytes aligned.
@@ -1054,8 +1123,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::CMOV: return "ARMISD::CMOV";
- case ARMISD::RBIT: return "ARMISD::RBIT";
-
case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG";
case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG";
case ARMISD::RRX: return "ARMISD::RRX";
@@ -1069,7 +1136,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR";
case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
- case ARMISD::EH_SJLJ_LONGJMP:return "ARMISD::EH_SJLJ_LONGJMP";
+ case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
+ case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH";
case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN";
@@ -1082,6 +1150,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::PRELOAD: return "ARMISD::PRELOAD";
case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK";
+ case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK";
case ARMISD::VCEQ: return "ARMISD::VCEQ";
case ARMISD::VCEQZ: return "ARMISD::VCEQZ";
@@ -1133,14 +1202,11 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::UMLAL: return "ARMISD::UMLAL";
case ARMISD::SMLAL: return "ARMISD::SMLAL";
case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
- case ARMISD::FMAX: return "ARMISD::FMAX";
- case ARMISD::FMIN: return "ARMISD::FMIN";
- case ARMISD::VMAXNM: return "ARMISD::VMAX";
- case ARMISD::VMINNM: return "ARMISD::VMIN";
case ARMISD::BFI: return "ARMISD::BFI";
case ARMISD::VORRIMM: return "ARMISD::VORRIMM";
case ARMISD::VBICIMM: return "ARMISD::VBICIMM";
case ARMISD::VBSL: return "ARMISD::VBSL";
+ case ARMISD::MEMCPY: return "ARMISD::MEMCPY";
case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP";
case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP";
case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP";
@@ -1449,9 +1515,10 @@ ARMTargetLowering::LowerMemOpCallTo(SDValue Chain,
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
StackPtr, PtrOff);
- return DAG.getStore(Chain, dl, Arg, PtrOff,
- MachinePointerInfo::getStack(LocMemOffset),
- false, false, 0);
+ return DAG.getStore(
+ Chain, dl, Arg, PtrOff,
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
+ false, false, 0);
}
void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG,
@@ -1734,9 +1801,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Get the address of the callee into a register
SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
- Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(), false, false,
- false, 0);
+ Callee = DAG.getLoad(
+ PtrVt, dl, DAG.getEntryNode(), CPAddr,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
+ false, false, 0);
} else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
const char *Sym = S->getSymbol();
@@ -1748,9 +1816,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Get the address of the callee into a register
SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
- Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(), false, false,
- false, 0);
+ Callee = DAG.getLoad(
+ PtrVt, dl, DAG.getEntryNode(), CPAddr,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
+ false, false, 0);
}
} else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
const GlobalValue *GV = G->getGlobal();
@@ -1768,7 +1837,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
ARMISD::WrapperPIC, dl, PtrVt,
DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), Callee,
- MachinePointerInfo::getGOT(), false, false, true, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()),
+ false, false, true, 0);
} else if (Subtarget->isTargetCOFF()) {
assert(Subtarget->isTargetWindows() &&
"Windows is the only supported COFF target");
@@ -1781,7 +1851,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Callee =
DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
- MachinePointerInfo::getGOT(), false, false, false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()),
+ false, false, false, 0);
} else {
// On ELF targets for PIC code, direct calls should go through the PLT
unsigned OpFlags = 0;
@@ -1804,9 +1875,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
ARMPCLabelIndex, 4);
SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
- Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(), false, false,
- false, 0);
+ Callee = DAG.getLoad(
+ PtrVt, dl, DAG.getEntryNode(), CPAddr,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
+ false, false, 0);
SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
} else {
@@ -1821,7 +1893,6 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// FIXME: handle tail calls differently.
unsigned CallOpc;
- bool HasMinSizeAttr = MF.getFunction()->hasFnAttribute(Attribute::MinSize);
if (Subtarget->isThumb()) {
if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
CallOpc = ARMISD::CALL_NOLINK;
@@ -1831,8 +1902,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (!isDirect && !Subtarget->hasV5TOps())
CallOpc = ARMISD::CALL_NOLINK;
else if (doesNotRet && isDirect && Subtarget->hasRAS() &&
- // Emit regular call when code size is the priority
- !HasMinSizeAttr)
+ // Emit regular call when code size is the priority
+ !MF.getFunction()->optForMinSize())
// "mov lr, pc; b _foo" to avoid confusing the RSP
CallOpc = ARMISD::CALL_NOLINK;
else
@@ -2014,6 +2085,8 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
CallingConv::ID CallerCC = CallerF->getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
+ assert(Subtarget->supportsTailCall());
+
// Look for obvious safe cases to perform tail call optimization that do not
// require ABI changes. This is what gcc calls sibcall.
@@ -2033,26 +2106,6 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
if (isCalleeStructRet || isCallerStructRet)
return false;
- // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo::
- // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as
- // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation
- // support in the assembler and linker to be used. This would need to be
- // fixed to fully support tail calls in Thumb1.
- //
- // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take
- // LR. This means if we need to reload LR, it takes an extra instructions,
- // which outweighs the value of the tail call; but here we don't know yet
- // whether LR is going to be used. Probably the right approach is to
- // generate the tail call here and turn it back into CALL/RET in
- // emitEpilogue if LR is used.
-
- // Thumb1 PIC calls to external symbols use BX, so they can be tail calls,
- // but we need to make sure there are enough registers; the only valid
- // registers are the 4 used for parameters. We don't currently do this
- // case.
- if (Subtarget->isThumb1Only())
- return false;
-
// Externally-defined functions with weak linkage should not be
// tail-called on ARM when the OS does not support dynamic
// pre-emption of symbols, as the AAELF spec requires normal calls
@@ -2400,7 +2453,7 @@ bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
if (!CI->isTailCall() || Attr.getValueAsString() == "true")
return false;
- return !Subtarget->isThumb1Only();
+ return true;
}
// Trying to write a 64 bit value so need to split into two 32 bit values first,
@@ -2467,9 +2520,10 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
}
CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
- SDValue Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(),
- false, false, false, 0);
+ SDValue Result =
+ DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ false, false, false, 0);
if (RelocM == Reloc::Static)
return Result;
SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
@@ -2491,9 +2545,10 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
- Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument,
- MachinePointerInfo::getConstantPool(),
- false, false, false, 0);
+ Argument =
+ DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ false, false, false, 0);
SDValue Chain = Argument.getValue(1);
SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
@@ -2543,17 +2598,19 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
true);
Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
- Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
- MachinePointerInfo::getConstantPool(),
- false, false, false, 0);
+ Offset = DAG.getLoad(
+ PtrVT, dl, Chain, Offset,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
+ false, false, 0);
Chain = Offset.getValue(1);
SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
- Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
- MachinePointerInfo::getConstantPool(),
- false, false, false, 0);
+ Offset = DAG.getLoad(
+ PtrVT, dl, Chain, Offset,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
+ false, false, 0);
} else {
// local exec model
assert(model == TLSModel::LocalExec);
@@ -2561,9 +2618,10 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
- Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
- MachinePointerInfo::getConstantPool(),
- false, false, false, 0);
+ Offset = DAG.getLoad(
+ PtrVT, dl, Chain, Offset,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
+ false, false, 0);
}
// The address of the thread local variable is the add of the thread
@@ -2577,6 +2635,8 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
assert(Subtarget->isTargetELF() &&
"TLS not implemented for non-ELF targets");
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+ if (DAG.getTarget().Options.EmulatedTLS)
+ return LowerToTLSEmulatedModel(GA, DAG);
TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
@@ -2597,22 +2657,31 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
SDLoc dl(Op);
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
- bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
- ARMConstantPoolValue *CPV =
- ARMConstantPoolConstant::Create(GV,
- UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT);
+ bool UseGOT_PREL =
+ !(GV->hasHiddenVisibility() || GV->hasLocalLinkage());
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ SDLoc dl(Op);
+ unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
+ ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
+ GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj,
+ UseGOT_PREL ? ARMCP::GOT_PREL : ARMCP::no_modifier,
+ /*AddCurrentAddress=*/UseGOT_PREL);
SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
- SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
- CPAddr,
- MachinePointerInfo::getConstantPool(),
- false, false, false, 0);
+ SDValue Result = DAG.getLoad(
+ PtrVT, dl, DAG.getEntryNode(), CPAddr,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
+ false, false, 0);
SDValue Chain = Result.getValue(1);
- SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
- Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, GOT);
- if (!UseGOTOFF)
+ SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
+ Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
+ if (UseGOT_PREL)
Result = DAG.getLoad(PtrVT, dl, Chain, Result,
- MachinePointerInfo::getGOT(),
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()),
false, false, false, 0);
return Result;
}
@@ -2628,9 +2697,10 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
} else {
SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
- return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(),
- false, false, false, 0);
+ return DAG.getLoad(
+ PtrVT, dl, DAG.getEntryNode(), CPAddr,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
+ false, false, 0);
}
}
@@ -2654,7 +2724,8 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
- MachinePointerInfo::getGOT(), false, false, false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()),
+ false, false, false, 0);
return Result;
}
@@ -2680,32 +2751,11 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
TargetFlags));
if (GV->hasDLLImportStorageClass())
Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
- MachinePointerInfo::getGOT(), false, false, false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()),
+ false, false, false, 0);
return Result;
}
-SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
- SelectionDAG &DAG) const {
- assert(Subtarget->isTargetELF() &&
- "GLOBAL OFFSET TABLE not implemented for non-ELF targets");
- MachineFunction &MF = DAG.getMachineFunction();
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
- EVT PtrVT = getPointerTy(DAG.getDataLayout());
- SDLoc dl(Op);
- unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
- ARMConstantPoolValue *CPV =
- ARMConstantPoolSymbol::Create(*DAG.getContext(), "_GLOBAL_OFFSET_TABLE_",
- ARMPCLabelIndex, PCAdj);
- SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
- CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
- SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(),
- false, false, false, 0);
- SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
- return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
-}
-
SDValue
ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -2722,6 +2772,13 @@ ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
}
+SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
+ Op.getOperand(0));
+}
+
SDValue
ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *Subtarget) const {
@@ -2732,7 +2789,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
case Intrinsic::arm_rbit: {
assert(Op.getOperand(1).getValueType() == MVT::i32 &&
"RBIT intrinsic must have i32 type!");
- return DAG.getNode(ARMISD::RBIT, dl, MVT::i32, Op.getOperand(1));
+ return DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Op.getOperand(1));
}
case Intrinsic::arm_thread_pointer: {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
@@ -2752,10 +2809,10 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
ARMCP::CPLSDA, PCAdj);
CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
- SDValue Result =
- DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(),
- false, false, false, 0);
+ SDValue Result = DAG.getLoad(
+ PtrVT, dl, DAG.getEntryNode(), CPAddr,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
+ false, false, 0);
if (RelocM == Reloc::PIC_) {
SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
@@ -2770,6 +2827,36 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
}
+ case Intrinsic::arm_neon_vminnm:
+ case Intrinsic::arm_neon_vmaxnm: {
+ unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
+ ? ISD::FMINNUM : ISD::FMAXNUM;
+ return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ }
+ case Intrinsic::arm_neon_vminu:
+ case Intrinsic::arm_neon_vmaxu: {
+ if (Op.getValueType().isFloatingPoint())
+ return SDValue();
+ unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
+ ? ISD::UMIN : ISD::UMAX;
+ return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ }
+ case Intrinsic::arm_neon_vmins:
+ case Intrinsic::arm_neon_vmaxs: {
+ // v{min,max}s is overloaded between signed integers and floats.
+ if (!Op.getValueType().isFloatingPoint()) {
+ unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
+ ? ISD::SMIN : ISD::SMAX;
+ return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ }
+ unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
+ ? ISD::FMINNAN : ISD::FMAXNAN;
+ return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ }
}
}
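// Scalar reference for the two floating-point flavours mapped above (a sketch;
// the NEON instructions apply the same rule element-wise).  It illustrates why
// vminnm and the float form of vmins need different generic opcodes:
#include <cmath>
#include <limits>

// vminnm -> ISD::FMINNUM: IEEE-754 minNum, a quiet NaN loses to a number.
static float fminnum_ref(float A, float B) {
  if (std::isnan(A)) return B;
  if (std::isnan(B)) return A;
  return A < B ? A : B;
}

// vmins on floats -> ISD::FMINNAN: NEON VMIN, any NaN operand yields NaN.
static float fminnan_ref(float A, float B) {
  if (std::isnan(A) || std::isnan(B))
    return std::numeric_limits<float>::quiet_NaN();
  return A < B ? A : B;
}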
@@ -2870,9 +2957,10 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
// Create load node to retrieve arguments from the stack.
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
- ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN,
- MachinePointerInfo::getFixedStack(FI),
- false, false, false, 0);
+ ArgValue2 = DAG.getLoad(
+ MVT::i32, dl, Root, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false,
+ false, false, 0);
} else {
Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
@@ -3056,9 +3144,10 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
if (VA.isMemLoc()) {
int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
- ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
- MachinePointerInfo::getFixedStack(FI),
- false, false, false, 0);
+ ArgValue2 = DAG.getLoad(
+ MVT::f64, dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
+ false, false, false, 0);
} else {
ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
Chain, DAG, dl);
@@ -3139,9 +3228,9 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
"Byval arguments cannot be implicit");
unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
- int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, CurOrigArg,
- CurByValIndex, VA.getLocMemOffset(),
- Flags.getByValSize());
+ int FrameIndex = StoreByValRegs(
+ CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
+ VA.getLocMemOffset(), Flags.getByValSize());
InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
CCInfo.nextInRegsParam();
} else {
@@ -3151,9 +3240,10 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
// Create load nodes to retrieve arguments from the stack.
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
- InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
- MachinePointerInfo::getFixedStack(FI),
- false, false, false, 0));
+ InVals.push_back(DAG.getLoad(
+ VA.getValVT(), dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
+ false, false, false, 0));
}
lastInsIndex = index;
}
@@ -3188,13 +3278,9 @@ static bool isFloatingPointZero(SDValue Op) {
// Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
// created by LowerConstantFP().
SDValue BitcastOp = Op->getOperand(0);
- if (BitcastOp->getOpcode() == ARMISD::VMOVIMM) {
- SDValue MoveOp = BitcastOp->getOperand(0);
- if (MoveOp->getOpcode() == ISD::TargetConstant &&
- cast<ConstantSDNode>(MoveOp)->getZExtValue() == 0) {
- return true;
- }
- }
+ if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
+ isNullConstant(BitcastOp->getOperand(0)))
+ return true;
}
return false;
}
@@ -3559,113 +3645,6 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
// Try to generate VMAXNM/VMINNM on ARMv8.
if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
TrueVal.getValueType() == MVT::f64)) {
- // We can use VMAXNM/VMINNM for a compare followed by a select with the
- // same operands, as follows:
- // c = fcmp [?gt, ?ge, ?lt, ?le] a, b
- // select c, a, b
- // In NoNaNsFPMath the CC will have been changed from, e.g., 'ogt' to 'gt'.
- bool swapSides = false;
- if (!getTargetMachine().Options.NoNaNsFPMath) {
- // transformability may depend on which way around we compare
- switch (CC) {
- default:
- break;
- case ISD::SETOGT:
- case ISD::SETOGE:
- case ISD::SETOLT:
- case ISD::SETOLE:
- // the non-NaN should be RHS
- swapSides = DAG.isKnownNeverNaN(LHS) && !DAG.isKnownNeverNaN(RHS);
- break;
- case ISD::SETUGT:
- case ISD::SETUGE:
- case ISD::SETULT:
- case ISD::SETULE:
- // the non-NaN should be LHS
- swapSides = DAG.isKnownNeverNaN(RHS) && !DAG.isKnownNeverNaN(LHS);
- break;
- }
- }
- swapSides = swapSides || (LHS == FalseVal && RHS == TrueVal);
- if (swapSides) {
- CC = ISD::getSetCCSwappedOperands(CC);
- std::swap(LHS, RHS);
- }
- if (LHS == TrueVal && RHS == FalseVal) {
- bool canTransform = true;
- // FIXME: FastMathFlags::noSignedZeros() doesn't appear reachable from here
- if (!getTargetMachine().Options.UnsafeFPMath &&
- !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
- const ConstantFPSDNode *Zero;
- switch (CC) {
- default:
- break;
- case ISD::SETOGT:
- case ISD::SETUGT:
- case ISD::SETGT:
- // RHS must not be -0
- canTransform = (Zero = dyn_cast<ConstantFPSDNode>(RHS)) &&
- !Zero->isNegative();
- break;
- case ISD::SETOGE:
- case ISD::SETUGE:
- case ISD::SETGE:
- // LHS must not be -0
- canTransform = (Zero = dyn_cast<ConstantFPSDNode>(LHS)) &&
- !Zero->isNegative();
- break;
- case ISD::SETOLT:
- case ISD::SETULT:
- case ISD::SETLT:
- // RHS must not be +0
- canTransform = (Zero = dyn_cast<ConstantFPSDNode>(RHS)) &&
- Zero->isNegative();
- break;
- case ISD::SETOLE:
- case ISD::SETULE:
- case ISD::SETLE:
- // LHS must not be +0
- canTransform = (Zero = dyn_cast<ConstantFPSDNode>(LHS)) &&
- Zero->isNegative();
- break;
- }
- }
- if (canTransform) {
- // Note: If one of the elements in a pair is a number and the other
- // element is NaN, the corresponding result element is the number.
- // This is consistent with the IEEE 754-2008 standard.
- // Therefore, a > b ? a : b <=> vmax(a,b), if b is constant and a is NaN
- switch (CC) {
- default:
- break;
- case ISD::SETOGT:
- case ISD::SETOGE:
- if (!DAG.isKnownNeverNaN(RHS))
- break;
- return DAG.getNode(ARMISD::VMAXNM, dl, VT, LHS, RHS);
- case ISD::SETUGT:
- case ISD::SETUGE:
- if (!DAG.isKnownNeverNaN(LHS))
- break;
- case ISD::SETGT:
- case ISD::SETGE:
- return DAG.getNode(ARMISD::VMAXNM, dl, VT, LHS, RHS);
- case ISD::SETOLT:
- case ISD::SETOLE:
- if (!DAG.isKnownNeverNaN(RHS))
- break;
- return DAG.getNode(ARMISD::VMINNM, dl, VT, LHS, RHS);
- case ISD::SETULT:
- case ISD::SETULE:
- if (!DAG.isKnownNeverNaN(LHS))
- break;
- case ISD::SETLT:
- case ISD::SETLE:
- return DAG.getNode(ARMISD::VMINNM, dl, VT, LHS, RHS);
- }
- }
- }
-
bool swpCmpOps = false;
bool swpVselOps = false;
checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
@@ -3890,16 +3869,18 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
Addr, Op.getOperand(2), JTI);
}
if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
- Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
- MachinePointerInfo::getJumpTable(),
- false, false, false, 0);
+ Addr =
+ DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
+ MachinePointerInfo::getJumpTable(DAG.getMachineFunction()),
+ false, false, false, 0);
Chain = Addr.getValue(1);
Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table);
return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
} else {
- Addr = DAG.getLoad(PTy, dl, Chain, Addr,
- MachinePointerInfo::getJumpTable(),
- false, false, false, 0);
+ Addr =
+ DAG.getLoad(PTy, dl, Chain, Addr,
+ MachinePointerInfo::getJumpTable(DAG.getMachineFunction()),
+ false, false, false, 0);
Chain = Addr.getValue(1);
return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
}
@@ -3936,7 +3917,7 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
else
LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(),
Op.getValueType());
- return makeLibCall(DAG, LC, Op.getValueType(), &Op.getOperand(0), 1,
+ return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
/*isSigned*/ false, SDLoc(Op)).first;
}
@@ -3988,7 +3969,7 @@ SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
else
LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
Op.getValueType());
- return makeLibCall(DAG, LC, Op.getValueType(), &Op.getOperand(0), 1,
+ return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
/*isSigned*/ false, SDLoc(Op)).first;
}
@@ -4153,6 +4134,56 @@ static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
Results.push_back(Read.getOperand(0));
}
+/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
+/// When \p DstVT, the destination type of \p BC, is on the vector
+/// register bank and the source of the bitcast, \p Op, operates on the same bank,
+/// it might be possible to combine them, such that everything stays on the
+/// vector register bank.
+/// \return The node that would replace \p BC, if the combine
+/// is possible.
+static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
+ SelectionDAG &DAG) {
+ SDValue Op = BC->getOperand(0);
+ EVT DstVT = BC->getValueType(0);
+
+ // The only vector instruction that can produce a scalar (remember,
+ // since the bitcast was about to be turned into VMOVDRR, the source
+ // type is i64) from a vector is EXTRACT_VECTOR_ELT.
+ // Moreover, we can do this combine only if there is one use.
+ // Finally, if the destination type is not a vector, there is not
+ // much point in forcing everything on the vector bank.
+ if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !Op.hasOneUse())
+ return SDValue();
+
+ // If the index is not constant, we will introduce an additional
+ // multiply that will stick.
+ // Give up in that case.
+ ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (!Index)
+ return SDValue();
+ unsigned DstNumElt = DstVT.getVectorNumElements();
+
+ // Compute the new index.
+ const APInt &APIntIndex = Index->getAPIntValue();
+ APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
+ NewIndex *= APIntIndex;
+ // Check if the new constant index fits into i32.
+ if (NewIndex.getBitWidth() > 32)
+ return SDValue();
+
+ // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
+ // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
+ SDLoc dl(Op);
+ SDValue ExtractSrc = Op.getOperand(0);
+ EVT VecVT = EVT::getVectorVT(
+ *DAG.getContext(), DstVT.getScalarType(),
+ ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
+ SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
+ DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
+}
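// Worked example of the index arithmetic above (a sketch of the rewrite, not
// additional lowering code).  With DstVT = v2f32 (so M = 2) and the source
// being element 1 of a v2i64:
//
//   v2f32 bitcast (i64 extractelt v2i64 %src, i32 1)
//     NewIndex = index * M = 1 * 2 = 2,  VecVT = v4f32
//   ==> v2f32 extractsubvector (v4f32 bitcast v2i64 %src), i32 2
//
// so the i64 never round-trips through a GPR pair via VMOVRRD/VMOVDRR.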
+
/// ExpandBITCAST - If the target supports VFP, this function is called to
/// expand a bit convert where either the source or destination type is i64 to
/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
@@ -4172,6 +4203,11 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
// Turn i64->f64 into VMOVDRR.
if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
+ // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
+ // if we can combine the bitcast with its source.
+ if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
+ return Val;
+
SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
DAG.getConstant(0, dl, MVT::i32));
SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
@@ -4383,7 +4419,7 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
if (!ST->hasV6T2Ops())
return SDValue();
- SDValue rbit = DAG.getNode(ARMISD::RBIT, dl, VT, N->getOperand(0));
+ SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
}
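// The identity used above, as a self-contained scalar sketch: counting the
// trailing zeros of x equals counting the leading zeros of bit-reversed x,
// which is exactly RBIT followed by CLZ on ARMv6T2 and later.
#include <cstdint>

static unsigned cttz_via_rbit_clz(uint32_t X) {
  uint32_t R = 0;
  for (unsigned I = 0; I < 32; ++I)   // software RBIT
    R |= ((X >> I) & 1u) << (31 - I);
  unsigned LZ = 0;                    // software CLZ; X == 0 gives 32
  for (uint32_t Mask = 0x80000000u; Mask != 0 && (R & Mask) == 0; Mask >>= 1)
    ++LZ;
  return LZ;                          // == number of trailing zeros of X
}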
@@ -4544,8 +4580,7 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
"Unknown shift to lower!");
// We only lower SRA, SRL of 1 here, all others use generic lowering.
- if (!isa<ConstantSDNode>(N->getOperand(1)) ||
- cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != 1)
+ if (!isOneConstant(N->getOperand(1)))
return SDValue();
// If we are in thumb mode, we don't have RRX.
@@ -5036,18 +5071,56 @@ static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
return VT == MVT::v8i8 && M.size() == 8;
}
+// Checks whether the shuffle mask represents a vector transpose (VTRN) by
+// checking that pairs of elements in the shuffle mask represent the same index
+// in each vector, incrementing the expected index by 2 at each step.
+// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
+// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
+// v2={e,f,g,h}
+// WhichResult gives the offset for each element in the mask based on which
+// of the two results it belongs to.
+//
+// The transpose can be represented either as:
+// result1 = shufflevector v1, v2, result1_shuffle_mask
+// result2 = shufflevector v1, v2, result2_shuffle_mask
+// where v1/v2 and the shuffle masks have the same number of elements
+// (here WhichResult (see below) indicates which result is being checked)
+//
+// or as:
+// results = shufflevector v1, v2, shuffle_mask
+// where both results are returned in one vector and the shuffle mask has twice
+// as many elements as v1/v2 (here WhichResult will always be 0 if true); here we
+// want to check the low half and the high half of the shuffle mask as if each
+// were the mask for one of the two results.
static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned EltSz = VT.getVectorElementType().getSizeInBits();
if (EltSz == 64)
return false;
unsigned NumElts = VT.getVectorNumElements();
- WhichResult = (M[0] == 0 ? 0 : 1);
- for (unsigned i = 0; i < NumElts; i += 2) {
- if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) ||
- (M[i+1] >= 0 && (unsigned) M[i+1] != i + NumElts + WhichResult))
- return false;
+ if (M.size() != NumElts && M.size() != NumElts*2)
+ return false;
+
+ // If the mask is twice as long as the input vector then we need to check the
+ // upper and lower parts of the mask with a matching value for WhichResult
+ // FIXME: A mask with only even values will be rejected in case the first
+ // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
+ // M[0] is used to determine WhichResult
+ for (unsigned i = 0; i < M.size(); i += NumElts) {
+ if (M.size() == NumElts * 2)
+ WhichResult = i / NumElts;
+ else
+ WhichResult = M[i] == 0 ? 0 : 1;
+ for (unsigned j = 0; j < NumElts; j += 2) {
+ if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
+ (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
+ return false;
+ }
}
+
+ if (M.size() == NumElts*2)
+ WhichResult = 0;
+
return true;
}
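// Concrete masks the extended isVTRNMask now accepts for v4i32 operands
// (a worked example of the two representations described above):
//   separate results, M.size() == NumElts:
//     [0, 4, 2, 6] -> WhichResult = 0      [1, 5, 3, 7] -> WhichResult = 1
//   combined result, M.size() == 2 * NumElts:
//     [0, 4, 2, 6, 1, 5, 3, 7] -> both halves checked, WhichResult = 0
// An undef (-1) first element of a block still makes WhichResult default
// to 1, which is the FIXME noted above.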
@@ -5060,28 +5133,55 @@ static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
return false;
unsigned NumElts = VT.getVectorNumElements();
- WhichResult = (M[0] == 0 ? 0 : 1);
- for (unsigned i = 0; i < NumElts; i += 2) {
- if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) ||
- (M[i+1] >= 0 && (unsigned) M[i+1] != i + WhichResult))
- return false;
+ if (M.size() != NumElts && M.size() != NumElts*2)
+ return false;
+
+ for (unsigned i = 0; i < M.size(); i += NumElts) {
+ if (M.size() == NumElts * 2)
+ WhichResult = i / NumElts;
+ else
+ WhichResult = M[i] == 0 ? 0 : 1;
+ for (unsigned j = 0; j < NumElts; j += 2) {
+ if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
+ (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
+ return false;
+ }
}
+
+ if (M.size() == NumElts*2)
+ WhichResult = 0;
+
return true;
}
+// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
+// that the mask elements are either all even and in steps of size 2 or all odd
+// and in steps of size 2.
+// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
+// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
+// v2={e,f,g,h}
+// Requires checks similar to those of isVTRNMask with
+// respect to how the results are returned.
static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned EltSz = VT.getVectorElementType().getSizeInBits();
if (EltSz == 64)
return false;
unsigned NumElts = VT.getVectorNumElements();
- WhichResult = (M[0] == 0 ? 0 : 1);
- for (unsigned i = 0; i != NumElts; ++i) {
- if (M[i] < 0) continue; // ignore UNDEF indices
- if ((unsigned) M[i] != 2 * i + WhichResult)
- return false;
+ if (M.size() != NumElts && M.size() != NumElts*2)
+ return false;
+
+ for (unsigned i = 0; i < M.size(); i += NumElts) {
+ WhichResult = M[i] == 0 ? 0 : 1;
+ for (unsigned j = 0; j < NumElts; ++j) {
+ if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
+ return false;
+ }
}
+ if (M.size() == NumElts*2)
+ WhichResult = 0;
+
// VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
if (VT.is64BitVector() && EltSz == 32)
return false;
@@ -5097,18 +5197,27 @@ static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
if (EltSz == 64)
return false;
- unsigned Half = VT.getVectorNumElements() / 2;
- WhichResult = (M[0] == 0 ? 0 : 1);
- for (unsigned j = 0; j != 2; ++j) {
- unsigned Idx = WhichResult;
- for (unsigned i = 0; i != Half; ++i) {
- int MIdx = M[i + j * Half];
- if (MIdx >= 0 && (unsigned) MIdx != Idx)
- return false;
- Idx += 2;
+ unsigned NumElts = VT.getVectorNumElements();
+ if (M.size() != NumElts && M.size() != NumElts*2)
+ return false;
+
+ unsigned Half = NumElts / 2;
+ for (unsigned i = 0; i < M.size(); i += NumElts) {
+ WhichResult = M[i] == 0 ? 0 : 1;
+ for (unsigned j = 0; j < NumElts; j += Half) {
+ unsigned Idx = WhichResult;
+ for (unsigned k = 0; k < Half; ++k) {
+ int MIdx = M[i + j + k];
+ if (MIdx >= 0 && (unsigned) MIdx != Idx)
+ return false;
+ Idx += 2;
+ }
}
}
+ if (M.size() == NumElts*2)
+ WhichResult = 0;
+
// VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
if (VT.is64BitVector() && EltSz == 32)
return false;
@@ -5116,21 +5225,37 @@ static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
return true;
}
+// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
+// that pairs of elements of the shuffle mask represent the same index in each
+// vector, incrementing sequentially through the vectors.
+// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
+// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
+// v2={e,f,g,h}
+// Requires similar checks to that of isVTRNMask with respect the how results
+// are returned.
static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned EltSz = VT.getVectorElementType().getSizeInBits();
if (EltSz == 64)
return false;
unsigned NumElts = VT.getVectorNumElements();
- WhichResult = (M[0] == 0 ? 0 : 1);
- unsigned Idx = WhichResult * NumElts / 2;
- for (unsigned i = 0; i != NumElts; i += 2) {
- if ((M[i] >= 0 && (unsigned) M[i] != Idx) ||
- (M[i+1] >= 0 && (unsigned) M[i+1] != Idx + NumElts))
- return false;
- Idx += 1;
+ if (M.size() != NumElts && M.size() != NumElts*2)
+ return false;
+
+ for (unsigned i = 0; i < M.size(); i += NumElts) {
+ WhichResult = M[i] == 0 ? 0 : 1;
+ unsigned Idx = WhichResult * NumElts / 2;
+ for (unsigned j = 0; j < NumElts; j += 2) {
+ if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
+ (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
+ return false;
+ Idx += 1;
+ }
}
+ if (M.size() == NumElts*2)
+ WhichResult = 0;
+
// VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
if (VT.is64BitVector() && EltSz == 32)
return false;
@@ -5147,15 +5272,23 @@ static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
return false;
unsigned NumElts = VT.getVectorNumElements();
- WhichResult = (M[0] == 0 ? 0 : 1);
- unsigned Idx = WhichResult * NumElts / 2;
- for (unsigned i = 0; i != NumElts; i += 2) {
- if ((M[i] >= 0 && (unsigned) M[i] != Idx) ||
- (M[i+1] >= 0 && (unsigned) M[i+1] != Idx))
- return false;
- Idx += 1;
+ if (M.size() != NumElts && M.size() != NumElts*2)
+ return false;
+
+ for (unsigned i = 0; i < M.size(); i += NumElts) {
+ WhichResult = M[i] == 0 ? 0 : 1;
+ unsigned Idx = WhichResult * NumElts / 2;
+ for (unsigned j = 0; j < NumElts; j += 2) {
+ if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
+ (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
+ return false;
+ Idx += 1;
+ }
}
+ if (M.size() == NumElts*2)
+ WhichResult = 0;
+
// VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
if (VT.is64BitVector() && EltSz == 32)
return false;
@@ -5329,16 +5462,14 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
// just use VDUPLANE. We can only do this if the lane being extracted
// is at a constant index, as the VDUP from lane instructions only have
// constant-index forms.
+ ConstantSDNode *constIndex;
if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
- isa<ConstantSDNode>(Value->getOperand(1))) {
+ (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
// We need to create a new undef vector to use for the VDUPLANE if the
// size of the vector from which we get the value is different than the
// size of the vector that we need to create. We will insert the element
// such that the register coalescer will remove unnecessary copies.
if (VT != Value->getOperand(0).getValueType()) {
- ConstantSDNode *constIndex;
- constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1));
- assert(constIndex && "The index is not a constant!");
unsigned index = constIndex->getAPIntValue().getLimitedValue() %
VT.getVectorNumElements();
N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
@@ -5437,14 +5568,35 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
// shuffle in combination with VEXTs.
SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
SelectionDAG &DAG) const {
+ assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
SDLoc dl(Op);
EVT VT = Op.getValueType();
unsigned NumElts = VT.getVectorNumElements();
- SmallVector<SDValue, 2> SourceVecs;
- SmallVector<unsigned, 2> MinElts;
- SmallVector<unsigned, 2> MaxElts;
+ struct ShuffleSourceInfo {
+ SDValue Vec;
+ unsigned MinElt;
+ unsigned MaxElt;
+
+ // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
+ // be compatible with the shuffle we intend to construct. As a result
+ // ShuffleVec will be some sliding window into the original Vec.
+ SDValue ShuffleVec;
+
+ // Code should guarantee that element i in Vec starts at element
+ // "WindowBase + i * WindowScale" in ShuffleVec.
+ int WindowBase;
+ int WindowScale;
+
+ bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
+ ShuffleSourceInfo(SDValue Vec)
+ : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0),
+ WindowScale(1) {}
+ };
+ // First gather all vectors used as an immediate source for this BUILD_VECTOR
+ // node.
+ SmallVector<ShuffleSourceInfo, 2> Sources;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (V.getOpcode() == ISD::UNDEF)
@@ -5453,127 +5605,166 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
// A shuffle can only come from building a vector from various
// elements of other vectors.
return SDValue();
- } else if (V.getOperand(0).getValueType().getVectorElementType() !=
- VT.getVectorElementType()) {
- // This code doesn't know how to handle shuffles where the vector
- // element types do not match (this happens because type legalization
- // promotes the return type of EXTRACT_VECTOR_ELT).
- // FIXME: It might be appropriate to extend this code to handle
- // mismatched types.
+ } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
+ // Furthermore, shuffles require a constant mask, whereas extractelts
+ // accept variable indices.
return SDValue();
}
- // Record this extraction against the appropriate vector if possible...
+ // Add this element source to the list if it's not already there.
SDValue SourceVec = V.getOperand(0);
- // If the element number isn't a constant, we can't effectively
- // analyze what's going on.
- if (!isa<ConstantSDNode>(V.getOperand(1)))
- return SDValue();
- unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
- bool FoundSource = false;
- for (unsigned j = 0; j < SourceVecs.size(); ++j) {
- if (SourceVecs[j] == SourceVec) {
- if (MinElts[j] > EltNo)
- MinElts[j] = EltNo;
- if (MaxElts[j] < EltNo)
- MaxElts[j] = EltNo;
- FoundSource = true;
- break;
- }
- }
+ auto Source = std::find(Sources.begin(), Sources.end(), SourceVec);
+ if (Source == Sources.end())
+ Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
- // Or record a new source if not...
- if (!FoundSource) {
- SourceVecs.push_back(SourceVec);
- MinElts.push_back(EltNo);
- MaxElts.push_back(EltNo);
- }
+ // Update the minimum and maximum lane number seen.
+ unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
+ Source->MinElt = std::min(Source->MinElt, EltNo);
+ Source->MaxElt = std::max(Source->MaxElt, EltNo);
}
// Currently only do something sane when at most two source vectors
- // involved.
- if (SourceVecs.size() > 2)
+ // are involved.
+ if (Sources.size() > 2)
return SDValue();
- SDValue ShuffleSrcs[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
- int VEXTOffsets[2] = {0, 0};
+ // Find out the smallest element size among result and two sources, and use
+ // it as element size to build the shuffle_vector.
+ EVT SmallestEltTy = VT.getVectorElementType();
+ for (auto &Source : Sources) {
+ EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
+ if (SrcEltTy.bitsLT(SmallestEltTy))
+ SmallestEltTy = SrcEltTy;
+ }
+ unsigned ResMultiplier =
+ VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits();
+ NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
+ EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
+
+ // If the source vector is too wide or too narrow, we may nevertheless be able
+ // to construct a compatible shuffle either by concatenating it with UNDEF or
+ // extracting a suitable range of elements.
+ for (auto &Src : Sources) {
+ EVT SrcVT = Src.ShuffleVec.getValueType();
+
+ if (SrcVT.getSizeInBits() == VT.getSizeInBits())
+ continue;
+
+ // This stage of the search produces a source with the same element type as
+ // the original, but with a total width matching the BUILD_VECTOR output.
+ EVT EltVT = SrcVT.getVectorElementType();
+ unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
+ EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
- // This loop extracts the usage patterns of the source vectors
- // and prepares appropriate SDValues for a shuffle if possible.
- for (unsigned i = 0; i < SourceVecs.size(); ++i) {
- if (SourceVecs[i].getValueType() == VT) {
- // No VEXT necessary
- ShuffleSrcs[i] = SourceVecs[i];
- VEXTOffsets[i] = 0;
+ if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
+ if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits())
+ return SDValue();
+ // We can pad out the smaller vector for free, so if it's part of a
+ // shuffle...
+ Src.ShuffleVec =
+ DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
+ DAG.getUNDEF(Src.ShuffleVec.getValueType()));
continue;
- } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) {
- // It probably isn't worth padding out a smaller vector just to
- // break it down again in a shuffle.
- return SDValue();
}
- // Since only 64-bit and 128-bit vectors are legal on ARM and
- // we've eliminated the other cases...
- assert(SourceVecs[i].getValueType().getVectorNumElements() == 2*NumElts &&
- "unexpected vector sizes in ReconstructShuffle");
+ if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits())
+ return SDValue();
- if (MaxElts[i] - MinElts[i] >= NumElts) {
+ if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
// Span too large for a VEXT to cope
return SDValue();
}
- if (MinElts[i] >= NumElts) {
+ if (Src.MinElt >= NumSrcElts) {
// The extraction can just take the second half
- VEXTOffsets[i] = NumElts;
- ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
- SourceVecs[i],
- DAG.getIntPtrConstant(NumElts, dl));
- } else if (MaxElts[i] < NumElts) {
+ Src.ShuffleVec =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+ DAG.getConstant(NumSrcElts, dl, MVT::i32));
+ Src.WindowBase = -NumSrcElts;
+ } else if (Src.MaxElt < NumSrcElts) {
// The extraction can just take the first half
- VEXTOffsets[i] = 0;
- ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
- SourceVecs[i],
- DAG.getIntPtrConstant(0, dl));
+ Src.ShuffleVec =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+ DAG.getConstant(0, dl, MVT::i32));
} else {
// An actual VEXT is needed
- VEXTOffsets[i] = MinElts[i];
- SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
- SourceVecs[i],
- DAG.getIntPtrConstant(0, dl));
- SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
- SourceVecs[i],
- DAG.getIntPtrConstant(NumElts, dl));
- ShuffleSrcs[i] = DAG.getNode(ARMISD::VEXT, dl, VT, VEXTSrc1, VEXTSrc2,
- DAG.getConstant(VEXTOffsets[i], dl,
- MVT::i32));
- }
- }
-
- SmallVector<int, 8> Mask;
-
- for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue VEXTSrc1 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+ DAG.getConstant(0, dl, MVT::i32));
+ SDValue VEXTSrc2 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+ DAG.getConstant(NumSrcElts, dl, MVT::i32));
+
+ Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
+ VEXTSrc2,
+ DAG.getConstant(Src.MinElt, dl, MVT::i32));
+ Src.WindowBase = -Src.MinElt;
+ }
+ }
+
+ // Another possible incompatibility occurs from the vector element types. We
+ // can fix this by bitcasting the source vectors to the same type we intend
+ // for the shuffle.
+ for (auto &Src : Sources) {
+ EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
+ if (SrcEltTy == SmallestEltTy)
+ continue;
+ assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
+ Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
+ Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
+ Src.WindowBase *= Src.WindowScale;
+ }
+
+ // Final sanity check before we try to actually produce a shuffle.
+ DEBUG(
+ for (auto Src : Sources)
+ assert(Src.ShuffleVec.getValueType() == ShuffleVT);
+ );
+
+ // The stars all align, our next step is to produce the mask for the shuffle.
+ SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
+ int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits();
+ for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
SDValue Entry = Op.getOperand(i);
- if (Entry.getOpcode() == ISD::UNDEF) {
- Mask.push_back(-1);
+ if (Entry.getOpcode() == ISD::UNDEF)
continue;
- }
- SDValue ExtractVec = Entry.getOperand(0);
- int ExtractElt = cast<ConstantSDNode>(Op.getOperand(i)
- .getOperand(1))->getSExtValue();
- if (ExtractVec == SourceVecs[0]) {
- Mask.push_back(ExtractElt - VEXTOffsets[0]);
- } else {
- Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]);
- }
+ auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0));
+ int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
+
+ // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
+ // trunc. So only the low std::min(SrcBits, DestBits) bits actually get
+ // defined in this segment.
+ EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
+ int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
+ VT.getVectorElementType().getSizeInBits());
+ int LanesDefined = BitsDefined / BitsPerShuffleLane;
+
+ // This source is expected to fill ResMultiplier lanes of the final shuffle,
+ // starting at the appropriate offset.
+ int *LaneMask = &Mask[i * ResMultiplier];
+
+ int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
+ ExtractBase += NumElts * (Src - Sources.begin());
+ for (int j = 0; j < LanesDefined; ++j)
+ LaneMask[j] = ExtractBase + j;
}
// Final check before we try to produce nonsense...
- if (isShuffleMaskLegal(Mask, VT))
- return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1],
- &Mask[0]);
+ if (!isShuffleMaskLegal(Mask, ShuffleVT))
+ return SDValue();
- return SDValue();
+ // We can't handle more than two sources. This should have already
+ // been checked before this point.
+ assert(Sources.size() <= 2 && "Too many sources!");
+
+ SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
+ for (unsigned i = 0; i < Sources.size(); ++i)
+ ShuffleOps[i] = Sources[i].ShuffleVec;
+
+ SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
+ ShuffleOps[1], &Mask[0]);
+ return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
}
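A minimal sketch of the lane bookkeeping used in the mask-building loop above (plain C++, not LLVM code; the function and parameter names merely mirror the fields of ShuffleSourceInfo and are otherwise hypothetical):

#include <vector>

// Illustrative only: how one BUILD_VECTOR operand is mapped onto lanes of the
// widened shuffle type.
static void fillLanesForElement(std::vector<int> &Mask, unsigned OutputIndex,
                                unsigned ResMultiplier, unsigned SourceIndex,
                                unsigned NumShuffleElts, int EltNo,
                                int WindowBase, int WindowScale,
                                int LanesDefined) {
  // First shuffle lane occupied by this source element.
  int ExtractBase = EltNo * WindowScale + WindowBase;
  // Lanes of the second source are numbered after the first source's lanes.
  ExtractBase += NumShuffleElts * SourceIndex;
  // The element spans ResMultiplier lanes of the result, of which only
  // LanesDefined carry defined bits; the rest stay -1 (undef).
  int *LaneMask = &Mask[OutputIndex * ResMultiplier];
  for (int j = 0; j < LanesDefined; ++j)
    LaneMask[j] = ExtractBase + j;
}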
/// isShuffleMaskLegal - Targets can use this to indicate that they only
@@ -6235,6 +6426,8 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
static SDValue
LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) {
+ // TODO: Should this propagate fast-math-flags?
+
// Convert to float
// float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
// float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
@@ -6265,6 +6458,8 @@ LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) {
static SDValue
LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) {
+ // TODO: Should this propagate fast-math-flags?
+
SDValue N2;
// Convert to float.
// float4 yf = vcvt_f32_s32(vmovl_s16(y));
@@ -6337,6 +6532,7 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
}
static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
+ // TODO: Should this propagate fast-math-flags?
EVT VT = Op.getValueType();
assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
"unexpected type for custom-lowering ISD::UDIV");
@@ -6445,45 +6641,56 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
auto PtrVT = getPointerTy(DAG.getDataLayout());
MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Pair of floats / doubles used to pass the result.
- StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
-
- // Create stack object for sret.
+ Type *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
auto &DL = DAG.getDataLayout();
- const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
- const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy);
- int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false);
- SDValue SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL));
ArgListTy Args;
- ArgListEntry Entry;
-
- Entry.Node = SRet;
- Entry.Ty = RetTy->getPointerTo();
- Entry.isSExt = false;
- Entry.isZExt = false;
- Entry.isSRet = true;
- Args.push_back(Entry);
+ bool ShouldUseSRet = Subtarget->isAPCS_ABI();
+ SDValue SRet;
+ if (ShouldUseSRet) {
+ // Create stack object for sret.
+ const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
+ const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy);
+ int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false);
+ SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));
+
+ ArgListEntry Entry;
+ Entry.Node = SRet;
+ Entry.Ty = RetTy->getPointerTo();
+ Entry.isSExt = false;
+ Entry.isZExt = false;
+ Entry.isSRet = true;
+ Args.push_back(Entry);
+ RetTy = Type::getVoidTy(*DAG.getContext());
+ }
+ ArgListEntry Entry;
Entry.Node = Arg;
Entry.Ty = ArgTy;
Entry.isSExt = false;
Entry.isZExt = false;
Args.push_back(Entry);
- const char *LibcallName = (ArgVT == MVT::f64)
- ? "__sincos_stret" : "__sincosf_stret";
+ const char *LibcallName =
+ (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
+ RTLIB::Libcall LC =
+ (ArgVT == MVT::f64) ? RTLIB::SINCOS_F64 : RTLIB::SINCOS_F32;
+ CallingConv::ID CC = getLibcallCallingConv(LC);
SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
- .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), Callee,
- std::move(Args), 0)
- .setDiscardResult();
-
+ CLI.setDebugLoc(dl)
+ .setChain(DAG.getEntryNode())
+ .setCallee(CC, RetTy, Callee, std::move(Args), 0)
+ .setDiscardResult(ShouldUseSRet);
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+ if (!ShouldUseSRet)
+ return CallResult.first;
+
SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet,
MachinePointerInfo(), false, false, false, 0);
@@ -6498,6 +6705,85 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
LoadSin.getValue(0), LoadCos.getValue(0));
}
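For orientation, a hedged source-level sketch of the two call shapes LowerFSINCOS chooses between; the struct and helpers below are hypothetical stand-ins, not the runtime's actual __sincos_stret declarations:

#include <cmath>

// Illustrative only: on APCS the pair is returned through an sret pointer to a
// caller-provided stack slot, otherwise it comes back by value.
struct SinCosF { float Sin, Cos; };

// APCS-style: caller passes a stack object and the callee stores through it.
static void sincosf_sret(SinCosF *Ret, float X) {
  Ret->Sin = std::sin(X);
  Ret->Cos = std::cos(X);
}

// AAPCS-style: the pair is returned directly, no stack object needed.
static SinCosF sincosf_value(float X) {
  return {std::sin(X), std::cos(X)};
}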
+SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
+ bool Signed,
+ SDValue &Chain) const {
+ EVT VT = Op.getValueType();
+ assert((VT == MVT::i32 || VT == MVT::i64) &&
+ "unexpected type for custom lowering DIV");
+ SDLoc dl(Op);
+
+ const auto &DL = DAG.getDataLayout();
+ const auto &TLI = DAG.getTargetLoweringInfo();
+
+ const char *Name = nullptr;
+ if (Signed)
+ Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";
+ else
+ Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";
+
+ SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL));
+
+ ARMTargetLowering::ArgListTy Args;
+
+ for (auto AI : {1, 0}) {
+ ArgListEntry Arg;
+ Arg.Node = Op.getOperand(AI);
+ Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());
+ Args.push_back(Arg);
+ }
+
+ CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()),
+ ES, std::move(Args), 0);
+
+ return LowerCallTo(CLI).first;
+}
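A small sketch of why the loop above pushes operand 1 before operand 0: the Windows division helpers take the divisor first. The helper below is a hypothetical stand-in defined locally so the sketch is self-contained, not the runtime's real prototype.

// Hypothetical stand-in for the __rt_sdiv-style helper.
static int hypothetical_rt_sdiv(int Divisor, int Dividend) {
  return Dividend / Divisor;
}

static int divideViaHelper(int A, int B) {
  // Matches the {1, 0} operand order used when building the ArgList above:
  // the divisor travels first, the dividend second.
  return hypothetical_rt_sdiv(/*Divisor=*/B, /*Dividend=*/A);
}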
+
+SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
+ bool Signed) const {
+ assert(Op.getValueType() == MVT::i32 &&
+ "unexpected type for custom lowering DIV");
+ SDLoc dl(Op);
+
+ SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
+ DAG.getEntryNode(), Op.getOperand(1));
+
+ return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
+}
+
+void ARMTargetLowering::ExpandDIV_Windows(
+ SDValue Op, SelectionDAG &DAG, bool Signed,
+ SmallVectorImpl<SDValue> &Results) const {
+ const auto &DL = DAG.getDataLayout();
+ const auto &TLI = DAG.getTargetLoweringInfo();
+
+ assert(Op.getValueType() == MVT::i64 &&
+ "unexpected type for custom lowering DIV");
+ SDLoc dl(Op);
+
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op.getOperand(1),
+ DAG.getConstant(0, dl, MVT::i32));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op.getOperand(1),
+ DAG.getConstant(1, dl, MVT::i32));
+ SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i32, Lo, Hi);
+
+ SDValue DBZCHK =
+ DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, DAG.getEntryNode(), Or);
+
+ SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
+
+ SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
+ SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
+ DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
+ Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
+
+ Results.push_back(Lower);
+ Results.push_back(Upper);
+}
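The scalar arithmetic behind ExpandDIV_Windows, as a standalone sketch (plain C++, using 64-bit unsigned values for simplicity):

#include <cstdint>
#include <utility>

// Illustrative only: the divisor's halves are OR'ed together purely to test
// for zero (what the WIN__DBZCHK node guards against), and the 64-bit result
// is handed back as two 32-bit pieces - low via truncation, high via a 32-bit
// shift - matching the Lower/Upper values pushed into Results above.
static std::pair<uint32_t, uint32_t> splitForDiv(uint64_t Divisor,
                                                 uint64_t Result,
                                                 bool &DivisorIsZero) {
  uint32_t Lo = static_cast<uint32_t>(Divisor);
  uint32_t Hi = static_cast<uint32_t>(Divisor >> 32);
  DivisorIsZero = (Lo | Hi) == 0;
  uint32_t Lower = static_cast<uint32_t>(Result);
  uint32_t Upper = static_cast<uint32_t>(Result >> 32);
  return {Lower, Upper};
}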
+
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
// Monotonic load/store is legal for all targets
if (cast<AtomicSDNode>(Op)->getOrdering() <= Monotonic)
@@ -6513,36 +6799,22 @@ static void ReplaceREADCYCLECOUNTER(SDNode *N,
SelectionDAG &DAG,
const ARMSubtarget *Subtarget) {
SDLoc DL(N);
- SDValue Cycles32, OutChain;
-
- if (Subtarget->hasPerfMon()) {
- // Under Power Management extensions, the cycle-count is:
- // mrc p15, #0, <Rt>, c9, c13, #0
- SDValue Ops[] = { N->getOperand(0), // Chain
- DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
- DAG.getConstant(15, DL, MVT::i32),
- DAG.getConstant(0, DL, MVT::i32),
- DAG.getConstant(9, DL, MVT::i32),
- DAG.getConstant(13, DL, MVT::i32),
- DAG.getConstant(0, DL, MVT::i32)
- };
-
- Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
- DAG.getVTList(MVT::i32, MVT::Other), Ops);
- OutChain = Cycles32.getValue(1);
- } else {
- // Intrinsic is defined to return 0 on unsupported platforms. Technically
- // there are older ARM CPUs that have implementation-specific ways of
- // obtaining this information (FIXME!).
- Cycles32 = DAG.getConstant(0, DL, MVT::i32);
- OutChain = DAG.getEntryNode();
- }
-
+ // Under Power Management extensions, the cycle-count is:
+ // mrc p15, #0, <Rt>, c9, c13, #0
+ SDValue Ops[] = { N->getOperand(0), // Chain
+ DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
+ DAG.getConstant(15, DL, MVT::i32),
+ DAG.getConstant(0, DL, MVT::i32),
+ DAG.getConstant(9, DL, MVT::i32),
+ DAG.getConstant(13, DL, MVT::i32),
+ DAG.getConstant(0, DL, MVT::i32)
+ };
- SDValue Cycles64 = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64,
- Cycles32, DAG.getConstant(0, DL, MVT::i32));
- Results.push_back(Cycles64);
- Results.push_back(OutChain);
+ SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
+ DAG.getVTList(MVT::i32, MVT::Other), Ops);
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
+ DAG.getConstant(0, DL, MVT::i32)));
+ Results.push_back(Cycles32.getValue(1));
}
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
@@ -6576,15 +6848,17 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
- case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
+ case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
Subtarget);
case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG);
case ISD::SHL:
case ISD::SRL:
case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
+ case ISD::SREM: return LowerREM(Op.getNode(), DAG);
+ case ISD::UREM: return LowerREM(Op.getNode(), DAG);
case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
case ISD::SRL_PARTS:
case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
@@ -6622,13 +6896,14 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("Don't know how to custom lower this!");
case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
+ case ARMISD::WIN__DBZCHK: return SDValue();
}
}
/// ReplaceNodeResults - Replace the results of node with an illegal result
/// type with new values built out of custom code.
void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
- SmallVectorImpl<SDValue>&Results,
+ SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
SDValue Res;
switch (N->getOpcode()) {
@@ -6644,9 +6919,18 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::SRA:
Res = Expand64BitShift(N, DAG, Subtarget);
break;
+ case ISD::SREM:
+ case ISD::UREM:
+ Res = LowerREM(N, DAG);
+ break;
case ISD::READCYCLECOUNTER:
ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
return;
+ case ISD::UDIV:
+ case ISD::SDIV:
+ assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
+ return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
+ Results);
}
if (Res.getNode())
Results.push_back(Res);
@@ -6683,12 +6967,12 @@ SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
// Grab constant pool and fixed stack memory operands.
MachineMemOperand *CPMMO =
- MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(),
- MachineMemOperand::MOLoad, 4, 4);
+ MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
+ MachineMemOperand::MOLoad, 4, 4);
MachineMemOperand *FIMMOSt =
- MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
- MachineMemOperand::MOStore, 4, 4);
+ MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
+ MachineMemOperand::MOStore, 4, 4);
// Load the address of the dispatch MBB into the jump buffer.
if (isThumb2) {
@@ -6792,7 +7076,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
MachineModuleInfo &MMI = MF->getMMI();
for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
++BB) {
- if (!BB->isLandingPad()) continue;
+ if (!BB->isEHPad()) continue;
// FIXME: We should assert that the EH_LABEL is the first MI in the landing
// pad.
@@ -6807,7 +7091,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
for (SmallVectorImpl<unsigned>::iterator
CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
CSI != CSE; ++CSI) {
- CallSiteNumToLPad[*CSI].push_back(BB);
+ CallSiteNumToLPad[*CSI].push_back(&*BB);
MaxCSNum = std::max(MaxCSNum, *CSI);
}
break;
@@ -6840,7 +7124,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
// Shove the dispatch's address into the return slot in the function context.
MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
- DispatchBB->setIsLandingPad();
+ DispatchBB->setIsEHPad();
MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
unsigned trap_opcode;
@@ -6864,10 +7148,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
// context.
SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
- MachineMemOperand *FIMMOLd =
- MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
- MachineMemOperand::MOLoad |
- MachineMemOperand::MOVolatile, 4, 4);
+ MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FI),
+ MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4);
MachineInstrBuilder MIB;
MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
@@ -6982,9 +7265,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
.addReg(NewVReg2, RegState::Kill)
.addReg(NewVReg3));
- MachineMemOperand *JTMMOLd =
- MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(),
- MachineMemOperand::MOLoad, 4, 4);
+ MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
+ MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
@@ -7066,9 +7348,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
.addJumpTableIndex(MJTI));
- MachineMemOperand *JTMMOLd =
- MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(),
- MachineMemOperand::MOLoad, 4, 4);
+ MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
+ MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
AddDefaultPred(
BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
@@ -7109,13 +7390,14 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
BB->succ_end());
while (!Successors.empty()) {
MachineBasicBlock *SMBB = Successors.pop_back_val();
- if (SMBB->isLandingPad()) {
+ if (SMBB->isEHPad()) {
BB->removeSuccessor(SMBB);
MBBLPads.push_back(SMBB);
}
}
- BB->addSuccessor(DispatchBB);
+ BB->addSuccessor(DispatchBB, BranchProbability::getZero());
+ BB->normalizeSuccProbs();
// Find the invoke call and mark all of the callee-saved registers as
// 'implicit defined' so that they're spilled. This prevents code from
@@ -7157,7 +7439,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
// landing pad now.
for (SmallVectorImpl<MachineBasicBlock*>::iterator
I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I)
- (*I)->setIsLandingPad(false);
+ (*I)->setIsEHPad(false);
// The instruction is gone now.
MI->eraseFromParent();
@@ -7280,8 +7562,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI,
// Otherwise, we will generate unrolled scalar copies.
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator It = BB;
- ++It;
+ MachineFunction::iterator It = ++BB->getIterator();
unsigned dest = MI->getOperand(0).getReg();
unsigned src = MI->getOperand(1).getReg();
@@ -7574,6 +7855,32 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI,
}
MachineBasicBlock *
+ARMTargetLowering::EmitLowered__dbzchk(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ DebugLoc DL = MI->getDebugLoc();
+ MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+
+ MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
+ MF->push_back(ContBB);
+ ContBB->splice(ContBB->begin(), MBB,
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ MBB->addSuccessor(ContBB);
+
+ MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
+ MF->push_back(TrapBB);
+ BuildMI(TrapBB, DL, TII->get(ARM::t2UDF)).addImm(249);
+ MBB->addSuccessor(TrapBB);
+
+ BuildMI(*MBB, MI, DL, TII->get(ARM::tCBZ))
+ .addReg(MI->getOperand(0).getReg())
+ .addMBB(TrapBB);
+
+ MI->eraseFromParent();
+ return ContBB;
+}
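The control flow EmitLowered__dbzchk builds, sketched as ordinary C++ (illustrative only; __builtin_trap stands in for the trap block's UDF #249):

// Illustrative only: the guarded register is compared against zero (tCBZ); the
// zero path reaches a trap block, and the non-zero path continues in ContBB.
static void dbzchkShape(unsigned GuardedReg) {
  if (GuardedReg == 0) {
    // TrapBB: never returns.
    __builtin_trap();
  }
  // ContBB: the remainder of the original block resumes here.
}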
+
+MachineBasicBlock *
ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *BB) const {
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
@@ -7643,8 +7950,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// destination vreg to set, the condition code register to branch on, the
// true/false values to select between, and a branch opcode to use.
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator It = BB;
- ++It;
+ MachineFunction::iterator It = ++BB->getIterator();
// thisMBB:
// ...
@@ -7741,6 +8047,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case ARM::tInt_eh_sjlj_setjmp:
case ARM::t2Int_eh_sjlj_setjmp:
case ARM::t2Int_eh_sjlj_setjmp_nofp:
+ return BB;
+
+ case ARM::Int_eh_sjlj_setup_dispatch:
EmitSjLjDispatchBlock(MI, BB);
return BB;
@@ -7759,8 +8068,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0)
// SinkBB: V1 = PHI(V2, V3)
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator BBI = BB;
- ++BBI;
+ MachineFunction::iterator BBI = ++BB->getIterator();
MachineFunction *Fn = BB->getParent();
MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB);
@@ -7824,11 +8132,46 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
return EmitStructByval(MI, BB);
case ARM::WIN__CHKSTK:
return EmitLowered__chkstk(MI, BB);
+ case ARM::WIN__DBZCHK:
+ return EmitLowered__dbzchk(MI, BB);
+ }
+}
+
+/// \brief Attaches vregs to MEMCPY that it will use as scratch registers
+/// when it is expanded into LDM/STM. This is done as a post-isel lowering
+/// instead of as a custom inserter because we need the use list from the SDNode.
+static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
+ MachineInstr *MI, const SDNode *Node) {
+ bool isThumb1 = Subtarget->isThumb1Only();
+
+ DebugLoc DL = MI->getDebugLoc();
+ MachineFunction *MF = MI->getParent()->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineInstrBuilder MIB(*MF, MI);
+
+ // If the new dst/src is unused mark it as dead.
+ if (!Node->hasAnyUseOfValue(0)) {
+ MI->getOperand(0).setIsDead(true);
+ }
+ if (!Node->hasAnyUseOfValue(1)) {
+ MI->getOperand(1).setIsDead(true);
+ }
+
+ // The MEMCPY both defines and kills the scratch registers.
+ for (unsigned I = 0; I != MI->getOperand(4).getImm(); ++I) {
+ unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
+ : &ARM::GPRRegClass);
+ MIB.addReg(TmpReg, RegState::Define|RegState::Dead);
}
}
void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
SDNode *Node) const {
+ if (MI->getOpcode() == ARM::MEMCPY) {
+ attachMEMCPYScratchRegs(Subtarget, MI, Node);
+ return;
+ }
+
const MCInstrDesc *MCID = &MI->getDesc();
// Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
// RSC. Coming out of isel, they have an implicit CPSR def, but the optional
@@ -7898,10 +8241,7 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
// Helper function that checks if N is a null or all ones constant.
static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
- if (!C)
- return false;
- return AllOnes ? C->isAllOnesValue() : C->isNullValue();
+ return AllOnes ? isAllOnesConstant(N) : isNullConstant(N);
}
// Return true if N is conditionally 0 or all ones.
@@ -8723,12 +9063,88 @@ static SDValue PerformXORCombine(SDNode *N,
return SDValue();
}
-/// PerformBFICombine - (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
-/// the bits being cleared by the AND are not demanded by the BFI.
+// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
+// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
+// their position in "to" (Rd).
+static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
+ assert(N->getOpcode() == ARMISD::BFI);
+
+ SDValue From = N->getOperand(1);
+ ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
+ FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation());
+
+ // If the Base came from a SHR #C, we can deduce that it is really testing bit
+ // #C in the base of the SHR.
+ if (From->getOpcode() == ISD::SRL &&
+ isa<ConstantSDNode>(From->getOperand(1))) {
+ APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue();
+ assert(Shift.getLimitedValue() < 32 && "Shift too large!");
+ FromMask <<= Shift.getLimitedValue(31);
+ From = From->getOperand(0);
+ }
+
+ return From;
+}
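A standalone sketch of the bitfield-insert semantics that ToMask/FromMask describe (plain C++20, not LLVM code; InvMask plays the role of the BFI node's inverted-mask operand):

#include <bit>
#include <cstdint>

// Illustrative only: the bits that are clear in InvMask form one contiguous
// run of Rd that gets replaced with the low bits of Rn - exactly the two masks
// ParseBFI recovers.
static uint32_t bfi(uint32_t Rd, uint32_t Rn, uint32_t InvMask) {
  uint32_t ToMask = ~InvMask;                 // bits of Rd being written
  if (ToMask == 0)
    return Rd;                                // nothing inserted
  unsigned Lsb = std::countr_zero(ToMask);    // where the run starts
  unsigned Width = std::popcount(ToMask);     // how long the run is
  uint32_t FromMask =
      Width == 32 ? ~0u : ((1u << Width) - 1u); // bits of Rn being read
  return (Rd & InvMask) | ((Rn & FromMask) << Lsb);
}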
+
+// If A and B each contain one contiguous run of set bits, does A | B form a
+// single contiguous run (i.e. is A | B the concatenation A . B)?
+//
+// Neither A nor B may be zero.
+static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
+ unsigned LastActiveBitInA = A.countTrailingZeros();
+ unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1;
+ return LastActiveBitInA - 1 == FirstActiveBitInB;
+}
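The same adjacency test on plain 32-bit values, as a worked sketch (C++20 <bit>; both inputs assumed non-zero, as above):

#include <bit>
#include <cstdint>

// Illustrative only: true exactly when B's highest set bit sits one position
// below A's lowest set bit, e.g. A = 0b1100 and B = 0b0011 concatenate, while
// A = 0b1100 and B = 0b0001 (a gap at bit 1) do not.
static bool bitsProperlyConcatenate(uint32_t A, uint32_t B) {
  unsigned LowestBitOfA = std::countr_zero(A);
  unsigned HighestBitOfB = std::bit_width(B) - 1;
  return LowestBitOfA - 1 == HighestBitOfB;
}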
+
+static SDValue FindBFIToCombineWith(SDNode *N) {
+ // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with,
+ // if one exists.
+ APInt ToMask, FromMask;
+ SDValue From = ParseBFI(N, ToMask, FromMask);
+ SDValue To = N->getOperand(0);
+
+ // Now check for a compatible BFI to merge with. We can pass through BFIs that
+ // aren't compatible, but not if they set the same bit in their destination as
+ // we do (or that of any BFI we're going to combine with).
+ SDValue V = To;
+ APInt CombinedToMask = ToMask;
+ while (V.getOpcode() == ARMISD::BFI) {
+ APInt NewToMask, NewFromMask;
+ SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
+ if (NewFrom != From) {
+ // This BFI has a different base. Keep going.
+ CombinedToMask |= NewToMask;
+ V = V.getOperand(0);
+ continue;
+ }
+
+ // Do the written bits conflict with any we've seen so far?
+ if ((NewToMask & CombinedToMask).getBoolValue())
+ // Conflicting bits - bail out because going further is unsafe.
+ return SDValue();
+
+ // Are the new bits contiguous when combined with the old bits?
+ if (BitsProperlyConcatenate(ToMask, NewToMask) &&
+ BitsProperlyConcatenate(FromMask, NewFromMask))
+ return V;
+ if (BitsProperlyConcatenate(NewToMask, ToMask) &&
+ BitsProperlyConcatenate(NewFromMask, FromMask))
+ return V;
+
+ // We've seen a write to some bits, so track it.
+ CombinedToMask |= NewToMask;
+ // Keep going...
+ V = V.getOperand(0);
+ }
+
+ return SDValue();
+}
+
static SDValue PerformBFICombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SDValue N1 = N->getOperand(1);
if (N1.getOpcode() == ISD::AND) {
+ // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
+ // the bits being cleared by the AND are not demanded by the BFI.
ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
if (!N11C)
return SDValue();
@@ -8744,6 +9160,38 @@ static SDValue PerformBFICombine(SDNode *N,
return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
N->getOperand(0), N1.getOperand(0),
N->getOperand(2));
+ } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
+ // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes.
+ // Keep track of any consecutive bits set that all come from the same base
+ // value. We can combine these together into a single BFI.
+ SDValue CombineBFI = FindBFIToCombineWith(N);
+ if (CombineBFI == SDValue())
+ return SDValue();
+
+ // We've found a BFI.
+ APInt ToMask1, FromMask1;
+ SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
+
+ APInt ToMask2, FromMask2;
+ SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
+ assert(From1 == From2);
+ (void)From2;
+
+ // First, unlink CombineBFI.
+ DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0));
+ // Then create a new BFI, combining the two together.
+ APInt NewFromMask = FromMask1 | FromMask2;
+ APInt NewToMask = ToMask1 | ToMask2;
+
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+
+ if (NewFromMask[0] == 0)
+ From1 = DCI.DAG.getNode(
+ ISD::SRL, dl, VT, From1,
+ DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT));
+ return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1,
+ DCI.DAG.getConstant(~NewToMask, dl, VT));
}
return SDValue();
}
@@ -9521,32 +9969,6 @@ static SDValue PerformSTORECombine(SDNode *N,
return SDValue();
}
-// isConstVecPow2 - Return true if each vector element is a power of 2, all
-// elements are the same constant, C, and Log2(C) ranges from 1 to 32.
-static bool isConstVecPow2(SDValue ConstVec, bool isSigned, uint64_t &C)
-{
- integerPart cN;
- integerPart c0 = 0;
- for (unsigned I = 0, E = ConstVec.getValueType().getVectorNumElements();
- I != E; I++) {
- ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(ConstVec.getOperand(I));
- if (!C)
- return false;
-
- bool isExact;
- APFloat APF = C->getValueAPF();
- if (APF.convertToInteger(&cN, 64, isSigned, APFloat::rmTowardZero, &isExact)
- != APFloat::opOK || !isExact)
- return false;
-
- c0 = (I == 0) ? cN : c0;
- if (!isPowerOf2_64(cN) || c0 != cN || Log2_64(c0) < 1 || Log2_64(c0) > 32)
- return false;
- }
- C = c0;
- return true;
-}
-
/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
/// can replace combinations of VMUL and VCVT (floating-point to integer)
/// when the VMUL has a constant operand that is a power of 2.
@@ -9556,30 +9978,25 @@ static bool isConstVecPow2(SDValue ConstVec, bool isSigned, uint64_t &C)
/// vcvt.s32.f32 d16, d16
/// becomes:
/// vcvt.s32.f32 d16, d16, #3
-static SDValue PerformVCVTCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
+static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *Subtarget) {
- SelectionDAG &DAG = DCI.DAG;
- SDValue Op = N->getOperand(0);
+ if (!Subtarget->hasNEON())
+ return SDValue();
- if (!Subtarget->hasNEON() || !Op.getValueType().isVector() ||
- Op.getOpcode() != ISD::FMUL)
+ SDValue Op = N->getOperand(0);
+ if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL)
return SDValue();
- uint64_t C;
- SDValue N0 = Op->getOperand(0);
SDValue ConstVec = Op->getOperand(1);
- bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
-
- if (ConstVec.getOpcode() != ISD::BUILD_VECTOR ||
- !isConstVecPow2(ConstVec, isSigned, C))
+ if (!isa<BuildVectorSDNode>(ConstVec))
return SDValue();
MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
+ uint32_t FloatBits = FloatTy.getSizeInBits();
MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
+ uint32_t IntBits = IntTy.getSizeInBits();
unsigned NumLanes = Op.getValueType().getVectorNumElements();
- if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32 ||
- NumLanes > 4) {
+ if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
// These instructions only exist converting from f32 to i32. We can handle
// smaller integers by generating an extra truncate, but larger ones would
// be lossy. We also can't handle more than 4 lanes, since these instructions
@@ -9587,16 +10004,22 @@ static SDValue PerformVCVTCombine(SDNode *N,
return SDValue();
}
+ BitVector UndefElements;
+ BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
+ int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
+ if (C == -1 || C == 0 || C > 32)
+ return SDValue();
+
SDLoc dl(N);
+ bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
Intrinsic::arm_neon_vcvtfp2fxu;
- SDValue FixConv = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
- NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
- DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
- N0,
- DAG.getConstant(Log2_64(C), dl, MVT::i32));
+ SDValue FixConv = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
+ DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
+ DAG.getConstant(C, dl, MVT::i32));
- if (IntTy.getSizeInBits() < FloatTy.getSizeInBits())
+ if (IntBits < FloatBits)
FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
return FixConv;
@@ -9611,38 +10034,44 @@ static SDValue PerformVCVTCombine(SDNode *N,
/// vdiv.f32 d16, d17, d16
/// becomes:
/// vcvt.f32.s32 d16, d16, #3
-static SDValue PerformVDIVCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
+static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *Subtarget) {
- SelectionDAG &DAG = DCI.DAG;
+ if (!Subtarget->hasNEON())
+ return SDValue();
+
SDValue Op = N->getOperand(0);
unsigned OpOpcode = Op.getNode()->getOpcode();
-
- if (!Subtarget->hasNEON() || !N->getValueType(0).isVector() ||
+ if (!N->getValueType(0).isVector() ||
(OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
return SDValue();
- uint64_t C;
SDValue ConstVec = N->getOperand(1);
- bool isSigned = OpOpcode == ISD::SINT_TO_FP;
-
- if (ConstVec.getOpcode() != ISD::BUILD_VECTOR ||
- !isConstVecPow2(ConstVec, isSigned, C))
+ if (!isa<BuildVectorSDNode>(ConstVec))
return SDValue();
MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
+ uint32_t FloatBits = FloatTy.getSizeInBits();
MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
- if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) {
+ uint32_t IntBits = IntTy.getSizeInBits();
+ unsigned NumLanes = Op.getValueType().getVectorNumElements();
+ if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
// These instructions only exist converting from i32 to f32. We can handle
// smaller integers by generating an extra extend, but larger ones would
- // be lossy.
+ // be lossy. We also can't handle more than 4 lanes, since these instructions
+ // only support v2i32/v4i32 types.
return SDValue();
}
+ BitVector UndefElements;
+ BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
+ int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
+ if (C == -1 || C == 0 || C > 32)
+ return SDValue();
+
SDLoc dl(N);
+ bool isSigned = OpOpcode == ISD::SINT_TO_FP;
SDValue ConvInput = Op.getOperand(0);
- unsigned NumLanes = Op.getValueType().getVectorNumElements();
- if (IntTy.getSizeInBits() < FloatTy.getSizeInBits())
+ if (IntBits < FloatBits)
ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
ConvInput);
@@ -9652,7 +10081,7 @@ static SDValue PerformVDIVCombine(SDNode *N,
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
Op.getValueType(),
DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
- ConvInput, DAG.getConstant(Log2_64(C), dl, MVT::i32));
+ ConvInput, DAG.getConstant(C, dl, MVT::i32));
}
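The scalar identity both fixed-point combines rely on, as a hedged standalone sketch (assumes 1 <= C <= 32 so the shift below stays in range):

#include <cstdint>

// Illustrative only: converting int -> float and dividing by 2^C (or
// multiplying by 2^C before a float -> int conversion) is a fixed-point
// conversion with C fractional bits, which is what vcvt.f32.s32 ..., #C and
// vcvt.s32.f32 ..., #C implement per lane.
static float intToFixedPointFloat(int32_t X, unsigned C) {
  // Equivalent of vcvt.f32.s32 with #C fractional bits.
  return static_cast<float>(X) / static_cast<float>(1ull << C);
}

static int32_t floatToFixedPointInt(float X, unsigned C) {
  // Equivalent of vcvt.s32.f32 with #C fractional bits (rounds toward zero).
  return static_cast<int32_t>(X * static_cast<float>(1ull << C));
}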
/// Getvshiftimm - Check if this is a valid build_vector for the immediate
@@ -9680,7 +10109,7 @@ static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
/// 0 <= Value <= ElementBits for a long left shift.
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
assert(VT.isVector() && "vector shift count is not a vector type");
- unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+ int64_t ElementBits = VT.getVectorElementType().getSizeInBits();
if (! getVShiftImm(Op, ElementBits, Cnt))
return false;
return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits);
@@ -9695,12 +10124,16 @@ static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
int64_t &Cnt) {
assert(VT.isVector() && "vector shift count is not a vector type");
- unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+ int64_t ElementBits = VT.getVectorElementType().getSizeInBits();
if (! getVShiftImm(Op, ElementBits, Cnt))
return false;
- if (isIntrinsic)
+ if (!isIntrinsic)
+ return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits));
+ if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) {
Cnt = -Cnt;
- return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits));
+ return true;
+ }
+ return false;
}
/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
@@ -9939,89 +10372,123 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// PerformSELECT_CCCombine - Target-specific DAG combining for ISD::SELECT_CC
-/// to match f32 max/min patterns to use NEON vmax/vmin instructions.
-static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG,
- const ARMSubtarget *ST) {
- // If the target supports NEON, try to use vmax/vmin instructions for f32
- // selects like "x < y ? x : y". Unless the NoNaNsFPMath option is set,
- // be careful about NaNs: NEON's vmax/vmin return NaN if either operand is
- // a NaN; only do the transformation when it matches that behavior.
-
- // For now only do this when using NEON for FP operations; if using VFP, it
- // is not obvious that the benefit outweighs the cost of switching to the
- // NEON pipeline.
- if (!ST->hasNEON() || !ST->useNEONForSinglePrecisionFP() ||
- N->getValueType(0) != MVT::f32)
+static void computeKnownBits(SelectionDAG &DAG, SDValue Op, APInt &KnownZero,
+ APInt &KnownOne) {
+ if (Op.getOpcode() == ARMISD::BFI) {
+ // Conservatively, we can recurse down the first operand
+ // and just mask out all affected bits.
+ computeKnownBits(DAG, Op.getOperand(0), KnownZero, KnownOne);
+
+ // The operand to BFI is already a mask suitable for removing the bits it
+ // sets.
+ ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
+ APInt Mask = CI->getAPIntValue();
+ KnownZero &= Mask;
+ KnownOne &= Mask;
+ return;
+ }
+ if (Op.getOpcode() == ARMISD::CMOV) {
+ APInt KZ2(KnownZero.getBitWidth(), 0);
+ APInt KO2(KnownOne.getBitWidth(), 0);
+ computeKnownBits(DAG, Op.getOperand(1), KnownZero, KnownOne);
+ computeKnownBits(DAG, Op.getOperand(2), KZ2, KO2);
+
+ KnownZero &= KZ2;
+ KnownOne &= KO2;
+ return;
+ }
+ return DAG.computeKnownBits(Op, KnownZero, KnownOne);
+}
+
+SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const {
+ // If we have a CMOV, OR and AND combination such as:
+ // if (x & CN)
+ // y |= CM;
+ //
+ // And:
+ // * CN is a single bit;
+ // * All bits covered by CM are known zero in y
+ //
+ // Then we can convert this into a sequence of BFI instructions. This will
+ // always be a win if CM is a single bit, will always be no worse than the
+ // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
+ // three bits (due to the extra IT instruction).
+
+ SDValue Op0 = CMOV->getOperand(0);
+ SDValue Op1 = CMOV->getOperand(1);
+ auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2));
+ auto CC = CCNode->getAPIntValue().getLimitedValue();
+ SDValue CmpZ = CMOV->getOperand(4);
+
+ // The compare must be against zero.
+ if (!isNullConstant(CmpZ->getOperand(1)))
+ return SDValue();
+
+ assert(CmpZ->getOpcode() == ARMISD::CMPZ);
+ SDValue And = CmpZ->getOperand(0);
+ if (And->getOpcode() != ISD::AND)
+ return SDValue();
+ ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(And->getOperand(1));
+ if (!AndC || !AndC->getAPIntValue().isPowerOf2())
return SDValue();
+ SDValue X = And->getOperand(0);
- SDValue CondLHS = N->getOperand(0);
- SDValue CondRHS = N->getOperand(1);
- SDValue LHS = N->getOperand(2);
- SDValue RHS = N->getOperand(3);
- ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
-
- unsigned Opcode = 0;
- bool IsReversed;
- if (DAG.isEqualTo(LHS, CondLHS) && DAG.isEqualTo(RHS, CondRHS)) {
- IsReversed = false; // x CC y ? x : y
- } else if (DAG.isEqualTo(LHS, CondRHS) && DAG.isEqualTo(RHS, CondLHS)) {
- IsReversed = true ; // x CC y ? y : x
+ if (CC == ARMCC::EQ) {
+ // We're performing an "equal to zero" compare. Swap the operands so we
+ // canonicalize on a "not equal to zero" compare.
+ std::swap(Op0, Op1);
} else {
- return SDValue();
+ assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
}
+
+ if (Op1->getOpcode() != ISD::OR)
+ return SDValue();
- bool IsUnordered;
- switch (CC) {
- default: break;
- case ISD::SETOLT:
- case ISD::SETOLE:
- case ISD::SETLT:
- case ISD::SETLE:
- case ISD::SETULT:
- case ISD::SETULE:
- // If LHS is NaN, an ordered comparison will be false and the result will
- // be the RHS, but vmin(NaN, RHS) = NaN. Avoid this by checking that LHS
- // != NaN. Likewise, for unordered comparisons, check for RHS != NaN.
- IsUnordered = (CC == ISD::SETULT || CC == ISD::SETULE);
- if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS))
- break;
- // For less-than-or-equal comparisons, "+0 <= -0" will be true but vmin
- // will return -0, so vmin can only be used for unsafe math or if one of
- // the operands is known to be nonzero.
- if ((CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE) &&
- !DAG.getTarget().Options.UnsafeFPMath &&
- !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
- break;
- Opcode = IsReversed ? ARMISD::FMAX : ARMISD::FMIN;
- break;
+ ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
+ if (!OrC)
+ return SDValue();
+ SDValue Y = Op1->getOperand(0);
- case ISD::SETOGT:
- case ISD::SETOGE:
- case ISD::SETGT:
- case ISD::SETGE:
- case ISD::SETUGT:
- case ISD::SETUGE:
- // If LHS is NaN, an ordered comparison will be false and the result will
- // be the RHS, but vmax(NaN, RHS) = NaN. Avoid this by checking that LHS
- // != NaN. Likewise, for unordered comparisons, check for RHS != NaN.
- IsUnordered = (CC == ISD::SETUGT || CC == ISD::SETUGE);
- if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS))
- break;
- // For greater-than-or-equal comparisons, "-0 >= +0" will be true but vmax
- // will return +0, so vmax can only be used for unsafe math or if one of
- // the operands is known to be nonzero.
- if ((CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE) &&
- !DAG.getTarget().Options.UnsafeFPMath &&
- !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
- break;
- Opcode = IsReversed ? ARMISD::FMIN : ARMISD::FMAX;
- break;
- }
+ if (Op0 != Y)
+ return SDValue();
+
+ // Now, is it profitable to continue?
+ APInt OrCI = OrC->getAPIntValue();
+ unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
+ if (OrCI.countPopulation() > Heuristic)
+ return SDValue();
- if (!Opcode)
+ // Lastly, can we determine that the bits defined by OrCI
+ // are zero in Y?
+ APInt KnownZero, KnownOne;
+ computeKnownBits(DAG, Y, KnownZero, KnownOne);
+ if ((OrCI & KnownZero) != OrCI)
return SDValue();
- return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), LHS, RHS);
+
+ // OK, we can do the combine.
+ SDValue V = Y;
+ SDLoc dl(X);
+ EVT VT = X.getValueType();
+ unsigned BitInX = AndC->getAPIntValue().logBase2();
+
+ if (BitInX != 0) {
+ // We must shift X first.
+ X = DAG.getNode(ISD::SRL, dl, VT, X,
+ DAG.getConstant(BitInX, dl, VT));
+ }
+
+ for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
+ BitInY < NumActiveBits; ++BitInY) {
+ if (OrCI[BitInY] == 0)
+ continue;
+ APInt Mask(VT.getSizeInBits(), 0);
+ Mask.setBit(BitInY);
+ V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
+ // Confusingly, the operand is an *inverted* mask.
+ DAG.getConstant(~Mask, dl, VT));
+ }
+
+ return V;
}
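A worked sketch of the pattern the combine rewrites (plain C++; the two functions agree only under the precondition stated above, namely that every bit of CM is already known zero in y, and that CN is a single bit):

#include <bit>
#include <cstdint>

// The source-level shape being matched.
static uint32_t cmovOrPattern(uint32_t X, uint32_t Y, uint32_t CN, uint32_t CM) {
  if (X & CN)
    Y |= CM;
  return Y;
}

// Illustrative only: the bit-copy the chain of BFIs expresses - the tested bit
// of x is replicated into every set bit position of CM.
static uint32_t bfiEquivalent(uint32_t X, uint32_t Y, uint32_t CN, uint32_t CM) {
  uint32_t Bit = (X >> std::countr_zero(CN)) & 1u; // the tested bit of x
  for (unsigned BitInY = 0; BitInY < 32; ++BitInY)
    if (CM & (1u << BitInY))
      Y = (Y & ~(1u << BitInY)) | (Bit << BitInY);  // one BFI per set bit of CM
  return Y;
}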
/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
@@ -10042,6 +10509,13 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
ARMCC::CondCodes CC =
(ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
+ // BFI is only available on V6T2+.
+ if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
+ SDValue R = PerformCMOVToBFICombine(N, DAG);
+ if (R)
+ return R;
+ }
+
// Simplify
// mov r1, r0
// cmp r1, x
@@ -10108,8 +10582,10 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
case ISD::FP_TO_SINT:
- case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI, Subtarget);
- case ISD::FDIV: return PerformVDIVCombine(N, DCI, Subtarget);
+ case ISD::FP_TO_UINT:
+ return PerformVCVTCombine(N, DCI.DAG, Subtarget);
+ case ISD::FDIV:
+ return PerformVDIVCombine(N, DCI.DAG, Subtarget);
case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
case ISD::SHL:
case ISD::SRA:
@@ -10117,7 +10593,6 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
- case ISD::SELECT_CC: return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget);
case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
case ISD::LOAD: return PerformLOADCombine(N, DCI);
case ARMISD::VLD2DUP:
@@ -11043,37 +11518,61 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
-SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
- assert(Subtarget->isTargetAEABI() && "Register-based DivRem lowering only");
- unsigned Opcode = Op->getOpcode();
- assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
- "Invalid opcode for Div/Rem lowering");
- bool isSigned = (Opcode == ISD::SDIVREM);
- EVT VT = Op->getValueType(0);
- Type *Ty = VT.getTypeForEVT(*DAG.getContext());
-
+static RTLIB::Libcall getDivRemLibcall(
+ const SDNode *N, MVT::SimpleValueType SVT) {
+ assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
+ N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
+ "Unhandled Opcode in getDivRemLibcall");
+ bool isSigned = N->getOpcode() == ISD::SDIVREM ||
+ N->getOpcode() == ISD::SREM;
RTLIB::Libcall LC;
- switch (VT.getSimpleVT().SimpleTy) {
+ switch (SVT) {
default: llvm_unreachable("Unexpected request for libcall!");
case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
}
+ return LC;
+}
- SDValue InChain = DAG.getEntryNode();
-
+static TargetLowering::ArgListTy getDivRemArgList(
+ const SDNode *N, LLVMContext *Context) {
+ assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
+ N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
+ "Unhandled Opcode in getDivRemArgList");
+ bool isSigned = N->getOpcode() == ISD::SDIVREM ||
+ N->getOpcode() == ISD::SREM;
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
- for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
- EVT ArgVT = Op->getOperand(i).getValueType();
- Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
- Entry.Node = Op->getOperand(i);
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ EVT ArgVT = N->getOperand(i).getValueType();
+ Type *ArgTy = ArgVT.getTypeForEVT(*Context);
+ Entry.Node = N->getOperand(i);
Entry.Ty = ArgTy;
Entry.isSExt = isSigned;
Entry.isZExt = !isSigned;
Args.push_back(Entry);
}
+ return Args;
+}
+
+SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
+ assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid()) &&
+ "Register-based DivRem lowering only");
+ unsigned Opcode = Op->getOpcode();
+ assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
+ "Invalid opcode for Div/Rem lowering");
+ bool isSigned = (Opcode == ISD::SDIVREM);
+ EVT VT = Op->getValueType(0);
+ Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+
+ RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
+ VT.getSimpleVT().SimpleTy);
+ SDValue InChain = DAG.getEntryNode();
+
+ TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
+ DAG.getContext());
SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
getPointerTy(DAG.getDataLayout()));
@@ -11090,6 +11589,47 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
return CallInfo.first;
}
+// Lowers REM using divmod helpers
+// see RTABI section 4.2/4.3
+SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
+ // Build return types (div and rem)
+ std::vector<Type*> RetTyParams;
+ Type *RetTyElement;
+
+ switch (N->getValueType(0).getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unexpected request for libcall!");
+ case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break;
+ case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
+ case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
+ case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
+ }
+
+ RetTyParams.push_back(RetTyElement);
+ RetTyParams.push_back(RetTyElement);
+ ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
+ Type *RetTy = StructType::get(*DAG.getContext(), ret);
+
+ RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
+ SimpleTy);
+ SDValue InChain = DAG.getEntryNode();
+ TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext());
+ bool isSigned = N->getOpcode() == ISD::SREM;
+ SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+ getPointerTy(DAG.getDataLayout()));
+
+ // Lower call
+ CallLoweringInfo CLI(DAG);
+ CLI.setChain(InChain)
+ .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args), 0)
+ .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
+ std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+
+ // Return second (rem) result operand (first contains div)
+ SDNode *ResNode = CallResult.first.getNode();
+ assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
+ return ResNode->getOperand(1);
+}
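The helper convention LowerREM relies on (the RTABI divmod routines produce the quotient first and the remainder second) can be pictured with a small C++ model; the struct and function names below are assumptions for illustration, not the real __aeabi_* declarations or their register-return ABI.

// Illustrative model only: the divmod helper conceptually yields a
// (quotient, remainder) pair; LowerREM discards the first member and keeps
// the second, exactly as the operand-1 extraction above does.
struct DivRemPair { int Quot; int Rem; };

static DivRemPair divmod(int Numerator, int Denominator) {
  return { Numerator / Denominator,    // result 0: div (ignored by LowerREM)
           Numerator % Denominator };  // result 1: rem (the value returned)
}

static int remViaDivmod(int N, int D) { return divmod(N, D).Rem; }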
+
SDValue
ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
assert(Subtarget->isTargetWindows() && "unsupported target platform");
@@ -11124,8 +11664,8 @@ SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
SDValue SrcVal = Op.getOperand(0);
- return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
- /*isSigned*/ false, SDLoc(Op)).first;
+ return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
+ SDLoc(Op)).first;
}
SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
@@ -11137,8 +11677,8 @@ SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
SDValue SrcVal = Op.getOperand(0);
- return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
- /*isSigned*/ false, SDLoc(Op)).first;
+ return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
+ SDLoc(Op)).first;
}
bool
@@ -11186,7 +11726,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.opc = ISD::INTRINSIC_W_CHAIN;
// Conservatively set memVT to the entire set of vectors loaded.
auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
- uint64_t NumElts = DL.getTypeAllocSize(I.getType()) / 8;
+ uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
@@ -11212,7 +11752,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Type *ArgTy = I.getArgOperand(ArgI)->getType();
if (!ArgTy->isVectorTy())
break;
- NumElts += DL.getTypeAllocSize(ArgTy) / 8;
+ NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
}
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(0);
@@ -11295,8 +11835,6 @@ bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
return true;
}
-bool ARMTargetLowering::hasLoadLinkedStoreConditional() const { return true; }
-
Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
ARM_MB::MemBOpt Domain) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
@@ -11392,19 +11930,26 @@ bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
// guarantee, see DDI0406C ARM architecture reference manual,
// sections A8.8.72-74 LDRD)
-bool ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+TargetLowering::AtomicExpansionKind
+ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
unsigned Size = LI->getType()->getPrimitiveSizeInBits();
- return (Size == 64) && !Subtarget->isMClass();
+ return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
+ : AtomicExpansionKind::None;
}
// For the real atomic operations, we have ldrex/strex up to 32 bits,
// and up to 64 bits on the non-M profiles
-TargetLoweringBase::AtomicRMWExpansionKind
+TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
unsigned Size = AI->getType()->getPrimitiveSizeInBits();
return (Size <= (Subtarget->isMClass() ? 32U : 64U))
- ? AtomicRMWExpansionKind::LLSC
- : AtomicRMWExpansionKind::None;
+ ? AtomicExpansionKind::LLSC
+ : AtomicExpansionKind::None;
+}
+
+bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(
+ AtomicCmpXchgInst *AI) const {
+ return true;
}
// This has so far only been implemented for MachO.
@@ -11419,7 +11964,7 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
return false;
// Floating point values and vector values map to the same register file.
- // Therefore, althought we could do a store extract of a vector type, this is
+ // Therefore, although we could do a store extract of a vector type, this is
// better to leave at float as we have more freedom in the addressing mode for
// those.
if (VectorTy->isFPOrFPVectorTy())
@@ -11441,6 +11986,14 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
return false;
}
+bool ARMTargetLowering::isCheapToSpeculateCttz() const {
+ return Subtarget->hasV6T2Ops();
+}
+
+bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
+ return Subtarget->hasV6T2Ops();
+}
+
Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
@@ -11477,6 +12030,14 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
cast<PointerType>(Addr->getType())->getElementType());
}
+void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
+ IRBuilder<> &Builder) const {
+ if (!Subtarget->hasV7Ops())
+ return;
+ Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+ Builder.CreateCall(llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
+}
+
Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
Value *Addr,
AtomicOrdering Ord) const {
@@ -11534,12 +12095,12 @@ bool ARMTargetLowering::lowerInterleavedLoad(
Type *EltTy = VecTy->getVectorElementType();
const DataLayout &DL = LI->getModule()->getDataLayout();
- unsigned VecSize = DL.getTypeAllocSizeInBits(VecTy);
- bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64;
+ unsigned VecSize = DL.getTypeSizeInBits(VecTy);
+ bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
- // Skip illegal vector types and vector types of i64/f64 element (vldN doesn't
- // support i64/f64 element).
- if ((VecSize != 64 && VecSize != 128) || EltIs64Bits)
+  // Skip if we do not have NEON, if the vector type is illegal, or if its
+  // elements are i64/f64 (vldN doesn't support i64/f64 elements).
+ if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128) || EltIs64Bits)
return false;
// A pointer vector can not be the return type of the ldN intrinsics. Need to
@@ -11552,9 +12113,6 @@ bool ARMTargetLowering::lowerInterleavedLoad(
Intrinsic::arm_neon_vld3,
Intrinsic::arm_neon_vld4};
- Function *VldnFunc =
- Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], VecTy);
-
IRBuilder<> Builder(LI);
SmallVector<Value *, 2> Ops;
@@ -11562,6 +12120,9 @@ bool ARMTargetLowering::lowerInterleavedLoad(
Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr));
Ops.push_back(Builder.getInt32(LI->getAlignment()));
+ Type *Tys[] = { VecTy, Int8Ptr };
+ Function *VldnFunc =
+ Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
// Replace uses of each shufflevector with the corresponding vector loaded
@@ -11624,12 +12185,13 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
const DataLayout &DL = SI->getModule()->getDataLayout();
- unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy);
- bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64;
+ unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
+ bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
- // Skip illegal sub vector types and vector types of i64/f64 element (vstN
- // doesn't support i64/f64 element).
- if ((SubVecSize != 64 && SubVecSize != 128) || EltIs64Bits)
+  // Skip if we do not have NEON, if the sub vector type is illegal, or if its
+  // elements are i64/f64 (vstN doesn't support i64/f64 elements).
+ if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128) ||
+ EltIs64Bits)
return false;
Value *Op0 = SVI->getOperand(0);
@@ -11650,17 +12212,18 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
SubVecTy = VectorType::get(IntTy, NumSubElts);
}
- static Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
- Intrinsic::arm_neon_vst3,
- Intrinsic::arm_neon_vst4};
- Function *VstNFunc = Intrinsic::getDeclaration(
- SI->getModule(), StoreInts[Factor - 2], SubVecTy);
-
+ static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
+ Intrinsic::arm_neon_vst3,
+ Intrinsic::arm_neon_vst4};
SmallVector<Value *, 6> Ops;
Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr));
+ Type *Tys[] = { Int8Ptr, SubVecTy };
+ Function *VstNFunc = Intrinsic::getDeclaration(
+ SI->getModule(), StoreInts[Factor - 2], Tys);
+
// Split the shufflevector operands into sub vectors for the new vstN call.
for (unsigned i = 0; i < Factor; i++)
Ops.push_back(Builder.CreateShuffleVector(
@@ -11681,14 +12244,14 @@ enum HABaseType {
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
uint64_t &Members) {
- if (const StructType *ST = dyn_cast<StructType>(Ty)) {
+ if (auto *ST = dyn_cast<StructType>(Ty)) {
for (unsigned i = 0; i < ST->getNumElements(); ++i) {
uint64_t SubMembers = 0;
if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
return false;
Members += SubMembers;
}
- } else if (const ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
+ } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
uint64_t SubMembers = 0;
if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
return false;
@@ -11703,7 +12266,7 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
return false;
Members = 1;
Base = HA_DOUBLE;
- } else if (const VectorType *VT = dyn_cast<VectorType>(Ty)) {
+ } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
Members = 1;
switch (Base) {
case HA_FLOAT:
@@ -11747,3 +12310,17 @@ bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
return IsHA || IsIntArray;
}
+
+unsigned ARMTargetLowering::getExceptionPointerRegister(
+ const Constant *PersonalityFn) const {
+ // Platforms which do not use SjLj EH may return values in these registers
+ // via the personality function.
+ return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0;
+}
+
+unsigned ARMTargetLowering::getExceptionSelectorRegister(
+ const Constant *PersonalityFn) const {
+ // Platforms which do not use SjLj EH may return values in these registers
+ // via the personality function.
+ return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1;
+}
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index efc9020c193a..b764624f1492 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -63,8 +63,6 @@ namespace llvm {
BCC_i64,
- RBIT, // ARM bitreverse instruction
-
SRL_FLAG, // V,Flag = srl_flag X -> srl X, 1 + save carry out.
SRA_FLAG, // V,Flag = sra_flag X -> sra X, 1 + save carry out.
RRX, // V = RRX X, Flag -> srl X, 1 + shift in carry flag.
@@ -79,6 +77,7 @@ namespace llvm {
EH_SJLJ_SETJMP, // SjLj exception handling setjmp.
EH_SJLJ_LONGJMP, // SjLj exception handling longjmp.
+ EH_SJLJ_SETUP_DISPATCH, // SjLj exception handling setup_dispatch.
TC_RETURN, // Tail call return pseudo.
@@ -91,6 +90,7 @@ namespace llvm {
PRELOAD, // Preload
WIN__CHKSTK, // Windows' __chkstk call to do stack probing.
+ WIN__DBZCHK, // Windows' divide by zero check
VCEQ, // Vector compare equal.
VCEQZ, // Vector compare equal to zero.
@@ -172,12 +172,6 @@ namespace llvm {
// BUILD_VECTOR for this purpose.
BUILD_VECTOR,
- // Floating-point max and min:
- FMAX,
- FMIN,
- VMAXNM,
- VMINNM,
-
// Bit-field insert
BFI,
@@ -189,6 +183,10 @@ namespace llvm {
// Vector bitwise select
VBSL,
+ // Pseudo-instruction representing a memory copy using ldm/stm
+ // instructions.
+ MEMCPY,
+
// Vector load N-element structure to all lanes:
VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
VLD3DUP,
@@ -260,6 +258,7 @@ namespace llvm {
SDNode *Node) const override;
SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const;
+ SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override;
@@ -348,6 +347,8 @@ namespace llvm {
getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
if (ConstraintCode == "Q")
return InlineAsm::Constraint_Q;
+ else if (ConstraintCode == "o")
+ return InlineAsm::Constraint_o;
else if (ConstraintCode.size() == 2) {
if (ConstraintCode[0] == 'U') {
switch(ConstraintCode[1]) {
@@ -420,13 +421,24 @@ namespace llvm {
bool functionArgumentNeedsConsecutiveRegisters(
Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override;
- bool hasLoadLinkedStoreConditional() const override;
+ /// If a physical register, this returns the register that receives the
+ /// exception address on entry to an EH pad.
+ unsigned
+ getExceptionPointerRegister(const Constant *PersonalityFn) const override;
+
+ /// If a physical register, this returns the register that receives the
+ /// exception typeid on entry to a landing pad.
+ unsigned
+ getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
+
Instruction *makeDMB(IRBuilder<> &Builder, ARM_MB::MemBOpt Domain) const;
Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const override;
Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
Value *Addr, AtomicOrdering Ord) const override;
+ void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override;
+
Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
bool IsStore, bool IsLoad) const override;
Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
@@ -441,16 +453,21 @@ namespace llvm {
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
+ TargetLoweringBase::AtomicExpansionKind
+ shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
- TargetLoweringBase::AtomicRMWExpansionKind
+ TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+ bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
bool useLoadStackGuardNode() const override;
bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
unsigned &Cost) const override;
+ bool isCheapToSpeculateCttz() const override;
+ bool isCheapToSpeculateCtlz() const override;
+
protected:
std::pair<const TargetRegisterClass *, uint8_t>
findRepresentativeClass(const TargetRegisterInfo *TRI,
@@ -496,6 +513,7 @@ namespace llvm {
ISD::ArgFlagsTy Flags) const;
SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *Subtarget) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
@@ -508,7 +526,6 @@ namespace llvm {
SDValue LowerToTLSExecModels(GlobalAddressSDNode *GA,
SelectionDAG &DAG,
TLSModel::Model model) const;
- SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
@@ -526,6 +543,12 @@ namespace llvm {
const ARMSubtarget *ST) const;
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const;
+ void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed,
+ SmallVectorImpl<SDValue> &Results) const;
+ SDValue LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed,
+ SDValue &Chain) const;
+ SDValue LowerREM(SDNode *N, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
@@ -635,6 +658,8 @@ namespace llvm {
MachineBasicBlock *EmitLowered__chkstk(MachineInstr *MI,
MachineBasicBlock *MBB) const;
+ MachineBasicBlock *EmitLowered__dbzchk(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
};
enum NEONModImmType {
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index 84f95be30991..cf973d68085f 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -51,7 +51,8 @@ void ARMInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const {
switch (Opc) {
- default: break;
+ default:
+ break;
case ARM::LDR_PRE_IMM:
case ARM::LDR_PRE_REG:
case ARM::LDR_POST_IMM:
@@ -124,82 +125,10 @@ void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI,
.addGlobalAddress(GV, 0, ARMII::MO_NONLAZY);
unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
- MachinePointerInfo::getGOT(), Flag, 4, 4);
+ MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 4, 4);
MIB.addMemOperand(MMO);
MIB = BuildMI(MBB, MI, DL, get(ARM::LDRi12), Reg);
MIB.addReg(Reg, RegState::Kill).addImm(0);
MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
AddDefaultPred(MIB);
}
-
-namespace {
- /// ARMCGBR - Create Global Base Reg pass. This initializes the PIC
- /// global base register for ARM ELF.
- struct ARMCGBR : public MachineFunctionPass {
- static char ID;
- ARMCGBR() : MachineFunctionPass(ID) {}
-
- bool runOnMachineFunction(MachineFunction &MF) override {
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- if (AFI->getGlobalBaseReg() == 0)
- return false;
- const ARMSubtarget &STI =
- static_cast<const ARMSubtarget &>(MF.getSubtarget());
- // Don't do this for Thumb1.
- if (STI.isThumb1Only())
- return false;
-
- const TargetMachine &TM = MF.getTarget();
- if (TM.getRelocationModel() != Reloc::PIC_)
- return false;
-
- LLVMContext *Context = &MF.getFunction()->getContext();
- unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
- unsigned PCAdj = STI.isThumb() ? 4 : 8;
- ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
- *Context, "_GLOBAL_OFFSET_TABLE_", ARMPCLabelIndex, PCAdj);
-
- unsigned Align = TM.getDataLayout()->getPrefTypeAlignment(
- Type::getInt32PtrTy(*Context));
- unsigned Idx = MF.getConstantPool()->getConstantPoolIndex(CPV, Align);
-
- MachineBasicBlock &FirstMBB = MF.front();
- MachineBasicBlock::iterator MBBI = FirstMBB.begin();
- DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
- unsigned TempReg =
- MF.getRegInfo().createVirtualRegister(&ARM::rGPRRegClass);
- unsigned Opc = STI.isThumb2() ? ARM::t2LDRpci : ARM::LDRcp;
- const TargetInstrInfo &TII = *STI.getInstrInfo();
- MachineInstrBuilder MIB = BuildMI(FirstMBB, MBBI, DL,
- TII.get(Opc), TempReg)
- .addConstantPoolIndex(Idx);
- if (Opc == ARM::LDRcp)
- MIB.addImm(0);
- AddDefaultPred(MIB);
-
- // Fix the GOT address by adding pc.
- unsigned GlobalBaseReg = AFI->getGlobalBaseReg();
- Opc = STI.isThumb2() ? ARM::tPICADD : ARM::PICADD;
- MIB = BuildMI(FirstMBB, MBBI, DL, TII.get(Opc), GlobalBaseReg)
- .addReg(TempReg)
- .addImm(ARMPCLabelIndex);
- if (Opc == ARM::PICADD)
- AddDefaultPred(MIB);
-
- return true;
- }
-
- const char *getPassName() const override {
- return "ARM PIC Global Base Reg Initialization";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
- };
-}
-
-char ARMCGBR::ID = 0;
-FunctionPass*
-llvm::createARMGlobalBaseRegPass() { return new ARMCGBR(); }
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 61c45af26fe1..b9de83bfe6dc 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -59,6 +59,7 @@ def SDT_ARMThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;
def SDT_ARMEH_SJLJ_Setjmp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisPtrTy<1>,
SDTCisInt<2>]>;
def SDT_ARMEH_SJLJ_Longjmp: SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisInt<1>]>;
+def SDT_ARMEH_SJLJ_SetupDispatch: SDTypeProfile<0, 0, []>;
def SDT_ARMMEMBARRIER : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
@@ -70,8 +71,11 @@ def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
-def SDT_ARMVMAXNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>;
-def SDT_ARMVMINNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>;
+def SDT_WIN__DBZCHK : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+
+def SDT_ARMMEMCPY : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>,
+ SDTCisVT<4, i32>]>;
def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
[SDTCisSameAs<0, 2>,
@@ -163,21 +167,23 @@ def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP",
def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP",
SDT_ARMEH_SJLJ_Longjmp,
[SDNPHasChain, SDNPSideEffect]>;
+def ARMeh_sjlj_setup_dispatch: SDNode<"ARMISD::EH_SJLJ_SETUP_DISPATCH",
+ SDT_ARMEH_SJLJ_SetupDispatch,
+ [SDNPHasChain, SDNPSideEffect]>;
def ARMMemBarrierMCR : SDNode<"ARMISD::MEMBARRIER_MCR", SDT_ARMMEMBARRIER,
[SDNPHasChain, SDNPSideEffect]>;
def ARMPreload : SDNode<"ARMISD::PRELOAD", SDT_ARMPREFETCH,
[SDNPHasChain, SDNPMayLoad, SDNPMayStore]>;
-def ARMrbit : SDNode<"ARMISD::RBIT", SDTIntUnaryOp>;
-
def ARMtcret : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
def ARMbfi : SDNode<"ARMISD::BFI", SDT_ARMBFI>;
-def ARMvmaxnm : SDNode<"ARMISD::VMAXNM", SDT_ARMVMAXNM, []>;
-def ARMvminnm : SDNode<"ARMISD::VMINNM", SDT_ARMVMINNM, []>;
+def ARMmemcopy : SDNode<"ARMISD::MEMCPY", SDT_ARMMEMCPY,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+ SDNPMayStore, SDNPMayLoad]>;
//===----------------------------------------------------------------------===//
// ARM Instruction Predicate Definitions.
@@ -209,6 +215,8 @@ def PreV8 : Predicate<"!Subtarget->hasV8Ops()">,
AssemblerPredicate<"!HasV8Ops", "armv7 or earlier">;
def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">,
AssemblerPredicate<"HasV8_1aOps", "armv8.1a">;
+def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">,
+ AssemblerPredicate<"HasV8_2aOps", "armv8.2a">;
def NoVFP : Predicate<"!Subtarget->hasVFP2()">;
def HasVFP2 : Predicate<"Subtarget->hasVFP2()">,
AssemblerPredicate<"FeatureVFP2", "VFP2">;
@@ -228,7 +236,9 @@ def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
def HasCRC : Predicate<"Subtarget->hasCRC()">,
AssemblerPredicate<"FeatureCRC", "crc">;
def HasFP16 : Predicate<"Subtarget->hasFP16()">,
- AssemblerPredicate<"FeatureFP16","half-float">;
+ AssemblerPredicate<"FeatureFP16","half-float conversions">;
+def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
+ AssemblerPredicate<"FeatureFullFP16","full half-float">;
def HasDivide : Predicate<"Subtarget->hasDivide()">,
AssemblerPredicate<"FeatureHWDiv", "divide in THUMB">;
def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">,
@@ -236,9 +246,8 @@ def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">,
def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">,
AssemblerPredicate<"FeatureT2XtPk",
"pack/extract">;
-def HasThumb2DSP : Predicate<"Subtarget->hasThumb2DSP()">,
- AssemblerPredicate<"FeatureDSPThumb2",
- "thumb2-dsp">;
+def HasDSP : Predicate<"Subtarget->hasDSP()">,
+ AssemblerPredicate<"FeatureDSP", "dsp">;
def HasDB : Predicate<"Subtarget->hasDataBarrier()">,
AssemblerPredicate<"FeatureDB",
"data-barriers">;
@@ -2322,6 +2331,7 @@ def SMC : ABI<0b0001, (outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt",
let Inst{23-4} = 0b01100000000000000111;
let Inst{3-0} = opt;
}
+def : MnemonicAlias<"smi", "smc">;
// Supervisor Call (Software Interrupt)
let isCall = 1, Uses = [SP] in {
@@ -3671,10 +3681,10 @@ def USAT16 : AI<(outs GPRnopc:$Rd),
let Inst{3-0} = Rn;
}
-def : ARMV6Pat<(int_arm_ssat GPRnopc:$a, imm:$pos),
- (SSAT imm:$pos, GPRnopc:$a, 0)>;
-def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm:$pos),
- (USAT imm:$pos, GPRnopc:$a, 0)>;
+def : ARMV6Pat<(int_arm_ssat GPRnopc:$a, imm1_32:$pos),
+ (SSAT imm1_32:$pos, GPRnopc:$a, 0)>;
+def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm0_31:$pos),
+ (USAT imm0_31:$pos, GPRnopc:$a, 0)>;
//===----------------------------------------------------------------------===//
// Bitwise Instructions.
@@ -4186,7 +4196,7 @@ def CLZ : AMiscA1I<0b00010110, 0b0001, (outs GPR:$Rd), (ins GPR:$Rm),
def RBIT : AMiscA1I<0b01101111, 0b0011, (outs GPR:$Rd), (ins GPR:$Rm),
IIC_iUNAr, "rbit", "\t$Rd, $Rm",
- [(set GPR:$Rd, (ARMrbit GPR:$Rm))]>,
+ [(set GPR:$Rd, (bitreverse GPR:$Rm))]>,
Requires<[IsARM, HasV6T2]>,
Sched<[WriteALU]>;
@@ -4578,6 +4588,19 @@ let usesCustomInserter = 1 in {
[(ARMcopystructbyval GPR:$dst, GPR:$src, imm:$size, imm:$alignment)]>;
}
+let hasPostISelHook = 1, Constraints = "$newdst = $dst, $newsrc = $src" in {
+ // %newsrc, %newdst = MEMCPY %dst, %src, N, ...N scratch regs...
+ // Copies N registers worth of memory from address %src to address %dst
+  // and returns the incremented addresses. N scratch registers will
+ // be attached for the copy to use.
+ def MEMCPY : PseudoInst<
+ (outs GPR:$newdst, GPR:$newsrc),
+ (ins GPR:$dst, GPR:$src, i32imm:$nreg, variable_ops),
+ NoItinerary,
+ [(set GPR:$newdst, GPR:$newsrc,
+ (ARMmemcopy GPR:$dst, GPR:$src, imm:$nreg))]>;
+}
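As a rough illustration of the semantics described in the comment above (names assumed, not part of the patch): the pseudo copies N register-sized words and yields the post-incremented source and destination addresses, which is what the ldm/stm expansion produces.

#include <cstdint>

// Copies N 32-bit words from *Src to *Dst and advances both pointers,
// mirroring the "returns the incremented addresses" behaviour of MEMCPY.
static void copyWords(uint32_t *&Dst, uint32_t *&Src, unsigned N) {
  for (unsigned I = 0; I != N; ++I)
    *Dst++ = *Src++;
}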
+
def ldrex_1 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
}]>;
@@ -4705,7 +4728,7 @@ def STLEXD : AIstlex<0b01, (outs GPR:$Rd),
def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex",
[(int_arm_clrex)]>,
- Requires<[IsARM, HasV7]> {
+ Requires<[IsARM, HasV6K]> {
let Inst{31-0} = 0b11110101011111111111000000011111;
}
@@ -5242,6 +5265,12 @@ def win__chkstk : SDNode<"ARMISD::WIN__CHKSTK", SDTNone,
let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP] in
def WIN__CHKSTK : PseudoInst<(outs), (ins), NoItinerary, [(win__chkstk)]>;
+def win__dbzchk : SDNode<"ARMISD::WIN__DBZCHK", SDT_WIN__DBZCHK,
+ [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
+let usesCustomInserter = 1, Defs = [CPSR] in
+ def WIN__DBZCHK : PseudoInst<(outs), (ins GPR:$divisor), NoItinerary,
+ [(win__dbzchk GPR:$divisor)]>;
+
//===----------------------------------------------------------------------===//
// TLS Instructions
//
@@ -5301,6 +5330,10 @@ def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch),
Requires<[IsARM]>;
}
+let isBarrier = 1, hasSideEffects = 1, usesCustomInserter = 1 in
+def Int_eh_sjlj_setup_dispatch : PseudoInst<(outs), (ins), NoItinerary,
+ [(ARMeh_sjlj_setup_dispatch)]>;
+
// eh.sjlj.dispatchsetup pseudo-instruction.
// This pseudo is used for both ARM and Thumb. Any differences are handled when
// the pseudo is expanded (which happens before any passes that need the
@@ -5622,16 +5655,16 @@ def : ARMInstAlias<"mvn${s}${p} $Rd, $imm",
(MOVi rGPR:$Rd, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
// Same for AND <--> BIC
def : ARMInstAlias<"bic${s}${p} $Rd, $Rn, $imm",
- (ANDri rGPR:$Rd, rGPR:$Rn, mod_imm_not:$imm,
+ (ANDri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm,
pred:$p, cc_out:$s)>;
def : ARMInstAlias<"bic${s}${p} $Rdn, $imm",
- (ANDri rGPR:$Rdn, rGPR:$Rdn, mod_imm_not:$imm,
+ (ANDri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm,
pred:$p, cc_out:$s)>;
def : ARMInstAlias<"and${s}${p} $Rd, $Rn, $imm",
- (BICri rGPR:$Rd, rGPR:$Rn, mod_imm_not:$imm,
+ (BICri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm,
pred:$p, cc_out:$s)>;
def : ARMInstAlias<"and${s}${p} $Rdn, $imm",
- (BICri rGPR:$Rdn, rGPR:$Rdn, mod_imm_not:$imm,
+ (BICri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm,
pred:$p, cc_out:$s)>;
// Likewise, "add Rd, mod_imm_neg" -> sub
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index f035d6150ec0..7020ffb41b64 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -587,11 +587,6 @@ def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
def NEONvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>;
def NEONvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>;
-def SDTARMFMAX : SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisSameAs<0, 1>,
- SDTCisSameAs<0, 2>]>;
-def NEONfmax : SDNode<"ARMISD::FMAX", SDTARMFMAX>;
-def NEONfmin : SDNode<"ARMISD::FMIN", SDTARMFMAX>;
-
def NEONimmAllZerosV: PatLeaf<(NEONvmovImm (i32 timm)), [{
ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0));
unsigned EltBits = 0;
@@ -2465,17 +2460,17 @@ class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
[(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
// Same as above, but not predicated.
-class N2VDIntnp<bits<2> op17_16, bits<3> op10_8, bit op7,
+class N2VDIntnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op7,
InstrItinClass itin, string OpcodeStr, string Dt,
ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
- : N2Vnp<0b10, op17_16, op10_8, op7, 0, (outs DPR:$Vd), (ins DPR:$Vm),
+ : N2Vnp<op19_18, op17_16, op10_8, op7, 0, (outs DPR:$Vd), (ins DPR:$Vm),
itin, OpcodeStr, Dt,
[(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>;
-class N2VQIntnp<bits<2> op17_16, bits<3> op10_8, bit op7,
+class N2VQIntnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op7,
InstrItinClass itin, string OpcodeStr, string Dt,
ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
- : N2Vnp<0b10, op17_16, op10_8, op7, 1, (outs QPR:$Vd), (ins QPR:$Vm),
+ : N2Vnp<op19_18, op17_16, op10_8, op7, 1, (outs QPR:$Vd), (ins QPR:$Vm),
itin, OpcodeStr, Dt,
[(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
@@ -3255,6 +3250,13 @@ multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
[(set DPR:$Vd, (v2i32 (OpNode (v2f32 DPR:$Vm))))]> {
let Inst{10} = 1; // overwrite F = 1
}
+ def v4f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4,
+ (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
+ opc, "f16", asm, "",
+ [(set DPR:$Vd, (v4i16 (OpNode (v4f16 DPR:$Vm))))]>,
+ Requires<[HasNEON,HasFullFP16]> {
+ let Inst{10} = 1; // overwrite F = 1
+ }
// 128-bit vector types.
def v16i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 1, op4,
@@ -3275,6 +3277,13 @@ multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
[(set QPR:$Vd, (v4i32 (OpNode (v4f32 QPR:$Vm))))]> {
let Inst{10} = 1; // overwrite F = 1
}
+ def v8f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4,
+ (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
+ opc, "f16", asm, "",
+ [(set QPR:$Vd, (v8i16 (OpNode (v8f16 QPR:$Vm))))]>,
+ Requires<[HasNEON,HasFullFP16]> {
+ let Inst{10} = 1; // overwrite F = 1
+ }
}
@@ -4110,6 +4119,12 @@ def VADDfd : N3VD<0, 0, 0b00, 0b1101, 0, IIC_VBIND, "vadd", "f32",
v2f32, v2f32, fadd, 1>;
def VADDfq : N3VQ<0, 0, 0b00, 0b1101, 0, IIC_VBINQ, "vadd", "f32",
v4f32, v4f32, fadd, 1>;
+def VADDhd : N3VD<0, 0, 0b01, 0b1101, 0, IIC_VBIND, "vadd", "f16",
+ v4f16, v4f16, fadd, 1>,
+ Requires<[HasNEON,HasFullFP16]>;
+def VADDhq : N3VQ<0, 0, 0b01, 0b1101, 0, IIC_VBINQ, "vadd", "f16",
+ v8f16, v8f16, fadd, 1>,
+ Requires<[HasNEON,HasFullFP16]>;
// VADDL : Vector Add Long (Q = D + D)
defm VADDLs : N3VLExt_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
"vaddl", "s", add, sext, 1>;
@@ -4165,10 +4180,21 @@ def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VFMULD, "vmul", "f32",
v2f32, v2f32, fmul, 1>;
def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VFMULQ, "vmul", "f32",
v4f32, v4f32, fmul, 1>;
+def VMULhd : N3VD<1, 0, 0b01, 0b1101, 1, IIC_VFMULD, "vmul", "f16",
+ v4f16, v4f16, fmul, 1>,
+ Requires<[HasNEON,HasFullFP16]>;
+def VMULhq : N3VQ<1, 0, 0b01, 0b1101, 1, IIC_VFMULQ, "vmul", "f16",
+ v8f16, v8f16, fmul, 1>,
+ Requires<[HasNEON,HasFullFP16]>;
defm VMULsl : N3VSL_HS<0b1000, "vmul", mul>;
def VMULslfd : N3VDSL<0b10, 0b1001, IIC_VBIND, "vmul", "f32", v2f32, fmul>;
def VMULslfq : N3VQSL<0b10, 0b1001, IIC_VBINQ, "vmul", "f32", v4f32,
v2f32, fmul>;
+def VMULslhd : N3VDSL16<0b01, 0b1001, "vmul", "f16", v4f16, fmul>,
+ Requires<[HasNEON,HasFullFP16]>;
+def VMULslhq : N3VQSL16<0b01, 0b1001, "vmul", "f16", v8f16,
+ v4f16, fmul>,
+ Requires<[HasNEON,HasFullFP16]>;
def : Pat<(v8i16 (mul (v8i16 QPR:$src1),
(v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))),
@@ -4277,6 +4303,12 @@ def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32",
def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32",
v4f32, fmul_su, fadd_mlx>,
Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+def VMLAhd : N3VDMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACD, "vmla", "f16",
+ v4f16, fmul_su, fadd_mlx>,
+ Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
+def VMLAhq : N3VQMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACQ, "vmla", "f16",
+ v8f16, fmul_su, fadd_mlx>,
+ Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
defm VMLAsl : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D,
IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32",
@@ -4285,6 +4317,12 @@ def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32",
def VMLAslfq : N3VQMulOpSL<0b10, 0b0001, IIC_VMACQ, "vmla", "f32",
v4f32, v2f32, fmul_su, fadd_mlx>,
Requires<[HasNEON, UseFPVMLx]>;
+def VMLAslhd : N3VDMulOpSL16<0b01, 0b0001, IIC_VMACD, "vmla", "f16",
+ v4f16, fmul, fadd>,
+ Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
+def VMLAslhq : N3VQMulOpSL16<0b01, 0b0001, IIC_VMACQ, "vmla", "f16",
+ v8f16, v4f16, fmul, fadd>,
+ Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
def : Pat<(v8i16 (add (v8i16 QPR:$src1),
(mul (v8i16 QPR:$src2),
@@ -4495,6 +4533,12 @@ def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32",
def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32",
v4f32, fmul_su, fsub_mlx>,
Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+def VMLShd : N3VDMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACD, "vmls", "f16",
+ v4f16, fmul, fsub>,
+ Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
+def VMLShq : N3VQMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACQ, "vmls", "f16",
+ v8f16, fmul, fsub>,
+ Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
defm VMLSsl : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D,
IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32",
@@ -4503,6 +4547,12 @@ def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32",
def VMLSslfq : N3VQMulOpSL<0b10, 0b0101, IIC_VMACQ, "vmls", "f32",
v4f32, v2f32, fmul_su, fsub_mlx>,
Requires<[HasNEON, UseFPVMLx]>;
+def VMLSslhd : N3VDMulOpSL16<0b01, 0b0101, IIC_VMACD, "vmls", "f16",
+ v4f16, fmul, fsub>,
+ Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
+def VMLSslhq : N3VQMulOpSL16<0b01, 0b0101, IIC_VMACQ, "vmls", "f16",
+ v8f16, v4f16, fmul, fsub>,
+ Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
def : Pat<(v8i16 (sub (v8i16 QPR:$src1),
(mul (v8i16 QPR:$src2),
@@ -4570,6 +4620,13 @@ def VFMAfd : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32",
def VFMAfq : N3VQMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACQ, "vfma", "f32",
v4f32, fmul_su, fadd_mlx>,
Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
+def VFMAhd : N3VDMulOp<0, 0, 0b01, 0b1100, 1, IIC_VFMACD, "vfma", "f16",
+ v4f16, fmul, fadd>,
+ Requires<[HasNEON,HasFullFP16,UseFusedMAC]>;
+
+def VFMAhq : N3VQMulOp<0, 0, 0b01, 0b1100, 1, IIC_VFMACQ, "vfma", "f16",
+ v8f16, fmul, fadd>,
+ Requires<[HasNEON,HasFullFP16,UseFusedMAC]>;
// Fused Vector Multiply Subtract (floating-point)
def VFMSfd : N3VDMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACD, "vfms", "f32",
@@ -4578,6 +4635,12 @@ def VFMSfd : N3VDMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACD, "vfms", "f32",
def VFMSfq : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32",
v4f32, fmul_su, fsub_mlx>,
Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
+def VFMShd : N3VDMulOp<0, 0, 0b11, 0b1100, 1, IIC_VFMACD, "vfms", "f16",
+ v4f16, fmul, fsub>,
+ Requires<[HasNEON,HasFullFP16,UseFusedMAC]>;
+def VFMShq : N3VQMulOp<0, 0, 0b11, 0b1100, 1, IIC_VFMACQ, "vfms", "f16",
+ v8f16, fmul, fsub>,
+ Requires<[HasNEON,HasFullFP16,UseFusedMAC]>;
// Match @llvm.fma.* intrinsics
def : Pat<(v2f32 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)),
@@ -4602,6 +4665,12 @@ def VSUBfd : N3VD<0, 0, 0b10, 0b1101, 0, IIC_VBIND, "vsub", "f32",
v2f32, v2f32, fsub, 0>;
def VSUBfq : N3VQ<0, 0, 0b10, 0b1101, 0, IIC_VBINQ, "vsub", "f32",
v4f32, v4f32, fsub, 0>;
+def VSUBhd : N3VD<0, 0, 0b11, 0b1101, 0, IIC_VBIND, "vsub", "f16",
+ v4f16, v4f16, fsub, 0>,
+ Requires<[HasNEON,HasFullFP16]>;
+def VSUBhq : N3VQ<0, 0, 0b11, 0b1101, 0, IIC_VBINQ, "vsub", "f16",
+ v8f16, v8f16, fsub, 0>,
+ Requires<[HasNEON,HasFullFP16]>;
// VSUBL : Vector Subtract Long (Q = D - D)
defm VSUBLs : N3VLExt_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
"vsubl", "s", sub, sext, 0>;
@@ -4646,6 +4715,12 @@ def VCEQfd : N3VD<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32,
NEONvceq, 1>;
def VCEQfq : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32,
NEONvceq, 1>;
+def VCEQhd : N3VD<0,0,0b01,0b1110,0, IIC_VBIND, "vceq", "f16", v4i16, v4f16,
+ NEONvceq, 1>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VCEQhq : N3VQ<0,0,0b01,0b1110,0, IIC_VBINQ, "vceq", "f16", v8i16, v8f16,
+ NEONvceq, 1>,
+ Requires<[HasNEON, HasFullFP16]>;
let TwoOperandAliasConstraint = "$Vm = $Vd" in
defm VCEQz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i",
@@ -4660,6 +4735,12 @@ def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32,
NEONvcge, 0>;
def VCGEfq : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32,
NEONvcge, 0>;
+def VCGEhd : N3VD<1,0,0b01,0b1110,0, IIC_VBIND, "vcge", "f16", v4i16, v4f16,
+ NEONvcge, 0>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VCGEhq : N3VQ<1,0,0b01,0b1110,0, IIC_VBINQ, "vcge", "f16", v8i16, v8f16,
+ NEONvcge, 0>,
+ Requires<[HasNEON, HasFullFP16]>;
let TwoOperandAliasConstraint = "$Vm = $Vd" in {
defm VCGEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00001, 0, "vcge", "s",
@@ -4677,6 +4758,12 @@ def VCGTfd : N3VD<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32,
NEONvcgt, 0>;
def VCGTfq : N3VQ<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32,
NEONvcgt, 0>;
+def VCGThd : N3VD<1,0,0b11,0b1110,0, IIC_VBIND, "vcgt", "f16", v4i16, v4f16,
+ NEONvcgt, 0>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VCGThq : N3VQ<1,0,0b11,0b1110,0, IIC_VBINQ, "vcgt", "f16", v8i16, v8f16,
+ NEONvcgt, 0>,
+ Requires<[HasNEON, HasFullFP16]>;
let TwoOperandAliasConstraint = "$Vm = $Vd" in {
defm VCGTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00000, 0, "vcgt", "s",
@@ -4686,36 +4773,68 @@ defm VCLTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s",
}
// VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE)
-def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge",
+def VACGEfd : N3VDInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge",
"f32", v2i32, v2f32, int_arm_neon_vacge, 0>;
-def VACGEq : N3VQInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge",
+def VACGEfq : N3VQInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge",
"f32", v4i32, v4f32, int_arm_neon_vacge, 0>;
+def VACGEhd : N3VDInt<1, 0, 0b01, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge",
+ "f16", v4i16, v4f16, int_arm_neon_vacge, 0>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VACGEhq : N3VQInt<1, 0, 0b01, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge",
+ "f16", v8i16, v8f16, int_arm_neon_vacge, 0>,
+ Requires<[HasNEON, HasFullFP16]>;
// VACGT : Vector Absolute Compare Greater Than (aka VCAGT)
-def VACGTd : N3VDInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt",
+def VACGTfd : N3VDInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt",
"f32", v2i32, v2f32, int_arm_neon_vacgt, 0>;
-def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt",
+def VACGTfq : N3VQInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt",
"f32", v4i32, v4f32, int_arm_neon_vacgt, 0>;
+def VACGThd : N3VDInt<1, 0, 0b11, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt",
+ "f16", v4i16, v4f16, int_arm_neon_vacgt, 0>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VACGThq : N3VQInt<1, 0, 0b11, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt",
+ "f16", v8f16, v8f16, int_arm_neon_vacgt, 0>,
+ Requires<[HasNEON, HasFullFP16]>;
// VTST : Vector Test Bits
defm VTST : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
IIC_VBINi4Q, "vtst", "", NEONvtst, 1>;
def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vn, $Vm",
- (VACGTd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>;
+ (VACGTfd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>;
def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vn, $Vm",
- (VACGTq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>;
+ (VACGTfq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>;
def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vn, $Vm",
- (VACGEd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>;
+ (VACGEfd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>;
def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vn, $Vm",
- (VACGEq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>;
+ (VACGEfq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>;
+let Predicates = [HasNEON, HasFullFP16] in {
+def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vn, $Vm",
+ (VACGThd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vn, $Vm",
+ (VACGThq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vn, $Vm",
+ (VACGEhd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vn, $Vm",
+ (VACGEhq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>;
+}
def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vm",
- (VACGTd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>;
+ (VACGTfd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>;
def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vm",
- (VACGTq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>;
+ (VACGTfq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>;
def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vm",
- (VACGEd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>;
+ (VACGEfd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>;
def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vm",
- (VACGEq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>;
+ (VACGEfq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>;
+let Predicates = [HasNEON, HasFullFP16] in {
+def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vm",
+ (VACGThd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vm",
+ (VACGThq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vm",
+ (VACGEhd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vm",
+ (VACGEhq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>;
+}
// Vector Bitwise Operations.
@@ -5007,6 +5126,12 @@ def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBIND,
"vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 1>;
def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBINQ,
"vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 1>;
+def VABDhd : N3VDInt<1, 0, 0b11, 0b1101, 0, N3RegFrm, IIC_VBIND,
+ "vabd", "f16", v4f16, v4f16, int_arm_neon_vabds, 1>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VABDhq : N3VQInt<1, 0, 0b11, 0b1101, 0, N3RegFrm, IIC_VBINQ,
+ "vabd", "f16", v8f16, v8f16, int_arm_neon_vabds, 1>,
+ Requires<[HasNEON, HasFullFP16]>;
// VABDL : Vector Absolute Difference Long (Q = | D - D |)
defm VABDLs : N3VLIntExt_QHS<0,1,0b0111,0, IIC_VSUBi4Q,
@@ -5014,6 +5139,29 @@ defm VABDLs : N3VLIntExt_QHS<0,1,0b0111,0, IIC_VSUBi4Q,
defm VABDLu : N3VLIntExt_QHS<1,1,0b0111,0, IIC_VSUBi4Q,
"vabdl", "u", int_arm_neon_vabdu, zext, 1>;
+def abd_shr :
+ PatFrag<(ops node:$in1, node:$in2, node:$shift),
+ (NEONvshrs (sub (zext node:$in1),
+ (zext node:$in2)), (i32 $shift))>;
+
+def : Pat<(xor (v4i32 (bitconvert (v8i16 (abd_shr (v8i8 DPR:$opA), (v8i8 DPR:$opB), 15)))),
+ (v4i32 (bitconvert (v8i16 (add (sub (zext (v8i8 DPR:$opA)),
+ (zext (v8i8 DPR:$opB))),
+ (v8i16 (abd_shr (v8i8 DPR:$opA), (v8i8 DPR:$opB), 15))))))),
+ (VABDLuv8i16 DPR:$opA, DPR:$opB)>;
+
+def : Pat<(xor (v4i32 (abd_shr (v4i16 DPR:$opA), (v4i16 DPR:$opB), 31)),
+ (v4i32 (add (sub (zext (v4i16 DPR:$opA)),
+ (zext (v4i16 DPR:$opB))),
+ (abd_shr (v4i16 DPR:$opA), (v4i16 DPR:$opB), 31)))),
+ (VABDLuv4i32 DPR:$opA, DPR:$opB)>;
+
+def : Pat<(xor (v4i32 (bitconvert (v2i64 (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))),
+ (v4i32 (bitconvert (v2i64 (add (sub (zext (v2i32 DPR:$opA)),
+ (zext (v2i32 DPR:$opB))),
+ (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))))),
+ (VABDLuv2i64 DPR:$opA, DPR:$opB)>;
+
// VABA : Vector Absolute Difference and Accumulate
defm VABAs : N3VIntOp_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
"vaba", "s", int_arm_neon_vabds, add>;
@@ -5031,53 +5179,85 @@ defm VABALu : N3VLIntExtOp_QHS<1,1,0b0101,0, IIC_VABAD,
// VMAX : Vector Maximum
defm VMAXs : N3VInt_QHS<0, 0, 0b0110, 0, N3RegFrm,
IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
- "vmax", "s", int_arm_neon_vmaxs, 1>;
+ "vmax", "s", smax, 1>;
defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, N3RegFrm,
IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
- "vmax", "u", int_arm_neon_vmaxu, 1>;
+ "vmax", "u", umax, 1>;
def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBIND,
"vmax", "f32",
- v2f32, v2f32, int_arm_neon_vmaxs, 1>;
+ v2f32, v2f32, fmaxnan, 1>;
def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINQ,
"vmax", "f32",
- v4f32, v4f32, int_arm_neon_vmaxs, 1>;
+ v4f32, v4f32, fmaxnan, 1>;
+def VMAXhd : N3VDInt<0, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VBIND,
+ "vmax", "f16",
+ v4f16, v4f16, fmaxnan, 1>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VMAXhq : N3VQInt<0, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VBINQ,
+ "vmax", "f16",
+ v8f16, v8f16, fmaxnan, 1>,
+ Requires<[HasNEON, HasFullFP16]>;
// VMAXNM
let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
- def VMAXNMND : N3VDIntnp<0b00110, 0b00, 0b1111, 0, 1,
+ def VMAXNMNDf : N3VDIntnp<0b00110, 0b00, 0b1111, 0, 1,
N3RegFrm, NoItinerary, "vmaxnm", "f32",
- v2f32, v2f32, int_arm_neon_vmaxnm, 1>,
+ v2f32, v2f32, fmaxnum, 1>,
Requires<[HasV8, HasNEON]>;
- def VMAXNMNQ : N3VQIntnp<0b00110, 0b00, 0b1111, 1, 1,
+ def VMAXNMNQf : N3VQIntnp<0b00110, 0b00, 0b1111, 1, 1,
N3RegFrm, NoItinerary, "vmaxnm", "f32",
- v4f32, v4f32, int_arm_neon_vmaxnm, 1>,
+ v4f32, v4f32, fmaxnum, 1>,
Requires<[HasV8, HasNEON]>;
+ def VMAXNMNDh : N3VDIntnp<0b00110, 0b01, 0b1111, 0, 1,
+ N3RegFrm, NoItinerary, "vmaxnm", "f16",
+ v4f16, v4f16, fmaxnum, 1>,
+ Requires<[HasV8, HasNEON, HasFullFP16]>;
+ def VMAXNMNQh : N3VQIntnp<0b00110, 0b01, 0b1111, 1, 1,
+ N3RegFrm, NoItinerary, "vmaxnm", "f16",
+ v8f16, v8f16, fmaxnum, 1>,
+ Requires<[HasV8, HasNEON, HasFullFP16]>;
}
// VMIN : Vector Minimum
defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, N3RegFrm,
IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
- "vmin", "s", int_arm_neon_vmins, 1>;
+ "vmin", "s", smin, 1>;
defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, N3RegFrm,
IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
- "vmin", "u", int_arm_neon_vminu, 1>;
+ "vmin", "u", umin, 1>;
def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBIND,
"vmin", "f32",
- v2f32, v2f32, int_arm_neon_vmins, 1>;
+ v2f32, v2f32, fminnan, 1>;
def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINQ,
"vmin", "f32",
- v4f32, v4f32, int_arm_neon_vmins, 1>;
+ v4f32, v4f32, fminnan, 1>;
+def VMINhd : N3VDInt<0, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VBIND,
+ "vmin", "f16",
+ v4f16, v4f16, fminnan, 1>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VMINhq : N3VQInt<0, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VBINQ,
+ "vmin", "f16",
+ v8f16, v8f16, fminnan, 1>,
+ Requires<[HasNEON, HasFullFP16]>;
// VMINNM
let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
- def VMINNMND : N3VDIntnp<0b00110, 0b10, 0b1111, 0, 1,
+ def VMINNMNDf : N3VDIntnp<0b00110, 0b10, 0b1111, 0, 1,
N3RegFrm, NoItinerary, "vminnm", "f32",
- v2f32, v2f32, int_arm_neon_vminnm, 1>,
+ v2f32, v2f32, fminnum, 1>,
Requires<[HasV8, HasNEON]>;
- def VMINNMNQ : N3VQIntnp<0b00110, 0b10, 0b1111, 1, 1,
+ def VMINNMNQf : N3VQIntnp<0b00110, 0b10, 0b1111, 1, 1,
N3RegFrm, NoItinerary, "vminnm", "f32",
- v4f32, v4f32, int_arm_neon_vminnm, 1>,
+ v4f32, v4f32, fminnum, 1>,
Requires<[HasV8, HasNEON]>;
+ def VMINNMNDh : N3VDIntnp<0b00110, 0b11, 0b1111, 0, 1,
+ N3RegFrm, NoItinerary, "vminnm", "f16",
+ v4f16, v4f16, fminnum, 1>,
+ Requires<[HasV8, HasNEON, HasFullFP16]>;
+ def VMINNMNQh : N3VQIntnp<0b00110, 0b11, 0b1111, 1, 1,
+ N3RegFrm, NoItinerary, "vminnm", "f16",
+ v8f16, v8f16, fminnum, 1>,
+ Requires<[HasV8, HasNEON, HasFullFP16]>;
}
// Vector Pairwise Operations.
@@ -5095,6 +5275,10 @@ def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, N3RegFrm, IIC_VSHLiD,
def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, N3RegFrm,
IIC_VPBIND, "vpadd", "f32",
v2f32, v2f32, int_arm_neon_vpadd, 0>;
+def VPADDh : N3VDInt<1, 0, 0b01, 0b1101, 0, N3RegFrm,
+ IIC_VPBIND, "vpadd", "f16",
+ v4f16, v4f16, int_arm_neon_vpadd, 0>,
+ Requires<[HasNEON, HasFullFP16]>;
// VPADDL : Vector Pairwise Add Long
defm VPADDLs : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00100, 0, "vpaddl", "s",
@@ -5123,6 +5307,9 @@ def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
"u32", v2i32, v2i32, int_arm_neon_vpmaxu, 0>;
def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmax",
"f32", v2f32, v2f32, int_arm_neon_vpmaxs, 0>;
+def VPMAXh : N3VDInt<1, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmax",
+ "f16", v4f16, v4f16, int_arm_neon_vpmaxs, 0>,
+ Requires<[HasNEON, HasFullFP16]>;
// VPMIN : Vector Pairwise Minimum
def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
@@ -5139,6 +5326,9 @@ def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
"u32", v2i32, v2i32, int_arm_neon_vpminu, 0>;
def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmin",
"f32", v2f32, v2f32, int_arm_neon_vpmins, 0>;
+def VPMINh : N3VDInt<1, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmin",
+ "f16", v4f16, v4f16, int_arm_neon_vpmins, 0>,
+ Requires<[HasNEON, HasFullFP16]>;
// Vector Reciprocal and Reciprocal Square Root Estimate and Step.
@@ -5155,6 +5345,14 @@ def VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0,
def VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0,
IIC_VUNAQ, "vrecpe", "f32",
v4f32, v4f32, int_arm_neon_vrecpe>;
+def VRECPEhd : N2VDInt<0b11, 0b11, 0b01, 0b11, 0b01010, 0,
+ IIC_VUNAD, "vrecpe", "f16",
+ v4f16, v4f16, int_arm_neon_vrecpe>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VRECPEhq : N2VQInt<0b11, 0b11, 0b01, 0b11, 0b01010, 0,
+ IIC_VUNAQ, "vrecpe", "f16",
+ v8f16, v8f16, int_arm_neon_vrecpe>,
+ Requires<[HasNEON, HasFullFP16]>;
// VRECPS : Vector Reciprocal Step
def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, N3RegFrm,
@@ -5163,6 +5361,14 @@ def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, N3RegFrm,
def VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, N3RegFrm,
IIC_VRECSQ, "vrecps", "f32",
v4f32, v4f32, int_arm_neon_vrecps, 1>;
+def VRECPShd : N3VDInt<0, 0, 0b01, 0b1111, 1, N3RegFrm,
+ IIC_VRECSD, "vrecps", "f16",
+ v4f16, v4f16, int_arm_neon_vrecps, 1>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VRECPShq : N3VQInt<0, 0, 0b01, 0b1111, 1, N3RegFrm,
+ IIC_VRECSQ, "vrecps", "f16",
+ v8f16, v8f16, int_arm_neon_vrecps, 1>,
+ Requires<[HasNEON, HasFullFP16]>;
// VRSQRTE : Vector Reciprocal Square Root Estimate
def VRSQRTEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0,
@@ -5177,6 +5383,14 @@ def VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0,
def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0,
IIC_VUNAQ, "vrsqrte", "f32",
v4f32, v4f32, int_arm_neon_vrsqrte>;
+def VRSQRTEhd : N2VDInt<0b11, 0b11, 0b01, 0b11, 0b01011, 0,
+ IIC_VUNAD, "vrsqrte", "f16",
+ v4f16, v4f16, int_arm_neon_vrsqrte>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VRSQRTEhq : N2VQInt<0b11, 0b11, 0b01, 0b11, 0b01011, 0,
+ IIC_VUNAQ, "vrsqrte", "f16",
+ v8f16, v8f16, int_arm_neon_vrsqrte>,
+ Requires<[HasNEON, HasFullFP16]>;
// VRSQRTS : Vector Reciprocal Square Root Step
def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, N3RegFrm,
@@ -5185,6 +5399,14 @@ def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, N3RegFrm,
def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, N3RegFrm,
IIC_VRECSQ, "vrsqrts", "f32",
v4f32, v4f32, int_arm_neon_vrsqrts, 1>;
+def VRSQRTShd : N3VDInt<0, 0, 0b11, 0b1111, 1, N3RegFrm,
+ IIC_VRECSD, "vrsqrts", "f16",
+ v4f16, v4f16, int_arm_neon_vrsqrts, 1>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VRSQRTShq : N3VQInt<0, 0, 0b11, 0b1111, 1, N3RegFrm,
+ IIC_VRECSQ, "vrsqrts", "f16",
+ v8f16, v8f16, int_arm_neon_vrsqrts, 1>,
+ Requires<[HasNEON, HasFullFP16]>;
// Vector Shifts.
@@ -5336,6 +5558,14 @@ def VABSfd : N2VD<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
def VABSfq : N2VQ<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
"vabs", "f32",
v4f32, v4f32, fabs>;
+def VABShd : N2VD<0b11, 0b11, 0b01, 0b01, 0b01110, 0,
+ "vabs", "f16",
+ v4f16, v4f16, fabs>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VABShq : N2VQ<0b11, 0b11, 0b01, 0b01, 0b01110, 0,
+ "vabs", "f16",
+ v8f16, v8f16, fabs>,
+ Requires<[HasNEON, HasFullFP16]>;
def : Pat<(xor (v2i32 (bitconvert (v8i8 (NEONvshrs DPR:$src, (i32 7))))),
(v2i32 (bitconvert (v8i8 (add DPR:$src,
@@ -5398,6 +5628,16 @@ def VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0,
(outs QPR:$Vd), (ins QPR:$Vm), IIC_VUNAQ,
"vneg", "f32", "$Vd, $Vm", "",
[(set QPR:$Vd, (v4f32 (fneg QPR:$Vm)))]>;
+def VNEGhd : N2V<0b11, 0b11, 0b01, 0b01, 0b01111, 0, 0,
+ (outs DPR:$Vd), (ins DPR:$Vm), IIC_VUNAD,
+ "vneg", "f16", "$Vd, $Vm", "",
+ [(set DPR:$Vd, (v4f16 (fneg DPR:$Vm)))]>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VNEGhq : N2V<0b11, 0b11, 0b01, 0b01, 0b01111, 1, 0,
+ (outs QPR:$Vd), (ins QPR:$Vm), IIC_VUNAQ,
+ "vneg", "f16", "$Vd, $Vm", "",
+ [(set QPR:$Vd, (v8f16 (fneg QPR:$Vm)))]>,
+ Requires<[HasNEON, HasFullFP16]>;
def : Pat<(v8i8 (vnegd DPR:$src)), (VNEGs8d DPR:$src)>;
def : Pat<(v4i16 (vnegd DPR:$src)), (VNEGs16d DPR:$src)>;
@@ -5868,18 +6108,56 @@ def VCVTs2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32",
def VCVTu2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32",
v4f32, v4i32, uint_to_fp>;
+def VCVTh2sd : N2VD<0b11, 0b11, 0b01, 0b11, 0b01110, 0, "vcvt", "s16.f16",
+ v4i16, v4f16, fp_to_sint>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VCVTh2ud : N2VD<0b11, 0b11, 0b01, 0b11, 0b01111, 0, "vcvt", "u16.f16",
+ v4i16, v4f16, fp_to_uint>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VCVTs2hd : N2VD<0b11, 0b11, 0b01, 0b11, 0b01100, 0, "vcvt", "f16.s16",
+ v4f16, v4i16, sint_to_fp>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VCVTu2hd : N2VD<0b11, 0b11, 0b01, 0b11, 0b01101, 0, "vcvt", "f16.u16",
+ v4f16, v4i16, uint_to_fp>,
+ Requires<[HasNEON, HasFullFP16]>;
+
+def VCVTh2sq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01110, 0, "vcvt", "s16.f16",
+ v8i16, v8f16, fp_to_sint>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VCVTh2uq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01111, 0, "vcvt", "u16.f16",
+ v8i16, v8f16, fp_to_uint>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VCVTs2hq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01100, 0, "vcvt", "f16.s16",
+ v8f16, v8i16, sint_to_fp>,
+ Requires<[HasNEON, HasFullFP16]>;
+def VCVTu2hq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01101, 0, "vcvt", "f16.u16",
+ v8f16, v8i16, uint_to_fp>,
+ Requires<[HasNEON, HasFullFP16]>;
+
// VCVT{A, N, P, M}
multiclass VCVT_FPI<string op, bits<3> op10_8, SDPatternOperator IntS,
SDPatternOperator IntU> {
let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
- def SD : N2VDIntnp<0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op),
+ def SDf : N2VDIntnp<0b10, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op),
"s32.f32", v2i32, v2f32, IntS>, Requires<[HasV8, HasNEON]>;
- def SQ : N2VQIntnp<0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op),
+ def SQf : N2VQIntnp<0b10, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op),
"s32.f32", v4i32, v4f32, IntS>, Requires<[HasV8, HasNEON]>;
- def UD : N2VDIntnp<0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op),
+ def UDf : N2VDIntnp<0b10, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op),
"u32.f32", v2i32, v2f32, IntU>, Requires<[HasV8, HasNEON]>;
- def UQ : N2VQIntnp<0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op),
+ def UQf : N2VQIntnp<0b10, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op),
"u32.f32", v4i32, v4f32, IntU>, Requires<[HasV8, HasNEON]>;
+ def SDh : N2VDIntnp<0b01, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op),
+ "s16.f16", v4i16, v4f16, IntS>,
+ Requires<[HasV8, HasNEON, HasFullFP16]>;
+ def SQh : N2VQIntnp<0b01, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op),
+ "s16.f16", v8i16, v8f16, IntS>,
+ Requires<[HasV8, HasNEON, HasFullFP16]>;
+ def UDh : N2VDIntnp<0b01, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op),
+ "u16.f16", v4i16, v4f16, IntU>,
+ Requires<[HasV8, HasNEON, HasFullFP16]>;
+ def UQh : N2VQIntnp<0b01, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op),
+ "u16.f16", v8i16, v8f16, IntU>,
+ Requires<[HasV8, HasNEON, HasFullFP16]>;
}
}
@@ -5898,6 +6176,16 @@ def VCVTxs2fd : N2VCvtD<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32",
v2f32, v2i32, int_arm_neon_vcvtfxs2fp>;
def VCVTxu2fd : N2VCvtD<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32",
v2f32, v2i32, int_arm_neon_vcvtfxu2fp>;
+let Predicates = [HasNEON, HasFullFP16] in {
+def VCVTh2xsd : N2VCvtD<0, 1, 0b1101, 0, 1, "vcvt", "s16.f16",
+ v4i16, v4f16, int_arm_neon_vcvtfp2fxs>;
+def VCVTh2xud : N2VCvtD<1, 1, 0b1101, 0, 1, "vcvt", "u16.f16",
+ v4i16, v4f16, int_arm_neon_vcvtfp2fxu>;
+def VCVTxs2hd : N2VCvtD<0, 1, 0b1100, 0, 1, "vcvt", "f16.s16",
+ v4f16, v4i16, int_arm_neon_vcvtfxs2fp>;
+def VCVTxu2hd : N2VCvtD<1, 1, 0b1100, 0, 1, "vcvt", "f16.u16",
+ v4f16, v4i16, int_arm_neon_vcvtfxu2fp>;
+} // Predicates = [HasNEON, HasFullFP16]
}
let DecoderMethod = "DecodeVCVTQ" in {
@@ -5909,6 +6197,16 @@ def VCVTxs2fq : N2VCvtQ<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32",
v4f32, v4i32, int_arm_neon_vcvtfxs2fp>;
def VCVTxu2fq : N2VCvtQ<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32",
v4f32, v4i32, int_arm_neon_vcvtfxu2fp>;
+let Predicates = [HasNEON, HasFullFP16] in {
+def VCVTh2xsq : N2VCvtQ<0, 1, 0b1101, 0, 1, "vcvt", "s16.f16",
+ v8i16, v8f16, int_arm_neon_vcvtfp2fxs>;
+def VCVTh2xuq : N2VCvtQ<1, 1, 0b1101, 0, 1, "vcvt", "u16.f16",
+ v8i16, v8f16, int_arm_neon_vcvtfp2fxu>;
+def VCVTxs2hq : N2VCvtQ<0, 1, 0b1100, 0, 1, "vcvt", "f16.s16",
+ v8f16, v8i16, int_arm_neon_vcvtfxs2fp>;
+def VCVTxu2hq : N2VCvtQ<1, 1, 0b1100, 0, 1, "vcvt", "f16.u16",
+ v8f16, v8i16, int_arm_neon_vcvtfxu2fp>;
+} // Predicates = [HasNEON, HasFullFP16]
}
def : NEONInstAlias<"vcvt${p}.s32.f32 $Dd, $Dm, #0",
@@ -5929,6 +6227,24 @@ def : NEONInstAlias<"vcvt${p}.f32.s32 $Qd, $Qm, #0",
def : NEONInstAlias<"vcvt${p}.f32.u32 $Qd, $Qm, #0",
(VCVTu2fq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.s16.f16 $Dd, $Dm, #0",
+ (VCVTh2sd DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.u16.f16 $Dd, $Dm, #0",
+ (VCVTh2ud DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f16.s16 $Dd, $Dm, #0",
+ (VCVTs2hd DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f16.u16 $Dd, $Dm, #0",
+ (VCVTu2hd DPR:$Dd, DPR:$Dm, pred:$p)>;
+
+def : NEONInstAlias<"vcvt${p}.s16.f16 $Qd, $Qm, #0",
+ (VCVTh2sq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.u16.f16 $Qd, $Qm, #0",
+ (VCVTh2uq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f16.s16 $Qd, $Qm, #0",
+ (VCVTs2hq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f16.u16 $Qd, $Qm, #0",
+ (VCVTu2hq QPR:$Qd, QPR:$Qm, pred:$p)>;
+
// VCVT : Vector Convert Between Half-Precision and Single-Precision.
def VCVTf2h : N2VNInt<0b11, 0b11, 0b01, 0b10, 0b01100, 0, 0,
@@ -6182,22 +6498,40 @@ def VTBX4Pseudo
// VRINT : Vector Rounding
multiclass VRINT_FPI<string op, bits<3> op9_7, SDPatternOperator Int> {
let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
- def D : N2VDIntnp<0b10, 0b100, 0, NoItinerary,
+ def Df : N2VDIntnp<0b10, 0b10, 0b100, 0, NoItinerary,
!strconcat("vrint", op), "f32",
v2f32, v2f32, Int>, Requires<[HasV8, HasNEON]> {
let Inst{9-7} = op9_7;
}
- def Q : N2VQIntnp<0b10, 0b100, 0, NoItinerary,
+ def Qf : N2VQIntnp<0b10, 0b10, 0b100, 0, NoItinerary,
!strconcat("vrint", op), "f32",
v4f32, v4f32, Int>, Requires<[HasV8, HasNEON]> {
let Inst{9-7} = op9_7;
}
+ def Dh : N2VDIntnp<0b01, 0b10, 0b100, 0, NoItinerary,
+ !strconcat("vrint", op), "f16",
+ v4f16, v4f16, Int>,
+ Requires<[HasV8, HasNEON, HasFullFP16]> {
+ let Inst{9-7} = op9_7;
+ }
+ def Qh : N2VQIntnp<0b01, 0b10, 0b100, 0, NoItinerary,
+ !strconcat("vrint", op), "f16",
+ v8f16, v8f16, Int>,
+ Requires<[HasV8, HasNEON, HasFullFP16]> {
+ let Inst{9-7} = op9_7;
+ }
}
def : NEONInstAlias<!strconcat("vrint", op, ".f32.f32\t$Dd, $Dm"),
- (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm)>;
+ (!cast<Instruction>(NAME#"Df") DPR:$Dd, DPR:$Dm)>;
def : NEONInstAlias<!strconcat("vrint", op, ".f32.f32\t$Qd, $Qm"),
- (!cast<Instruction>(NAME#"Q") QPR:$Qd, QPR:$Qm)>;
+ (!cast<Instruction>(NAME#"Qf") QPR:$Qd, QPR:$Qm)>;
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def : NEONInstAlias<!strconcat("vrint", op, ".f16.f16\t$Dd, $Dm"),
+ (!cast<Instruction>(NAME#"Dh") DPR:$Dd, DPR:$Dm)>;
+ def : NEONInstAlias<!strconcat("vrint", op, ".f16.f16\t$Qd, $Qm"),
+ (!cast<Instruction>(NAME#"Qh") QPR:$Qd, QPR:$Qm)>;
+ }
}
defm VRINTNN : VRINT_FPI<"n", 0b000, int_arm_neon_vrintn>;
@@ -6343,8 +6677,8 @@ def : N3VSMulOpPat<fmul, fsub, VFMSfd>,
Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>;
def : N2VSPat<fabs, VABSfd>;
def : N2VSPat<fneg, VNEGfd>;
-def : N3VSPat<NEONfmax, VMAXfd>;
-def : N3VSPat<NEONfmin, VMINfd>;
+def : N3VSPat<fmaxnan, VMAXfd>, Requires<[HasNEON]>;
+def : N3VSPat<fminnan, VMINfd>, Requires<[HasNEON]>;
def : NVCVTFIPat<fp_to_sint, VCVTf2sd>;
def : NVCVTFIPat<fp_to_uint, VCVTf2ud>;
def : NVCVTIFPat<sint_to_fp, VCVTs2fd>;
@@ -7704,6 +8038,9 @@ def : NEONInstAlias<"vcle${p}.u32 $Dd, $Dn, $Dm",
(VCGEuv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
def : NEONInstAlias<"vcle${p}.f32 $Dd, $Dn, $Dm",
(VCGEfd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+let Predicates = [HasNEON, HasFullFP16] in
+def : NEONInstAlias<"vcle${p}.f16 $Dd, $Dn, $Dm",
+ (VCGEhd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
// Q-register versions.
def : NEONInstAlias<"vcle${p}.s8 $Qd, $Qn, $Qm",
(VCGEsv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
@@ -7719,6 +8056,9 @@ def : NEONInstAlias<"vcle${p}.u32 $Qd, $Qn, $Qm",
(VCGEuv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
def : NEONInstAlias<"vcle${p}.f32 $Qd, $Qn, $Qm",
(VCGEfq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+let Predicates = [HasNEON, HasFullFP16] in
+def : NEONInstAlias<"vcle${p}.f16 $Qd, $Qn, $Qm",
+ (VCGEhq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
// VCLT (register) is an assembler alias for VCGT w/ the operands reversed.
// D-register versions.
@@ -7736,6 +8076,9 @@ def : NEONInstAlias<"vclt${p}.u32 $Dd, $Dn, $Dm",
(VCGTuv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
def : NEONInstAlias<"vclt${p}.f32 $Dd, $Dn, $Dm",
(VCGTfd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+let Predicates = [HasNEON, HasFullFP16] in
+def : NEONInstAlias<"vclt${p}.f16 $Dd, $Dn, $Dm",
+ (VCGThd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
// Q-register versions.
def : NEONInstAlias<"vclt${p}.s8 $Qd, $Qn, $Qm",
(VCGTsv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
@@ -7751,6 +8094,9 @@ def : NEONInstAlias<"vclt${p}.u32 $Qd, $Qn, $Qm",
(VCGTuv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
def : NEONInstAlias<"vclt${p}.f32 $Qd, $Qn, $Qm",
(VCGTfq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+let Predicates = [HasNEON, HasFullFP16] in
+def : NEONInstAlias<"vclt${p}.f16 $Qd, $Qn, $Qm",
+ (VCGThq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
// VSWP allows, but does not require, a type suffix.
defm : NEONDTAnyInstAlias<"vswp${p}", "$Vd, $Vm",
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 40414da3ca81..df6f24306354 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -591,6 +591,34 @@ def tTRAP : TI<(outs), (ins), IIC_Br,
// Load Store Instructions.
//
+// PC-relative loads need to be matched first, as constant pool accesses must
+// always be PC-relative. We do this using AddedComplexity, as the pattern is
+// simpler than the patterns of the other load instructions.
+let canFoldAsLoad = 1, isReMaterializable = 1, AddedComplexity = 10 in
+def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i,
+ "ldr", "\t$Rt, $addr",
+ [(set tGPR:$Rt, (load (ARMWrapper tconstpool:$addr)))]>,
+ T1Encoding<{0,1,0,0,1,?}> {
+ // A6.2 & A8.6.59
+ bits<3> Rt;
+ bits<8> addr;
+ let Inst{10-8} = Rt;
+ let Inst{7-0} = addr;
+}
+
+// SP-relative loads should be matched before standard immediate-offset loads,
+// as this avoids having to move SP to another register.
+let canFoldAsLoad = 1 in
+def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i,
+ "ldr", "\t$Rt, $addr",
+ [(set tGPR:$Rt, (load t_addrmode_sp:$addr))]>,
+ T1LdStSP<{1,?,?}> {
+ bits<3> Rt;
+ bits<8> addr;
+ let Inst{10-8} = Rt;
+ let Inst{7-0} = addr;
+}
+
// Loads: reg/reg and reg/imm5
let canFoldAsLoad = 1, isReMaterializable = 1 in
multiclass thumb_ld_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc,
@@ -598,16 +626,20 @@ multiclass thumb_ld_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc,
AddrMode am, InstrItinClass itin_r,
InstrItinClass itin_i, string asm,
PatFrag opnode> {
- def r : // reg/reg
- T1pILdStEncode<reg_opc,
- (outs tGPR:$Rt), (ins AddrMode_r:$addr),
- am, itin_r, asm, "\t$Rt, $addr",
- [(set tGPR:$Rt, (opnode AddrMode_r:$addr))]>;
+  // Immediate-offset loads should be matched before register-offset loads:
+  // when the offset is a constant it's simpler to first check whether it fits
+  // in the immediate offset field and fall back to register-offset if it doesn't.
def i : // reg/imm5
T1pILdStEncodeImm<imm_opc, 1 /* Load */,
(outs tGPR:$Rt), (ins AddrMode_i:$addr),
am, itin_i, asm, "\t$Rt, $addr",
[(set tGPR:$Rt, (opnode AddrMode_i:$addr))]>;
+ // Register-offset loads are matched last.
+ def r : // reg/reg
+ T1pILdStEncode<reg_opc,
+ (outs tGPR:$Rt), (ins AddrMode_r:$addr),
+ am, itin_r, asm, "\t$Rt, $addr",
+ [(set tGPR:$Rt, (opnode AddrMode_r:$addr))]>;
}
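
The comments in this hunk describe a selection preference rather than new functionality: for Thumb1 loads the immediate-offset form is tried first, and the register-offset form is only used when the constant offset does not fit the scaled 5-bit field. A minimal standalone C++ sketch of that decision (hypothetical names; the scaled 5-bit immediate reflects the Thumb1 LDR/LDRB/LDRH encoding constraint, not any LLVM API) might look like:

#include <cstdint>
#include <optional>

// Hypothetical, simplified operand model: either a scaled 5-bit immediate or
// an offset register, mirroring the Thumb1 reg/imm5 and reg/reg load forms.
struct Thumb1Addr {
  uint32_t BaseReg;
  std::optional<uint8_t> Imm5; // set when the immediate form is usable
  uint32_t OffsetReg;          // used only for the register-offset fallback
};

// Prefer the immediate form when the constant byte offset fits the 5-bit
// field scaled by the access size (4 for LDR, 2 for LDRH, 1 for LDRB);
// otherwise fall back to materializing the offset in a register.
Thumb1Addr selectThumb1Addr(uint32_t BaseReg, int64_t ByteOffset,
                            unsigned AccessBytes, uint32_t ScratchReg) {
  if (ByteOffset >= 0 && ByteOffset % AccessBytes == 0 &&
      ByteOffset / AccessBytes < 32)
    return {BaseReg, static_cast<uint8_t>(ByteOffset / AccessBytes), 0};
  return {BaseReg, std::nullopt, ScratchReg};
}

This is only an illustration of the ordering rationale; the actual preference in the patch is expressed purely by the order of the `def i` and `def r` pattern definitions.
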
// Stores: reg/reg and reg/imm5
multiclass thumb_st_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc,
@@ -615,32 +647,32 @@ multiclass thumb_st_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc,
AddrMode am, InstrItinClass itin_r,
InstrItinClass itin_i, string asm,
PatFrag opnode> {
- def r : // reg/reg
- T1pILdStEncode<reg_opc,
- (outs), (ins tGPR:$Rt, AddrMode_r:$addr),
- am, itin_r, asm, "\t$Rt, $addr",
- [(opnode tGPR:$Rt, AddrMode_r:$addr)]>;
def i : // reg/imm5
T1pILdStEncodeImm<imm_opc, 0 /* Store */,
(outs), (ins tGPR:$Rt, AddrMode_i:$addr),
am, itin_i, asm, "\t$Rt, $addr",
[(opnode tGPR:$Rt, AddrMode_i:$addr)]>;
+ def r : // reg/reg
+ T1pILdStEncode<reg_opc,
+ (outs), (ins tGPR:$Rt, AddrMode_r:$addr),
+ am, itin_r, asm, "\t$Rt, $addr",
+ [(opnode tGPR:$Rt, AddrMode_r:$addr)]>;
}
// A8.6.57 & A8.6.60
-defm tLDR : thumb_ld_rr_ri_enc<0b100, 0b0110, t_addrmode_rrs4,
+defm tLDR : thumb_ld_rr_ri_enc<0b100, 0b0110, t_addrmode_rr,
t_addrmode_is4, AddrModeT1_4,
IIC_iLoad_r, IIC_iLoad_i, "ldr",
UnOpFrag<(load node:$Src)>>;
// A8.6.64 & A8.6.61
-defm tLDRB : thumb_ld_rr_ri_enc<0b110, 0b0111, t_addrmode_rrs1,
+defm tLDRB : thumb_ld_rr_ri_enc<0b110, 0b0111, t_addrmode_rr,
t_addrmode_is1, AddrModeT1_1,
IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrb",
UnOpFrag<(zextloadi8 node:$Src)>>;
// A8.6.76 & A8.6.73
-defm tLDRH : thumb_ld_rr_ri_enc<0b101, 0b1000, t_addrmode_rrs2,
+defm tLDRH : thumb_ld_rr_ri_enc<0b101, 0b1000, t_addrmode_rr,
t_addrmode_is2, AddrModeT1_2,
IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrh",
UnOpFrag<(zextloadi16 node:$Src)>>;
@@ -659,58 +691,36 @@ def tLDRSH : // A8.6.84
"ldrsh", "\t$Rt, $addr",
[(set tGPR:$Rt, (sextloadi16 t_addrmode_rr:$addr))]>;
-let canFoldAsLoad = 1 in
-def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i,
- "ldr", "\t$Rt, $addr",
- [(set tGPR:$Rt, (load t_addrmode_sp:$addr))]>,
- T1LdStSP<{1,?,?}> {
- bits<3> Rt;
- bits<8> addr;
- let Inst{10-8} = Rt;
- let Inst{7-0} = addr;
-}
-let canFoldAsLoad = 1, isReMaterializable = 1 in
-def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i,
- "ldr", "\t$Rt, $addr",
- [(set tGPR:$Rt, (load (ARMWrapper tconstpool:$addr)))]>,
- T1Encoding<{0,1,0,0,1,?}> {
- // A6.2 & A8.6.59
+def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i,
+ "str", "\t$Rt, $addr",
+ [(store tGPR:$Rt, t_addrmode_sp:$addr)]>,
+ T1LdStSP<{0,?,?}> {
bits<3> Rt;
bits<8> addr;
let Inst{10-8} = Rt;
- let Inst{7-0} = addr;
+ let Inst{7-0} = addr;
}
// A8.6.194 & A8.6.192
-defm tSTR : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rrs4,
+defm tSTR : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rr,
t_addrmode_is4, AddrModeT1_4,
IIC_iStore_r, IIC_iStore_i, "str",
BinOpFrag<(store node:$LHS, node:$RHS)>>;
// A8.6.197 & A8.6.195
-defm tSTRB : thumb_st_rr_ri_enc<0b010, 0b0111, t_addrmode_rrs1,
+defm tSTRB : thumb_st_rr_ri_enc<0b010, 0b0111, t_addrmode_rr,
t_addrmode_is1, AddrModeT1_1,
IIC_iStore_bh_r, IIC_iStore_bh_i, "strb",
BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
// A8.6.207 & A8.6.205
-defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rrs2,
+defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rr,
t_addrmode_is2, AddrModeT1_2,
IIC_iStore_bh_r, IIC_iStore_bh_i, "strh",
BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
-def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i,
- "str", "\t$Rt, $addr",
- [(store tGPR:$Rt, t_addrmode_sp:$addr)]>,
- T1LdStSP<{0,?,?}> {
- bits<3> Rt;
- bits<8> addr;
- let Inst{10-8} = Rt;
- let Inst{7-0} = addr;
-}
-
//===----------------------------------------------------------------------===//
// Load / store multiple Instructions.
//
@@ -730,6 +740,7 @@ def tLDMIA : T1I<(outs), (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops),
// Writeback version is just a pseudo, as there's no encoding difference.
// Writeback happens iff the base register is not in the destination register
// list.
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
def tLDMIA_UPD :
InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo, GenericDomain,
"$Rn = $wb", IIC_iLoad_mu>,
@@ -1328,16 +1339,16 @@ def : T1Pat<(subc tGPR:$lhs, tGPR:$rhs),
(tSUBrr tGPR:$lhs, tGPR:$rhs)>;
// Bswap 16 with load/store
-def : T1Pat<(srl (bswap (extloadi16 t_addrmode_rrs2:$addr)), (i32 16)),
- (tREV16 (tLDRHr t_addrmode_rrs2:$addr))>;
def : T1Pat<(srl (bswap (extloadi16 t_addrmode_is2:$addr)), (i32 16)),
(tREV16 (tLDRHi t_addrmode_is2:$addr))>;
-def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)),
- t_addrmode_rrs2:$addr),
- (tSTRHr (tREV16 tGPR:$Rn), t_addrmode_rrs2:$addr)>;
+def : T1Pat<(srl (bswap (extloadi16 t_addrmode_rr:$addr)), (i32 16)),
+ (tREV16 (tLDRHr t_addrmode_rr:$addr))>;
def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)),
t_addrmode_is2:$addr),
(tSTRHi(tREV16 tGPR:$Rn), t_addrmode_is2:$addr)>;
+def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)),
+ t_addrmode_rr:$addr),
+ (tSTRHr (tREV16 tGPR:$Rn), t_addrmode_rr:$addr)>;
// ConstantPool
def : T1Pat<(ARMWrapper tconstpool :$dst), (tLEApcrel tconstpool :$dst)>;
@@ -1372,10 +1383,10 @@ def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr GPR:$dst)>,
Requires<[IsThumb, HasV5T]>;
// zextload i1 -> zextload i8
-def : T1Pat<(zextloadi1 t_addrmode_rrs1:$addr),
- (tLDRBr t_addrmode_rrs1:$addr)>;
def : T1Pat<(zextloadi1 t_addrmode_is1:$addr),
(tLDRBi t_addrmode_is1:$addr)>;
+def : T1Pat<(zextloadi1 t_addrmode_rr:$addr),
+ (tLDRBr t_addrmode_rr:$addr)>;
// extload from the stack -> word load from the stack, as it avoids having to
// materialize the base in a separate register. This only works when a word
@@ -1389,61 +1400,61 @@ def : T1Pat<(extloadi16 t_addrmode_sp:$addr), (tLDRspi t_addrmode_sp:$addr)>,
Requires<[IsThumb, IsThumb1Only, IsLE]>;
// extload -> zextload
-def : T1Pat<(extloadi1 t_addrmode_rrs1:$addr), (tLDRBr t_addrmode_rrs1:$addr)>;
-def : T1Pat<(extloadi1 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>;
-def : T1Pat<(extloadi8 t_addrmode_rrs1:$addr), (tLDRBr t_addrmode_rrs1:$addr)>;
-def : T1Pat<(extloadi8 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>;
-def : T1Pat<(extloadi16 t_addrmode_rrs2:$addr), (tLDRHr t_addrmode_rrs2:$addr)>;
-def : T1Pat<(extloadi16 t_addrmode_is2:$addr), (tLDRHi t_addrmode_is2:$addr)>;
+def : T1Pat<(extloadi1 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>;
+def : T1Pat<(extloadi1 t_addrmode_rr:$addr), (tLDRBr t_addrmode_rr:$addr)>;
+def : T1Pat<(extloadi8 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>;
+def : T1Pat<(extloadi8 t_addrmode_rr:$addr), (tLDRBr t_addrmode_rr:$addr)>;
+def : T1Pat<(extloadi16 t_addrmode_is2:$addr), (tLDRHi t_addrmode_is2:$addr)>;
+def : T1Pat<(extloadi16 t_addrmode_rr:$addr), (tLDRHr t_addrmode_rr:$addr)>;
// If it's impossible to use [r,r] address mode for sextload, select to
// ldr{b|h} + sxt{b|h} instead.
def : T1Pat<(sextloadi8 t_addrmode_is1:$addr),
(tSXTB (tLDRBi t_addrmode_is1:$addr))>,
Requires<[IsThumb, IsThumb1Only, HasV6]>;
-def : T1Pat<(sextloadi8 t_addrmode_rrs1:$addr),
- (tSXTB (tLDRBr t_addrmode_rrs1:$addr))>,
+def : T1Pat<(sextloadi8 t_addrmode_rr:$addr),
+ (tSXTB (tLDRBr t_addrmode_rr:$addr))>,
Requires<[IsThumb, IsThumb1Only, HasV6]>;
def : T1Pat<(sextloadi16 t_addrmode_is2:$addr),
(tSXTH (tLDRHi t_addrmode_is2:$addr))>,
Requires<[IsThumb, IsThumb1Only, HasV6]>;
-def : T1Pat<(sextloadi16 t_addrmode_rrs2:$addr),
- (tSXTH (tLDRHr t_addrmode_rrs2:$addr))>,
+def : T1Pat<(sextloadi16 t_addrmode_rr:$addr),
+ (tSXTH (tLDRHr t_addrmode_rr:$addr))>,
Requires<[IsThumb, IsThumb1Only, HasV6]>;
-def : T1Pat<(sextloadi8 t_addrmode_rrs1:$addr),
- (tASRri (tLSLri (tLDRBr t_addrmode_rrs1:$addr), 24), 24)>;
def : T1Pat<(sextloadi8 t_addrmode_is1:$addr),
(tASRri (tLSLri (tLDRBi t_addrmode_is1:$addr), 24), 24)>;
-def : T1Pat<(sextloadi16 t_addrmode_rrs2:$addr),
- (tASRri (tLSLri (tLDRHr t_addrmode_rrs2:$addr), 16), 16)>;
+def : T1Pat<(sextloadi8 t_addrmode_rr:$addr),
+ (tASRri (tLSLri (tLDRBr t_addrmode_rr:$addr), 24), 24)>;
def : T1Pat<(sextloadi16 t_addrmode_is2:$addr),
(tASRri (tLSLri (tLDRHi t_addrmode_is2:$addr), 16), 16)>;
+def : T1Pat<(sextloadi16 t_addrmode_rr:$addr),
+ (tASRri (tLSLri (tLDRHr t_addrmode_rr:$addr), 16), 16)>;
def : T1Pat<(atomic_load_8 t_addrmode_is1:$src),
(tLDRBi t_addrmode_is1:$src)>;
-def : T1Pat<(atomic_load_8 t_addrmode_rrs1:$src),
- (tLDRBr t_addrmode_rrs1:$src)>;
+def : T1Pat<(atomic_load_8 t_addrmode_rr:$src),
+ (tLDRBr t_addrmode_rr:$src)>;
def : T1Pat<(atomic_load_16 t_addrmode_is2:$src),
(tLDRHi t_addrmode_is2:$src)>;
-def : T1Pat<(atomic_load_16 t_addrmode_rrs2:$src),
- (tLDRHr t_addrmode_rrs2:$src)>;
+def : T1Pat<(atomic_load_16 t_addrmode_rr:$src),
+ (tLDRHr t_addrmode_rr:$src)>;
def : T1Pat<(atomic_load_32 t_addrmode_is4:$src),
(tLDRi t_addrmode_is4:$src)>;
-def : T1Pat<(atomic_load_32 t_addrmode_rrs4:$src),
- (tLDRr t_addrmode_rrs4:$src)>;
+def : T1Pat<(atomic_load_32 t_addrmode_rr:$src),
+ (tLDRr t_addrmode_rr:$src)>;
def : T1Pat<(atomic_store_8 t_addrmode_is1:$ptr, tGPR:$val),
(tSTRBi tGPR:$val, t_addrmode_is1:$ptr)>;
-def : T1Pat<(atomic_store_8 t_addrmode_rrs1:$ptr, tGPR:$val),
- (tSTRBr tGPR:$val, t_addrmode_rrs1:$ptr)>;
+def : T1Pat<(atomic_store_8 t_addrmode_rr:$ptr, tGPR:$val),
+ (tSTRBr tGPR:$val, t_addrmode_rr:$ptr)>;
def : T1Pat<(atomic_store_16 t_addrmode_is2:$ptr, tGPR:$val),
(tSTRHi tGPR:$val, t_addrmode_is2:$ptr)>;
-def : T1Pat<(atomic_store_16 t_addrmode_rrs2:$ptr, tGPR:$val),
- (tSTRHr tGPR:$val, t_addrmode_rrs2:$ptr)>;
+def : T1Pat<(atomic_store_16 t_addrmode_rr:$ptr, tGPR:$val),
+ (tSTRHr tGPR:$val, t_addrmode_rr:$ptr)>;
def : T1Pat<(atomic_store_32 t_addrmode_is4:$ptr, tGPR:$val),
(tSTRi tGPR:$val, t_addrmode_is4:$ptr)>;
-def : T1Pat<(atomic_store_32 t_addrmode_rrs4:$ptr, tGPR:$val),
- (tSTRr tGPR:$val, t_addrmode_rrs4:$ptr)>;
+def : T1Pat<(atomic_store_32 t_addrmode_rr:$ptr, tGPR:$val),
+ (tSTRr tGPR:$val, t_addrmode_rr:$ptr)>;
// Large immediate handling.
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index aba8a7b10fd9..d460d33fa0a3 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -43,7 +43,7 @@ def t2_shift_imm : Operand<i32> {
// Shifted operands. No register controlled shifts for Thumb2.
// Note: We do not support rrx shifted operands yet.
def t2_so_reg : Operand<i32>, // reg imm
- ComplexPattern<i32, 2, "SelectT2ShifterOperandReg",
+ ComplexPattern<i32, 2, "SelectShiftImmShifterOperand",
[shl,srl,sra,rotr]> {
let EncoderMethod = "getT2SORegOpValue";
let PrintMethod = "printT2SOOperand";
@@ -1554,19 +1554,21 @@ def t2STRBT : T2IstT<0b00, "strbt", IIC_iStore_bh_i>;
def t2STRHT : T2IstT<0b01, "strht", IIC_iStore_bh_i>;
// ldrd / strd pre / post variants
-// For disassembly only.
+let mayLoad = 1 in
def t2LDRD_PRE : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
(ins t2addrmode_imm8s4_pre:$addr), IIC_iLoad_d_ru,
"ldrd", "\t$Rt, $Rt2, $addr!", "$addr.base = $wb", []> {
let DecoderMethod = "DecodeT2LDRDPreInstruction";
}
+let mayLoad = 1 in
def t2LDRD_POST : T2Ii8s4post<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
(ins addr_offset_none:$addr, t2am_imm8s4_offset:$imm),
IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, $addr$imm",
"$addr.base = $wb", []>;
+let mayStore = 1 in
def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs GPR:$wb),
(ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4_pre:$addr),
IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr!",
@@ -1574,6 +1576,7 @@ def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs GPR:$wb),
let DecoderMethod = "DecodeT2STRDPreInstruction";
}
+let mayStore = 1 in
def t2STRD_POST : T2Ii8s4post<0, 1, 0, (outs GPR:$wb),
(ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr,
t2am_imm8s4_offset:$imm),
@@ -2100,7 +2103,7 @@ def : T2Pat<(ARMadde rGPR:$src, imm0_65535_neg:$imm, CPSR),
def t2SEL : T2ThreeReg<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
NoItinerary, "sel", "\t$Rd, $Rn, $Rm", []>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasDSP]> {
let Inst{31-27} = 0b11111;
let Inst{26-24} = 0b010;
let Inst{23} = 0b1;
@@ -2117,7 +2120,7 @@ class T2I_pam<bits<3> op22_20, bits<4> op7_4, string opc,
dag iops = (ins rGPR:$Rn, rGPR:$Rm),
string asm = "\t$Rd, $Rn, $Rm">
: T2I<(outs rGPR:$Rd), iops, NoItinerary, opc, asm, pat>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasDSP]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0101;
let Inst{22-20} = op22_20;
@@ -2215,13 +2218,13 @@ class T2FourReg_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops,
def t2USAD8 : T2ThreeReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd),
(ins rGPR:$Rn, rGPR:$Rm),
NoItinerary, "usad8", "\t$Rd, $Rn, $Rm", []>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasDSP]> {
let Inst{15-12} = 0b1111;
}
def t2USADA8 : T2FourReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd),
(ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), NoItinerary,
"usada8", "\t$Rd, $Rn, $Rm, $Ra", []>,
- Requires<[IsThumb2, HasThumb2DSP]>;
+ Requires<[IsThumb2, HasDSP]>;
// Signed/Unsigned saturate.
class T2SatI<dag oops, dag iops, InstrItinClass itin,
@@ -2254,7 +2257,7 @@ def t2SSAT: T2SatI<
def t2SSAT16: T2SatI<
(outs rGPR:$Rd), (ins imm1_16:$sat_imm, rGPR:$Rn), NoItinerary,
"ssat16", "\t$Rd, $sat_imm, $Rn", []>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasDSP]> {
let Inst{31-27} = 0b11110;
let Inst{25-22} = 0b1100;
let Inst{20} = 0;
@@ -2278,7 +2281,7 @@ def t2USAT: T2SatI<
def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn),
NoItinerary,
"usat16", "\t$Rd, $sat_imm, $Rn", []>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasDSP]> {
let Inst{31-22} = 0b1111001110;
let Inst{20} = 0;
let Inst{15} = 0;
@@ -2288,8 +2291,8 @@ def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn),
let Inst{5-4} = 0b00;
}
-def : T2Pat<(int_arm_ssat GPR:$a, imm:$pos), (t2SSAT imm:$pos, GPR:$a, 0)>;
-def : T2Pat<(int_arm_usat GPR:$a, imm:$pos), (t2USAT imm:$pos, GPR:$a, 0)>;
+def : T2Pat<(int_arm_ssat GPR:$a, imm1_32:$pos), (t2SSAT imm1_32:$pos, GPR:$a, 0)>;
+def : T2Pat<(int_arm_usat GPR:$a, imm0_31:$pos), (t2USAT imm0_31:$pos, GPR:$a, 0)>;
//===----------------------------------------------------------------------===//
// Shift and rotate Instructions.
@@ -2605,7 +2608,7 @@ def t2UMAAL : T2MulLong<0b110, 0b0110,
(outs rGPR:$RdLo, rGPR:$RdHi),
(ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64,
"umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- Requires<[IsThumb2, HasThumb2DSP]>;
+ Requires<[IsThumb2, HasDSP]>;
} // hasSideEffects
// Rounding variants of the below included for disassembly only
@@ -2614,7 +2617,7 @@ def t2UMAAL : T2MulLong<0b110, 0b0110,
def t2SMMUL : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
"smmul", "\t$Rd, $Rn, $Rm",
[(set rGPR:$Rd, (mulhs rGPR:$Rn, rGPR:$Rm))]>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasDSP]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b101;
@@ -2624,7 +2627,7 @@ def t2SMMUL : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
def t2SMMULR : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
"smmulr", "\t$Rd, $Rn, $Rm", []>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasDSP]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b101;
@@ -2636,7 +2639,7 @@ def t2SMMLA : T2FourReg<
(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
"smmla", "\t$Rd, $Rn, $Rm, $Ra",
[(set rGPR:$Rd, (add (mulhs rGPR:$Rm, rGPR:$Rn), rGPR:$Ra))]>,
- Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
+ Requires<[IsThumb2, HasDSP, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b101;
@@ -2646,7 +2649,7 @@ def t2SMMLA : T2FourReg<
def t2SMMLAR: T2FourReg<
(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
"smmlar", "\t$Rd, $Rn, $Rm, $Ra", []>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasDSP]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b101;
@@ -2657,7 +2660,7 @@ def t2SMMLS: T2FourReg<
(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
"smmls", "\t$Rd, $Rn, $Rm, $Ra",
[(set rGPR:$Rd, (sub rGPR:$Ra, (mulhs rGPR:$Rn, rGPR:$Rm)))]>,
- Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
+ Requires<[IsThumb2, HasDSP, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b110;
@@ -2667,7 +2670,7 @@ def t2SMMLS: T2FourReg<
def t2SMMLSR:T2FourReg<
(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
"smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasDSP]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b110;
@@ -2679,7 +2682,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> {
!strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm",
[(set rGPR:$Rd, (opnode (sext_inreg rGPR:$Rn, i16),
(sext_inreg rGPR:$Rm, i16)))]>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasDSP]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b001;
@@ -2692,7 +2695,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> {
!strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm",
[(set rGPR:$Rd, (opnode (sext_inreg rGPR:$Rn, i16),
(sra rGPR:$Rm, (i32 16))))]>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasDSP]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b001;
@@ -2705,7 +2708,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> {
!strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm",
[(set rGPR:$Rd, (opnode (sra rGPR:$Rn, (i32 16)),
(sext_inreg rGPR:$Rm, i16)))]>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasDSP]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b001;
@@ -2718,7 +2721,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> {
!strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm",
[(set rGPR:$Rd, (opnode (sra rGPR:$Rn, (i32 16)),
(sra rGPR:$Rm, (i32 16))))]>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasDSP]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b001;
@@ -2730,7 +2733,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> {
def WB : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16,
!strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm",
[]>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasDSP]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b011;
@@ -2742,7 +2745,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> {
def WT : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16,
!strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm",
[]>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasDSP]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b011;
@@ -2760,7 +2763,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> {
[(set rGPR:$Rd, (add rGPR:$Ra,
(opnode (sext_inreg rGPR:$Rn, i16),
(sext_inreg rGPR:$Rm, i16))))]>,
- Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
+ Requires<[IsThumb2, HasDSP, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b001;
@@ -2773,7 +2776,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> {
!strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra",
[(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sext_inreg rGPR:$Rn, i16),
(sra rGPR:$Rm, (i32 16)))))]>,
- Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
+ Requires<[IsThumb2, HasDSP, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b001;
@@ -2786,7 +2789,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> {
!strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra",
[(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sra rGPR:$Rn, (i32 16)),
(sext_inreg rGPR:$Rm, i16))))]>,
- Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
+ Requires<[IsThumb2, HasDSP, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b001;
@@ -2799,7 +2802,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> {
!strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra",
[(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sra rGPR:$Rn, (i32 16)),
(sra rGPR:$Rm, (i32 16)))))]>,
- Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
+ Requires<[IsThumb2, HasDSP, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b001;
@@ -2811,7 +2814,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> {
(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
!strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra",
[]>,
- Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
+ Requires<[IsThumb2, HasDSP, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b011;
@@ -2823,7 +2826,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> {
(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
!strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra",
[]>,
- Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
+ Requires<[IsThumb2, HasDSP, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b011;
@@ -2839,79 +2842,79 @@ defm t2SMLA : T2I_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
def t2SMLALBB : T2FourReg_mac<1, 0b100, 0b1000, (outs rGPR:$Ra,rGPR:$Rd),
(ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlalbb", "\t$Ra, $Rd, $Rn, $Rm",
[/* For disassembly only; pattern left blank */]>,
- Requires<[IsThumb2, HasThumb2DSP]>;
+ Requires<[IsThumb2, HasDSP]>;
def t2SMLALBT : T2FourReg_mac<1, 0b100, 0b1001, (outs rGPR:$Ra,rGPR:$Rd),
(ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlalbt", "\t$Ra, $Rd, $Rn, $Rm",
[/* For disassembly only; pattern left blank */]>,
- Requires<[IsThumb2, HasThumb2DSP]>;
+ Requires<[IsThumb2, HasDSP]>;
def t2SMLALTB : T2FourReg_mac<1, 0b100, 0b1010, (outs rGPR:$Ra,rGPR:$Rd),
(ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaltb", "\t$Ra, $Rd, $Rn, $Rm",
[/* For disassembly only; pattern left blank */]>,
- Requires<[IsThumb2, HasThumb2DSP]>;
+ Requires<[IsThumb2, HasDSP]>;
def t2SMLALTT : T2FourReg_mac<1, 0b100, 0b1011, (outs rGPR:$Ra,rGPR:$Rd),
(ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaltt", "\t$Ra, $Rd, $Rn, $Rm",
[/* For disassembly only; pattern left blank */]>,
- Requires<[IsThumb2, HasThumb2DSP]>;
+ Requires<[IsThumb2, HasDSP]>;
// Dual halfword multiple: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD
def t2SMUAD: T2ThreeReg_mac<
0, 0b010, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm),
IIC_iMAC32, "smuad", "\t$Rd, $Rn, $Rm", []>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasDSP]> {
let Inst{15-12} = 0b1111;
}
def t2SMUADX:T2ThreeReg_mac<
0, 0b010, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm),
IIC_iMAC32, "smuadx", "\t$Rd, $Rn, $Rm", []>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasDSP]> {
let Inst{15-12} = 0b1111;
}
def t2SMUSD: T2ThreeReg_mac<
0, 0b100, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm),
IIC_iMAC32, "smusd", "\t$Rd, $Rn, $Rm", []>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasDSP]> {
let Inst{15-12} = 0b1111;
}
def t2SMUSDX:T2ThreeReg_mac<
0, 0b100, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm),
IIC_iMAC32, "smusdx", "\t$Rd, $Rn, $Rm", []>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasDSP]> {
let Inst{15-12} = 0b1111;
}
def t2SMLAD : T2FourReg_mac<
0, 0b010, 0b0000, (outs rGPR:$Rd),
(ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlad",
"\t$Rd, $Rn, $Rm, $Ra", []>,
- Requires<[IsThumb2, HasThumb2DSP]>;
+ Requires<[IsThumb2, HasDSP]>;
def t2SMLADX : T2FourReg_mac<
0, 0b010, 0b0001, (outs rGPR:$Rd),
(ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smladx",
"\t$Rd, $Rn, $Rm, $Ra", []>,
- Requires<[IsThumb2, HasThumb2DSP]>;
+ Requires<[IsThumb2, HasDSP]>;
def t2SMLSD : T2FourReg_mac<0, 0b100, 0b0000, (outs rGPR:$Rd),
(ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlsd",
"\t$Rd, $Rn, $Rm, $Ra", []>,
- Requires<[IsThumb2, HasThumb2DSP]>;
+ Requires<[IsThumb2, HasDSP]>;
def t2SMLSDX : T2FourReg_mac<0, 0b100, 0b0001, (outs rGPR:$Rd),
(ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlsdx",
"\t$Rd, $Rn, $Rm, $Ra", []>,
- Requires<[IsThumb2, HasThumb2DSP]>;
+ Requires<[IsThumb2, HasDSP]>;
def t2SMLALD : T2FourReg_mac<1, 0b100, 0b1100, (outs rGPR:$Ra,rGPR:$Rd),
(ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64, "smlald",
"\t$Ra, $Rd, $Rn, $Rm", []>,
- Requires<[IsThumb2, HasThumb2DSP]>;
+ Requires<[IsThumb2, HasDSP]>;
def t2SMLALDX : T2FourReg_mac<1, 0b100, 0b1101, (outs rGPR:$Ra,rGPR:$Rd),
(ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaldx",
"\t$Ra, $Rd, $Rn, $Rm", []>,
- Requires<[IsThumb2, HasThumb2DSP]>;
+ Requires<[IsThumb2, HasDSP]>;
def t2SMLSLD : T2FourReg_mac<1, 0b101, 0b1100, (outs rGPR:$Ra,rGPR:$Rd),
(ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlsld",
"\t$Ra, $Rd, $Rn, $Rm", []>,
- Requires<[IsThumb2, HasThumb2DSP]>;
+ Requires<[IsThumb2, HasDSP]>;
def t2SMLSLDX : T2FourReg_mac<1, 0b101, 0b1101, (outs rGPR:$Ra,rGPR:$Rd),
(ins rGPR:$Rm,rGPR:$Rn), IIC_iMAC64, "smlsldx",
"\t$Ra, $Rd, $Rn, $Rm", []>,
- Requires<[IsThumb2, HasThumb2DSP]>;
+ Requires<[IsThumb2, HasDSP]>;
//===----------------------------------------------------------------------===//
// Division Instructions.
@@ -2961,7 +2964,7 @@ def t2CLZ : T2I_misc<0b11, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
def t2RBIT : T2I_misc<0b01, 0b10, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
"rbit", "\t$Rd, $Rm",
- [(set rGPR:$Rd, (ARMrbit rGPR:$Rm))]>,
+ [(set rGPR:$Rd, (bitreverse rGPR:$Rm))]>,
Sched<[WriteALU]>;
def t2REV : T2I_misc<0b01, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index e83f8c850632..050cd1a445ad 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -20,7 +20,6 @@ def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMCmp, [SDNPOutGlue]>;
def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutGlue]>;
def arm_fmdrr : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>;
-
//===----------------------------------------------------------------------===//
// Operand Definitions.
//
@@ -93,7 +92,7 @@ def VLDRD : ADI5<0b1101, 0b01, (outs DPR:$Dd), (ins addrmode5:$addr),
def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5:$addr),
IIC_fpLoad32, "vldr", "\t$Sd, $addr",
- [(set SPR:$Sd, (load addrmode5:$addr))]> {
+ [(set SPR:$Sd, (alignedload32 addrmode5:$addr))]> {
// Some single precision VFP instructions may be executed on both NEON and VFP
// pipelines.
let D = VFPNeonDomain;
@@ -107,7 +106,7 @@ def VSTRD : ADI5<0b1101, 0b00, (outs), (ins DPR:$Dd, addrmode5:$addr),
def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5:$addr),
IIC_fpStore32, "vstr", "\t$Sd, $addr",
- [(store SPR:$Sd, addrmode5:$addr)]> {
+ [(alignedstore32 SPR:$Sd, addrmode5:$addr)]> {
// Some single precision VFP instructions may be executed on both NEON and VFP
// pipelines.
let D = VFPNeonDomain;
@@ -393,8 +392,8 @@ multiclass vmaxmin_inst<string op, bit opc, SDNode SD> {
}
}
-defm VMAXNM : vmaxmin_inst<"vmaxnm", 0, ARMvmaxnm>;
-defm VMINNM : vmaxmin_inst<"vminnm", 1, ARMvminnm>;
+defm VMAXNM : vmaxmin_inst<"vmaxnm", 0, fmaxnum>;
+defm VMINNM : vmaxmin_inst<"vminnm", 1, fminnum>;
// Match reassociated forms only if not sign dependent rounding.
def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)),
@@ -541,19 +540,23 @@ def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
// FIXME: Verify encoding after integrated assembler is working.
def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm",
- [/* For disassembly only; pattern left blank */]>;
+ [/* For disassembly only; pattern left blank */]>,
+ Requires<[HasFP16]>;
def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm",
- [/* For disassembly only; pattern left blank */]>;
+ [/* For disassembly only; pattern left blank */]>,
+ Requires<[HasFP16]>;
def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm",
- [/* For disassembly only; pattern left blank */]>;
+ [/* For disassembly only; pattern left blank */]>,
+ Requires<[HasFP16]>;
def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm",
- [/* For disassembly only; pattern left blank */]>;
+ [/* For disassembly only; pattern left blank */]>,
+ Requires<[HasFP16]>;
def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0,
(outs DPR:$Dd), (ins SPR:$Sm),
@@ -922,6 +925,22 @@ def VMOVDRR : AVConv5I<0b11000100, 0b1011,
let isRegSequence = 1;
}
+// Hoist an fabs or a fneg of a value coming from integer registers
+// and do the fabs/fneg on the integer value. This is never a loss
+// and could enable the conversion to float to be removed completely.
+def : Pat<(fabs (arm_fmdrr GPR:$Rl, GPR:$Rh)),
+ (VMOVDRR GPR:$Rl, (BFC GPR:$Rh, (i32 0x7FFFFFFF)))>,
+ Requires<[IsARM]>;
+def : Pat<(fabs (arm_fmdrr GPR:$Rl, GPR:$Rh)),
+ (VMOVDRR GPR:$Rl, (t2BFC GPR:$Rh, (i32 0x7FFFFFFF)))>,
+ Requires<[IsThumb2]>;
+def : Pat<(fneg (arm_fmdrr GPR:$Rl, GPR:$Rh)),
+ (VMOVDRR GPR:$Rl, (EORri GPR:$Rh, (i32 0x80000000)))>,
+ Requires<[IsARM]>;
+def : Pat<(fneg (arm_fmdrr GPR:$Rl, GPR:$Rh)),
+ (VMOVDRR GPR:$Rl, (t2EORri GPR:$Rh, (i32 0x80000000)))>,
+ Requires<[IsThumb2]>;
+
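
A quick, self-contained C++ check of the IEEE-754 fact these patterns rely on (this is an illustration, not part of the patch): for a double split across two 32-bit registers, fneg only flips bit 31 of the high word and fabs only clears it, so the operation can be done on the integer half before the VMOVDRR.

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

// Reassemble a double from the two 32-bit halves of its bit pattern.
static double fromWords(uint32_t Lo, uint32_t Hi) {
  uint64_t Bits = (static_cast<uint64_t>(Hi) << 32) | Lo;
  double D;
  std::memcpy(&D, &Bits, sizeof D);
  return D;
}

int main() {
  uint32_t Lo = 0x54442d18, Hi = 0x400921fb; // bit pattern of pi
  double X = fromWords(Lo, Hi);
  // fneg is just an EOR of the sign bit in the high word...
  assert(fromWords(Lo, Hi ^ 0x80000000u) == -X);
  // ...and fabs is just clearing that bit, which is what the BFC pattern does.
  assert(fromWords(Lo, Hi & 0x7fffffffu) == std::fabs(X));
  return 0;
}
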
let hasSideEffects = 0 in
def VMOVSRR : AVConv5I<0b11000100, 0b1010,
(outs SPR:$dst1, SPR:$dst2), (ins GPR:$src1, GPR:$src2),
@@ -1003,7 +1022,7 @@ let Predicates=[HasVFP2, HasDPVFP] in {
def : VFPPat<(f64 (sint_to_fp GPR:$a)),
(VSITOD (COPY_TO_REGCLASS GPR:$a, SPR))>;
- def : VFPPat<(f64 (sint_to_fp (i32 (load addrmode5:$a)))),
+ def : VFPPat<(f64 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))),
(VSITOD (VLDRS addrmode5:$a))>;
}
@@ -1021,7 +1040,7 @@ def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
def : VFPNoNEONPat<(f32 (sint_to_fp GPR:$a)),
(VSITOS (COPY_TO_REGCLASS GPR:$a, SPR))>;
-def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (load addrmode5:$a)))),
+def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))),
(VSITOS (VLDRS addrmode5:$a))>;
def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
@@ -1035,7 +1054,7 @@ let Predicates=[HasVFP2, HasDPVFP] in {
def : VFPPat<(f64 (uint_to_fp GPR:$a)),
(VUITOD (COPY_TO_REGCLASS GPR:$a, SPR))>;
- def : VFPPat<(f64 (uint_to_fp (i32 (load addrmode5:$a)))),
+ def : VFPPat<(f64 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))),
(VUITOD (VLDRS addrmode5:$a))>;
}
@@ -1053,7 +1072,7 @@ def VUITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
def : VFPNoNEONPat<(f32 (uint_to_fp GPR:$a)),
(VUITOS (COPY_TO_REGCLASS GPR:$a, SPR))>;
-def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (load addrmode5:$a)))),
+def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))),
(VUITOS (VLDRS addrmode5:$a))>;
// FP -> Int:
@@ -1106,7 +1125,7 @@ let Predicates=[HasVFP2, HasDPVFP] in {
def : VFPPat<(i32 (fp_to_sint (f64 DPR:$a))),
(COPY_TO_REGCLASS (VTOSIZD DPR:$a), GPR)>;
- def : VFPPat<(store (i32 (fp_to_sint (f64 DPR:$a))), addrmode5:$ptr),
+ def : VFPPat<(alignedstore32 (i32 (fp_to_sint (f64 DPR:$a))), addrmode5:$ptr),
(VSTRS (VTOSIZD DPR:$a), addrmode5:$ptr)>;
}
@@ -1124,7 +1143,8 @@ def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010,
def : VFPNoNEONPat<(i32 (fp_to_sint SPR:$a)),
(COPY_TO_REGCLASS (VTOSIZS SPR:$a), GPR)>;
-def : VFPNoNEONPat<(store (i32 (fp_to_sint (f32 SPR:$a))), addrmode5:$ptr),
+def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_sint (f32 SPR:$a))),
+ addrmode5:$ptr),
(VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>;
def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
@@ -1138,7 +1158,7 @@ let Predicates=[HasVFP2, HasDPVFP] in {
def : VFPPat<(i32 (fp_to_uint (f64 DPR:$a))),
(COPY_TO_REGCLASS (VTOUIZD DPR:$a), GPR)>;
- def : VFPPat<(store (i32 (fp_to_uint (f64 DPR:$a))), addrmode5:$ptr),
+ def : VFPPat<(alignedstore32 (i32 (fp_to_uint (f64 DPR:$a))), addrmode5:$ptr),
(VSTRS (VTOUIZD DPR:$a), addrmode5:$ptr)>;
}
@@ -1156,7 +1176,8 @@ def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010,
def : VFPNoNEONPat<(i32 (fp_to_uint SPR:$a)),
(COPY_TO_REGCLASS (VTOUIZS SPR:$a), GPR)>;
-def : VFPNoNEONPat<(store (i32 (fp_to_uint (f32 SPR:$a))), addrmode5:$ptr),
+def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_uint (f32 SPR:$a))),
+ addrmode5:$ptr),
(VSTRS (VTOUIZS SPR:$a), addrmode5:$ptr)>;
// And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR.
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 265b86f75f1d..725b8383c961 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -60,17 +60,24 @@ STATISTIC(NumSTRD2STM, "Number of strd instructions turned back into stm");
STATISTIC(NumLDRD2LDR, "Number of ldrd instructions turned back into ldr's");
STATISTIC(NumSTRD2STR, "Number of strd instructions turned back into str's");
+namespace llvm {
+void initializeARMLoadStoreOptPass(PassRegistry &);
+}
+
+#define ARM_LOAD_STORE_OPT_NAME "ARM load / store optimization pass"
+
namespace {
  /// Post-register-allocation pass that combines load / store instructions to
/// form ldm / stm instructions.
struct ARMLoadStoreOpt : public MachineFunctionPass {
static char ID;
- ARMLoadStoreOpt() : MachineFunctionPass(ID) {}
+ ARMLoadStoreOpt() : MachineFunctionPass(ID) {
+ initializeARMLoadStoreOptPass(*PassRegistry::getPassRegistry());
+ }
const MachineFunction *MF;
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
- const MachineRegisterInfo *MRI;
const ARMSubtarget *STI;
const TargetLowering *TL;
ARMFunctionInfo *AFI;
@@ -84,7 +91,7 @@ namespace {
bool runOnMachineFunction(MachineFunction &Fn) override;
const char *getPassName() const override {
- return "ARM load / store optimization pass";
+ return ARM_LOAD_STORE_OPT_NAME;
}
private:
@@ -118,6 +125,7 @@ namespace {
};
SpecificBumpPtrAllocator<MergeCandidate> Allocator;
SmallVector<const MergeCandidate*,4> Candidates;
+ SmallVector<MachineInstr*,4> MergeBaseCandidates;
void moveLiveRegsBefore(const MachineBasicBlock &MBB,
MachineBasicBlock::const_iterator Before);
@@ -140,12 +148,16 @@ namespace {
MachineBasicBlock::iterator &MBBI);
bool MergeBaseUpdateLoadStore(MachineInstr *MI);
bool MergeBaseUpdateLSMultiple(MachineInstr *MI);
+ bool MergeBaseUpdateLSDouble(MachineInstr &MI) const;
bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
+ bool CombineMovBx(MachineBasicBlock &MBB);
};
char ARMLoadStoreOpt::ID = 0;
}
+INITIALIZE_PASS(ARMLoadStoreOpt, "arm-load-store-opt", ARM_LOAD_STORE_OPT_NAME, false, false)
+
static bool definesCPSR(const MachineInstr *MI) {
for (const auto &MO : MI->operands()) {
if (!MO.isReg())
@@ -619,9 +631,10 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(MachineBasicBlock &MBB,
unsigned NewBase;
if (isi32Load(Opcode)) {
- // If it is a load, then just use one of the destination register to
- // use as the new base.
+ // If it is a load, then just use one of the destination registers
+    // as the new base; the merged instruction no longer uses writeback in Thumb1.
NewBase = Regs[NumRegs-1].first;
+ Writeback = false;
} else {
// Find a free register that we can use as scratch register.
moveLiveRegsBefore(MBB, InsertBefore);
@@ -725,9 +738,12 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(MachineBasicBlock &MBB,
MachineInstrBuilder MIB;
if (Writeback) {
- if (Opcode == ARM::tLDMIA)
+      assert(isThumb1 && "expected Writeback only in Thumb1");
+ if (Opcode == ARM::tLDMIA) {
+ assert(!(ContainsReg(Regs, Base)) && "Thumb1 can't LDM ! with Base in Regs");
// Update tLDMIA with writeback if necessary.
Opcode = ARM::tLDMIA_UPD;
+ }
MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));
@@ -784,6 +800,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
SmallVector<std::pair<unsigned, bool>, 8> Regs;
SmallVector<unsigned, 4> ImpDefs;
DenseSet<unsigned> KilledRegs;
+ DenseSet<unsigned> UsedRegs;
// Determine list of registers and list of implicit super-register defs.
for (const MachineInstr *MI : Cand.Instrs) {
const MachineOperand &MO = getLoadStoreRegOp(*MI);
@@ -792,6 +809,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
if (IsKill)
KilledRegs.insert(Reg);
Regs.push_back(std::make_pair(Reg, IsKill));
+ UsedRegs.insert(Reg);
if (IsLoad) {
// Collect any implicit defs of super-registers, after merging we can't
@@ -881,7 +899,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
for (MachineOperand &MO : MI.uses()) {
if (!MO.isReg() || !MO.isKill())
continue;
- if (KilledRegs.count(MO.getReg()))
+ if (UsedRegs.count(MO.getReg()))
MO.setIsKill(false);
}
}
@@ -995,76 +1013,6 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
} while (SIndex < EIndex);
}
-static bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
- unsigned Bytes, unsigned Limit,
- ARMCC::CondCodes Pred, unsigned PredReg) {
- unsigned MyPredReg = 0;
- if (!MI)
- return false;
-
- bool CheckCPSRDef = false;
- switch (MI->getOpcode()) {
- default: return false;
- case ARM::tSUBi8:
- case ARM::t2SUBri:
- case ARM::SUBri:
- CheckCPSRDef = true;
- break;
- case ARM::tSUBspi:
- break;
- }
-
- // Make sure the offset fits in 8 bits.
- if (Bytes == 0 || (Limit && Bytes >= Limit))
- return false;
-
- unsigned Scale = (MI->getOpcode() == ARM::tSUBspi ||
- MI->getOpcode() == ARM::tSUBi8) ? 4 : 1; // FIXME
- if (!(MI->getOperand(0).getReg() == Base &&
- MI->getOperand(1).getReg() == Base &&
- (MI->getOperand(2).getImm() * Scale) == Bytes &&
- getInstrPredicate(MI, MyPredReg) == Pred &&
- MyPredReg == PredReg))
- return false;
-
- return CheckCPSRDef ? !definesCPSR(MI) : true;
-}
-
-static bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
- unsigned Bytes, unsigned Limit,
- ARMCC::CondCodes Pred, unsigned PredReg) {
- unsigned MyPredReg = 0;
- if (!MI)
- return false;
-
- bool CheckCPSRDef = false;
- switch (MI->getOpcode()) {
- default: return false;
- case ARM::tADDi8:
- case ARM::t2ADDri:
- case ARM::ADDri:
- CheckCPSRDef = true;
- break;
- case ARM::tADDspi:
- break;
- }
-
- if (Bytes == 0 || (Limit && Bytes >= Limit))
- // Make sure the offset fits in 8 bits.
- return false;
-
- unsigned Scale = (MI->getOpcode() == ARM::tADDspi ||
- MI->getOpcode() == ARM::tADDi8) ? 4 : 1; // FIXME
- if (!(MI->getOperand(0).getReg() == Base &&
- MI->getOperand(1).getReg() == Base &&
- (MI->getOperand(2).getImm() * Scale) == Bytes &&
- getInstrPredicate(MI, MyPredReg) == Pred &&
- MyPredReg == PredReg))
- return false;
-
- return CheckCPSRDef ? !definesCPSR(MI) : true;
-}
-
static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
ARM_AM::AMSubMode Mode) {
switch (Opc) {
@@ -1132,6 +1080,75 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
}
}
+/// Check if the given instruction increments or decrements a register and
+/// return the amount it is incremented/decremented. Returns 0 if the CPSR flags
+/// generated by the instruction are possibly read as well.
+static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg,
+ ARMCC::CondCodes Pred, unsigned PredReg) {
+ bool CheckCPSRDef;
+ int Scale;
+ switch (MI.getOpcode()) {
+ case ARM::tADDi8: Scale = 4; CheckCPSRDef = true; break;
+ case ARM::tSUBi8: Scale = -4; CheckCPSRDef = true; break;
+ case ARM::t2SUBri:
+ case ARM::SUBri: Scale = -1; CheckCPSRDef = true; break;
+ case ARM::t2ADDri:
+ case ARM::ADDri: Scale = 1; CheckCPSRDef = true; break;
+ case ARM::tADDspi: Scale = 4; CheckCPSRDef = false; break;
+ case ARM::tSUBspi: Scale = -4; CheckCPSRDef = false; break;
+ default: return 0;
+ }
+
+ unsigned MIPredReg;
+ if (MI.getOperand(0).getReg() != Reg ||
+ MI.getOperand(1).getReg() != Reg ||
+ getInstrPredicate(&MI, MIPredReg) != Pred ||
+ MIPredReg != PredReg)
+ return 0;
+
+ if (CheckCPSRDef && definesCPSR(&MI))
+ return 0;
+ return MI.getOperand(2).getImm() * Scale;
+}
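+
+A standalone model of the return-value convention the new helper introduces may help when reading the callers further down in this patch (hypothetical stand-in types, not the LLVM MachineInstr API): a positive result is the number of bytes the register grows by, a negative result is a decrement, and 0 means "not a simple matching inc/dec".
+
+#include <cstdint>
+
+// Hypothetical stand-ins; only the fields the convention needs are modelled.
+enum class Opc { tADDspi, tSUBspi, ADDri, SUBri, Other };
+struct IncDecMI { Opc Opcode; unsigned DstReg, SrcReg; int Imm; };
+
+// The +/-4 scale mirrors the Thumb1 SP forms, whose immediates count words.
+int incDecDelta(const IncDecMI &MI, unsigned Reg) {
+  int Scale;
+  switch (MI.Opcode) {
+  case Opc::tADDspi: Scale = 4;  break;
+  case Opc::tSUBspi: Scale = -4; break;
+  case Opc::ADDri:   Scale = 1;  break;
+  case Opc::SUBri:   Scale = -1; break;
+  default:           return 0;
+  }
+  if (MI.DstReg != Reg || MI.SrcReg != Reg)
+    return 0;
+  return MI.Imm * Scale;
+}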
+
+/// Searches for an increment or decrement of \p Reg before \p MBBI.
+static MachineBasicBlock::iterator
+findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg,
+ ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
+ Offset = 0;
+ MachineBasicBlock &MBB = *MBBI->getParent();
+ MachineBasicBlock::iterator BeginMBBI = MBB.begin();
+ MachineBasicBlock::iterator EndMBBI = MBB.end();
+ if (MBBI == BeginMBBI)
+ return EndMBBI;
+
+ // Skip debug values.
+ MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
+ while (PrevMBBI->isDebugValue() && PrevMBBI != BeginMBBI)
+ --PrevMBBI;
+
+ Offset = isIncrementOrDecrement(*PrevMBBI, Reg, Pred, PredReg);
+ return Offset == 0 ? EndMBBI : PrevMBBI;
+}
+
+/// Searches for an increment or decrement of \p Reg after \p MBBI.
+static MachineBasicBlock::iterator
+findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg,
+ ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
+ Offset = 0;
+ MachineBasicBlock &MBB = *MBBI->getParent();
+ MachineBasicBlock::iterator EndMBBI = MBB.end();
+ MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
+ // Skip debug values.
+ while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
+ ++NextMBBI;
+ if (NextMBBI == EndMBBI)
+ return EndMBBI;
+
+ Offset = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg);
+ return Offset == 0 ? EndMBBI : NextMBBI;
+}
+
 /// Fold preceding/trailing inc/dec of base register into the
/// LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
///
@@ -1151,7 +1168,6 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
const MachineOperand &BaseOP = MI->getOperand(0);
unsigned Base = BaseOP.getReg();
bool BaseKill = BaseOP.isKill();
- unsigned Bytes = getLSMultipleTransferSize(MI);
unsigned PredReg = 0;
ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
unsigned Opcode = MI->getOpcode();
@@ -1163,49 +1179,24 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
if (MI->getOperand(i).getReg() == Base)
return false;
- bool DoMerge = false;
- ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode);
-
- // Try merging with the previous instruction.
+ int Bytes = getLSMultipleTransferSize(MI);
MachineBasicBlock &MBB = *MI->getParent();
- MachineBasicBlock::iterator BeginMBBI = MBB.begin();
MachineBasicBlock::iterator MBBI(MI);
- if (MBBI != BeginMBBI) {
- MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
- while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
- --PrevMBBI;
- if (Mode == ARM_AM::ia &&
- isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
- Mode = ARM_AM::db;
- DoMerge = true;
- } else if (Mode == ARM_AM::ib &&
- isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
- Mode = ARM_AM::da;
- DoMerge = true;
- }
- if (DoMerge)
- MBB.erase(PrevMBBI);
- }
-
- // Try merging with the next instruction.
- MachineBasicBlock::iterator EndMBBI = MBB.end();
- if (!DoMerge && MBBI != EndMBBI) {
- MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
- while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
- ++NextMBBI;
- if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
- isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
- DoMerge = true;
- } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
- isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
- DoMerge = true;
- }
- if (DoMerge)
- MBB.erase(NextMBBI);
+ int Offset;
+ MachineBasicBlock::iterator MergeInstr
+ = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
+ ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode);
+ if (Mode == ARM_AM::ia && Offset == -Bytes) {
+ Mode = ARM_AM::db;
+ } else if (Mode == ARM_AM::ib && Offset == -Bytes) {
+ Mode = ARM_AM::da;
+ } else {
+ MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
+ if (((Mode != ARM_AM::ia && Mode != ARM_AM::ib) || Offset != Bytes) &&
+ ((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes))
+ return false;
}
-
- if (!DoMerge)
- return false;
+ MBB.erase(MergeInstr);
unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
@@ -1283,7 +1274,6 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
unsigned Base = getLoadStoreBaseOp(*MI).getReg();
bool BaseKill = getLoadStoreBaseOp(*MI).isKill();
- unsigned Bytes = getLSMultipleTransferSize(MI);
unsigned Opcode = MI->getOpcode();
DebugLoc DL = MI->getDebugLoc();
bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
@@ -1295,7 +1285,6 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
return false;
- bool isLd = isLoadSingle(Opcode);
// Can't do the merge if the destination register is the same as the would-be
// writeback register.
if (MI->getOperand(0).getReg() == Base)
@@ -1303,55 +1292,31 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
unsigned PredReg = 0;
ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
- bool DoMerge = false;
- ARM_AM::AddrOpc AddSub = ARM_AM::add;
- unsigned NewOpc = 0;
- // AM2 - 12 bits, thumb2 - 8 bits.
- unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);
-
- // Try merging with the previous instruction.
+ int Bytes = getLSMultipleTransferSize(MI);
MachineBasicBlock &MBB = *MI->getParent();
- MachineBasicBlock::iterator BeginMBBI = MBB.begin();
MachineBasicBlock::iterator MBBI(MI);
- if (MBBI != BeginMBBI) {
- MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
- while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
- --PrevMBBI;
- if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
- DoMerge = true;
- AddSub = ARM_AM::sub;
- } else if (!isAM5 &&
- isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) {
- DoMerge = true;
- }
- if (DoMerge) {
- NewOpc = getPreIndexedLoadStoreOpcode(Opcode, AddSub);
- MBB.erase(PrevMBBI);
- }
- }
-
- // Try merging with the next instruction.
- MachineBasicBlock::iterator EndMBBI = MBB.end();
- if (!DoMerge && MBBI != EndMBBI) {
- MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
- while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
- ++NextMBBI;
- if (!isAM5 &&
- isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
- DoMerge = true;
- AddSub = ARM_AM::sub;
- } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) {
- DoMerge = true;
- }
- if (DoMerge) {
- NewOpc = getPostIndexedLoadStoreOpcode(Opcode, AddSub);
- MBB.erase(NextMBBI);
- }
+ int Offset;
+ MachineBasicBlock::iterator MergeInstr
+ = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
+ unsigned NewOpc;
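+  // An add/sub of the access size before the load/store folds into a
+  // pre-indexed form, one after it into a post-indexed form; e.g.
+  // "ldr r1, [r0]; add r0, r0, #4" becomes "ldr r1, [r0], #4" (illustrative).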
+ if (!isAM5 && Offset == Bytes) {
+ NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
+ } else if (Offset == -Bytes) {
+ NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
+ } else {
+ MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
+ if (Offset == Bytes) {
+ NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
+ } else if (!isAM5 && Offset == -Bytes) {
+ NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
+ } else
+ return false;
}
+ MBB.erase(MergeInstr);
- if (!DoMerge)
- return false;
+ ARM_AM::AddrOpc AddSub = Offset < 0 ? ARM_AM::sub : ARM_AM::add;
+ bool isLd = isLoadSingle(Opcode);
if (isAM5) {
// VLDM[SD]_UPD, VSTM[SD]_UPD
// (There are no base-updating versions of VLDR/VSTR instructions, but the
@@ -1368,18 +1333,16 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
if (isAM2) {
// LDR_PRE, LDR_POST
if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) {
- int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
.addReg(Base, RegState::Define)
.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
} else {
- int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
+ int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
.addReg(Base, RegState::Define)
- .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
+ .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg);
}
} else {
- int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
// t2LDR_PRE, t2LDR_POST
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
.addReg(Base, RegState::Define)
@@ -1391,13 +1354,12 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
// the vestigial zero-reg offset register. When that's fixed, this clause
// can be removed entirely.
if (isAM2 && NewOpc == ARM::STR_POST_IMM) {
- int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
+ int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
// STR_PRE, STR_POST
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
.addReg(MO.getReg(), getKillRegState(MO.isKill()))
- .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
+ .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg);
} else {
- int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
// t2STR_PRE, t2STR_POST
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
.addReg(MO.getReg(), getKillRegState(MO.isKill()))
@@ -1409,46 +1371,75 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
return true;
}
-/// Returns true if instruction is a memory operation that this pass is capable
-/// of operating on.
-static bool isMemoryOp(const MachineInstr *MI) {
- // When no memory operands are present, conservatively assume unaligned,
- // volatile, unfoldable.
- if (!MI->hasOneMemOperand())
+bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
+ unsigned Opcode = MI.getOpcode();
+ assert((Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) &&
+ "Must have t2STRDi8 or t2LDRDi8");
+ if (MI.getOperand(3).getImm() != 0)
return false;
- const MachineMemOperand *MMO = *MI->memoperands_begin();
-
- // Don't touch volatile memory accesses - we may be changing their order.
- if (MMO->isVolatile())
+ // Behaviour for writeback is undefined if base register is the same as one
+ // of the others.
+ const MachineOperand &BaseOp = MI.getOperand(2);
+ unsigned Base = BaseOp.getReg();
+ const MachineOperand &Reg0Op = MI.getOperand(0);
+ const MachineOperand &Reg1Op = MI.getOperand(1);
+ if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base)
return false;
- // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is
- // not.
- if (MMO->getAlignment() < 4)
- return false;
+ unsigned PredReg;
+ ARMCC::CondCodes Pred = getInstrPredicate(&MI, PredReg);
+ MachineBasicBlock::iterator MBBI(MI);
+ MachineBasicBlock &MBB = *MI.getParent();
+ int Offset;
+ MachineBasicBlock::iterator MergeInstr = findIncDecBefore(MBBI, Base, Pred,
+ PredReg, Offset);
+ unsigned NewOpc;
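+  // t2LDRDi8/t2STRDi8 always transfer two 32-bit registers, so only a base
+  // update of +/-8 bytes can be folded into the pre-/post-indexed form.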
+ if (Offset == 8 || Offset == -8) {
+ NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_PRE : ARM::t2STRD_PRE;
+ } else {
+ MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
+ if (Offset == 8 || Offset == -8) {
+ NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_POST : ARM::t2STRD_POST;
+ } else
+ return false;
+ }
+ MBB.erase(MergeInstr);
- // str <undef> could probably be eliminated entirely, but for now we just want
- // to avoid making a mess of it.
- // FIXME: Use str <undef> as a wildcard to enable better stm folding.
- if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg() &&
- MI->getOperand(0).isUndef())
- return false;
+ DebugLoc DL = MI.getDebugLoc();
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
+ if (NewOpc == ARM::t2LDRD_PRE || NewOpc == ARM::t2LDRD_POST) {
+ MIB.addOperand(Reg0Op).addOperand(Reg1Op)
+ .addReg(BaseOp.getReg(), RegState::Define);
+ } else {
+ assert(NewOpc == ARM::t2STRD_PRE || NewOpc == ARM::t2STRD_POST);
+ MIB.addReg(BaseOp.getReg(), RegState::Define)
+ .addOperand(Reg0Op).addOperand(Reg1Op);
+ }
+ MIB.addReg(BaseOp.getReg(), RegState::Kill)
+ .addImm(Offset).addImm(Pred).addReg(PredReg);
+ assert(TII->get(Opcode).getNumOperands() == 6 &&
+ TII->get(NewOpc).getNumOperands() == 7 &&
+ "Unexpected number of operands in Opcode specification.");
- // Likewise don't mess with references to undefined addresses.
- if (MI->getNumOperands() > 1 && MI->getOperand(1).isReg() &&
- MI->getOperand(1).isUndef())
- return false;
+ // Transfer implicit operands.
+ for (const MachineOperand &MO : MI.implicit_operands())
+ MIB.addOperand(MO);
+ MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
- unsigned Opcode = MI->getOpcode();
+ MBB.erase(MBBI);
+ return true;
+}
+
+/// Returns true if instruction is a memory operation that this pass is capable
+/// of operating on.
+static bool isMemoryOp(const MachineInstr &MI) {
+ unsigned Opcode = MI.getOpcode();
switch (Opcode) {
- default: break;
case ARM::VLDRS:
case ARM::VSTRS:
- return MI->getOperand(1).isReg();
case ARM::VLDRD:
case ARM::VSTRD:
- return MI->getOperand(1).isReg();
case ARM::LDRi12:
case ARM::STRi12:
case ARM::tLDRi:
@@ -1459,9 +1450,40 @@ static bool isMemoryOp(const MachineInstr *MI) {
case ARM::t2LDRi12:
case ARM::t2STRi8:
case ARM::t2STRi12:
- return MI->getOperand(1).isReg();
+ break;
+ default:
+ return false;
}
- return false;
+ if (!MI.getOperand(1).isReg())
+ return false;
+
+ // When no memory operands are present, conservatively assume unaligned,
+ // volatile, unfoldable.
+ if (!MI.hasOneMemOperand())
+ return false;
+
+ const MachineMemOperand &MMO = **MI.memoperands_begin();
+
+ // Don't touch volatile memory accesses - we may be changing their order.
+ if (MMO.isVolatile())
+ return false;
+
+ // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is
+ // not.
+ if (MMO.getAlignment() < 4)
+ return false;
+
+ // str <undef> could probably be eliminated entirely, but for now we just want
+ // to avoid making a mess of it.
+ // FIXME: Use str <undef> as a wildcard to enable better stm folding.
+ if (MI.getOperand(0).isReg() && MI.getOperand(0).isUndef())
+ return false;
+
+ // Likewise don't mess with references to undefined addresses.
+ if (MI.getOperand(1).isUndef())
+ return false;
+
+ return true;
}
static void InsertLDR_STR(MachineBasicBlock &MBB,
@@ -1616,6 +1638,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
ARMCC::CondCodes CurrPred = ARMCC::AL;
unsigned Position = 0;
assert(Candidates.size() == 0);
+ assert(MergeBaseCandidates.size() == 0);
LiveRegsValid = false;
for (MachineBasicBlock::iterator I = MBB.end(), MBBI; I != MBB.begin();
@@ -1626,7 +1649,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
continue;
++Position;
- if (isMemoryOp(MBBI)) {
+ if (isMemoryOp(*MBBI)) {
unsigned Opcode = MBBI->getOpcode();
const MachineOperand &MO = MBBI->getOperand(0);
unsigned Reg = MO.getReg();
@@ -1694,8 +1717,15 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
MBBI = I;
--Position;
// Fallthrough to look into existing chain.
- } else if (MBBI->isDebugValue())
+ } else if (MBBI->isDebugValue()) {
continue;
+ } else if (MBBI->getOpcode() == ARM::t2LDRDi8 ||
+ MBBI->getOpcode() == ARM::t2STRDi8) {
+ // ARMPreAllocLoadStoreOpt has already formed some LDRD/STRD instructions;
+ // remember them because we may still be able to merge add/sub into them.
+ MergeBaseCandidates.push_back(MBBI);
+ }
+
// If we are here then the chain is broken; Extract candidates for a merge.
if (MemOps.size() > 0) {
@@ -1726,7 +1756,9 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
if (Merged) {
Changed = true;
unsigned Opcode = Merged->getOpcode();
- if (Opcode != ARM::t2STRDi8 && Opcode != ARM::t2LDRDi8)
+ if (Opcode == ARM::t2STRDi8 || Opcode == ARM::t2LDRDi8)
+ MergeBaseUpdateLSDouble(*Merged);
+ else
MergeBaseUpdateLSMultiple(Merged);
} else {
for (MachineInstr *MI : Candidate->Instrs) {
@@ -1741,6 +1773,10 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
}
}
Candidates.clear();
+ // Try to fold add/sub into the LDRD/STRD formed by ARMPreAllocLoadStoreOpt.
+ for (MachineInstr *MI : MergeBaseCandidates)
+ MergeBaseUpdateLSDouble(*MI);
+ MergeBaseCandidates.clear();
return Changed;
}
@@ -1765,7 +1801,11 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
(MBBI->getOpcode() == ARM::BX_RET ||
MBBI->getOpcode() == ARM::tBX_RET ||
MBBI->getOpcode() == ARM::MOVPCLR)) {
- MachineInstr *PrevMI = std::prev(MBBI);
+ MachineBasicBlock::iterator PrevI = std::prev(MBBI);
+ // Ignore any DBG_VALUE instructions.
+ while (PrevI->isDebugValue() && PrevI != MBB.begin())
+ --PrevI;
+ MachineInstr *PrevMI = PrevI;
unsigned Opcode = PrevMI->getOpcode();
if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD ||
Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD ||
@@ -1786,6 +1826,30 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
return false;
}
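+/// Fold a Thumb "mov lr, rN; bx lr" return sequence into a single "bx rN"
+/// when rN is killed by the move (summary of the transformation below).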
+bool ARMLoadStoreOpt::CombineMovBx(MachineBasicBlock &MBB) {
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+ if (MBBI == MBB.begin() || MBBI == MBB.end() ||
+ MBBI->getOpcode() != ARM::tBX_RET)
+ return false;
+
+ MachineBasicBlock::iterator Prev = MBBI;
+ --Prev;
+ if (Prev->getOpcode() != ARM::tMOVr || !Prev->definesRegister(ARM::LR))
+ return false;
+
+ for (auto Use : Prev->uses())
+ if (Use.isKill()) {
+ AddDefaultPred(BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(ARM::tBX))
+ .addReg(Use.getReg(), RegState::Kill))
+ .copyImplicitOps(&*MBBI);
+ MBB.erase(MBBI);
+ MBB.erase(Prev);
+ return true;
+ }
+
+ llvm_unreachable("tMOVr doesn't kill a reg before tBX_RET?");
+}
+
bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
MF = &Fn;
STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
@@ -1793,7 +1857,7 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
AFI = Fn.getInfo<ARMFunctionInfo>();
TII = STI->getInstrInfo();
TRI = STI->getRegisterInfo();
- MRI = &Fn.getRegInfo();
+
RegClassInfoValid = false;
isThumb2 = AFI->isThumb2Function();
isThumb1 = AFI->isThumbFunction() && !isThumb2;
@@ -1805,18 +1869,29 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
Modified |= LoadStoreMultipleOpti(MBB);
if (STI->hasV5TOps())
Modified |= MergeReturnIntoLDM(MBB);
+ if (isThumb1)
+ Modified |= CombineMovBx(MBB);
}
Allocator.DestroyAll();
return Modified;
}
+namespace llvm {
+void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);
+}
+
+#define ARM_PREALLOC_LOAD_STORE_OPT_NAME \
+ "ARM pre- register allocation load / store optimization pass"
+
namespace {
  /// Pre-register-allocation pass that moves loads / stores from consecutive
  /// locations closer together to make it more likely they will be combined later.
struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass{
static char ID;
- ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}
+ ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {
+ initializeARMPreAllocLoadStoreOptPass(*PassRegistry::getPassRegistry());
+ }
const DataLayout *TD;
const TargetInstrInfo *TII;
@@ -1828,7 +1903,7 @@ namespace {
bool runOnMachineFunction(MachineFunction &Fn) override;
const char *getPassName() const override {
- return "ARM pre- register allocation load / store optimization pass";
+ return ARM_PREALLOC_LOAD_STORE_OPT_NAME;
}
private:
@@ -1847,8 +1922,11 @@ namespace {
char ARMPreAllocLoadStoreOpt::ID = 0;
}
+INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-load-store-opt",
+ ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
+
bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
- TD = Fn.getTarget().getDataLayout();
+ TD = &Fn.getDataLayout();
STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
TII = STI->getInstrInfo();
TRI = STI->getRegisterInfo();
@@ -1856,9 +1934,8 @@ bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
MF = &Fn;
bool Modified = false;
- for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
- ++MFI)
- Modified |= RescheduleLoadStoreInstrs(MFI);
+ for (MachineBasicBlock &MFI : Fn)
+ Modified |= RescheduleLoadStoreInstrs(&MFI);
return Modified;
}
@@ -2187,7 +2264,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
if (!MI->isDebugValue())
MI2LocMap[MI] = ++Loc;
- if (!isMemoryOp(MI))
+ if (!isMemoryOp(*MI))
continue;
unsigned PredReg = 0;
if (getInstrPredicate(MI, PredReg) != ARMCC::AL)
@@ -2275,3 +2352,4 @@ FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
return new ARMPreAllocLoadStoreOpt();
return new ARMLoadStoreOpt();
}
+
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/lib/Target/ARM/ARMMachineFunctionInfo.cpp
index f5250ff83f0b..ac0330fbcb34 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.cpp
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.cpp
@@ -1,4 +1,4 @@
-//===-- ARMMachineFuctionInfo.cpp - ARM machine function info -------------===//
+//===-- ARMMachineFunctionInfo.cpp - ARM machine function info ------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -20,5 +20,4 @@ ARMFunctionInfo::ARMFunctionInfo(MachineFunction &MF)
RestoreSPFromFP(false), LRSpilledForFarJump(false),
FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
- PICLabelUId(0), VarArgsFrameIndex(0), HasITBlocks(false),
- GlobalBaseReg(0) {}
+ PICLabelUId(0), VarArgsFrameIndex(0), HasITBlocks(false) {}
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h
index 14dd9ef333af..d6447978ef2c 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -1,4 +1,4 @@
-//===-- ARMMachineFuctionInfo.h - ARM machine function info -----*- C++ -*-===//
+//===-- ARMMachineFunctionInfo.h - ARM machine function info ----*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -52,7 +52,7 @@ class ARMFunctionInfo : public MachineFunctionInfo {
unsigned ReturnRegsCount;
/// HasStackFrame - True if this function has a stack frame. Set by
- /// processFunctionBeforeCalleeSavedScan().
+ /// determineCalleeSaves().
bool HasStackFrame;
/// RestoreSPFromFP - True if epilogue should restore SP from FP. Set by
@@ -110,11 +110,6 @@ class ARMFunctionInfo : public MachineFunctionInfo {
/// pass.
DenseMap<unsigned, unsigned> CPEClones;
- /// GlobalBaseReg - keeps track of the virtual register initialized for
- /// use as the global base register. This is used for PIC in some PIC
- /// relocation models.
- unsigned GlobalBaseReg;
-
/// ArgumentStackSize - amount of bytes on stack consumed by the arguments
/// being passed on the stack
unsigned ArgumentStackSize;
@@ -133,7 +128,7 @@ public:
FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
GPRCS1Size(0), GPRCS2Size(0), DPRCSAlignGapSize(0), DPRCSSize(0),
NumAlignedDPRCS2Regs(0), PICLabelUId(0),
- VarArgsFrameIndex(0), HasITBlocks(false), GlobalBaseReg(0) {}
+ VarArgsFrameIndex(0), HasITBlocks(false) {}
explicit ARMFunctionInfo(MachineFunction &MF);
@@ -204,9 +199,6 @@ public:
bool hasITBlocks() const { return HasITBlocks; }
void setHasITBlocks(bool h) { HasITBlocks = h; }
- unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
- void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
-
void recordCPEClone(unsigned CPIdx, unsigned CPCloneIdx) {
if (!CPEClones.insert(std::make_pair(CPCloneIdx, CPIdx)).second)
llvm_unreachable("Duplicate entries!");
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index 45cc9ea91f37..02cbfb1fa9f1 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -266,12 +266,19 @@ def CCR : RegisterClass<"ARM", [i32], 32, (add CPSR)> {
}
 // Scalar single precision floating point register class.
-// FIXME: Allocation order changed to s0, s2, s4, ... as a quick hack to
-// avoid partial-write dependencies on D registers (S registers are
-// renamed as portions of D registers).
-def SPR : RegisterClass<"ARM", [f32], 32, (add (decimate
- (sequence "S%u", 0, 31), 2),
- (sequence "S%u", 0, 31))>;
+// FIXME: Allocation order changed to s0, s2, ... or s0, s4, ... as a quick hack
+// to avoid partial-write dependencies on D or Q (depending on platform)
+// registers (S registers are renamed as portions of D/Q registers).
+def SPR : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 31)> {
+ let AltOrders = [(add (decimate SPR, 2), SPR),
+ (add (decimate SPR, 4),
+ (decimate SPR, 2),
+ (decimate (rotl SPR, 1), 4),
+ (decimate (rotl SPR, 1), 2))];
+ let AltOrderSelect = [{
+ return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
+ }];
+}
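+// AltOrderSelect returns 0 for the default order, 1 for the stride-2 order and
+// 2 for the stride-4 order above, so "1 + useStride4VFPs(MF)" prefers stride 2
+// and switches to stride 4 when the subtarget requests it.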
// Subset of SPR which can be used as a source of NEON scalars for 16-bit
// operations
@@ -281,25 +288,29 @@ def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)>;
// class.
// ARM requires only word alignment for double. It's more performant if it
 // is double-word aligned, though.
-def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
+def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64,
(sequence "D%u", 0, 31)> {
- // Allocate non-VFP2 registers D16-D31 first.
- let AltOrders = [(rotl DPR, 16)];
- let AltOrderSelect = [{ return 1; }];
+ // Allocate non-VFP2 registers D16-D31 first, and prefer even registers on
+ // Darwin platforms.
+ let AltOrders = [(rotl DPR, 16),
+ (add (decimate (rotl DPR, 16), 2), (rotl DPR, 16))];
+ let AltOrderSelect = [{
+ return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
+ }];
}
// Subset of DPR that are accessible with VFP2 (and so that also have
// 32-bit SPR subregs).
-def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
+def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64,
(trunc DPR, 16)>;
// Subset of DPR which can be used as a source of NEON scalars for 16-bit
// operations
-def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
+def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64,
(trunc DPR, 8)>;
// Generic 128-bit vector register class.
-def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
+def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16], 128,
(sequence "Q%u", 0, 15)> {
// Allocate non-VFP2 aliases Q8-Q15 first.
let AltOrders = [(rotl QPR, 8)];
diff --git a/lib/Target/ARM/ARMScheduleSwift.td b/lib/Target/ARM/ARMScheduleSwift.td
index b03d5ff44c6e..3ad7730228e5 100644
--- a/lib/Target/ARM/ARMScheduleSwift.td
+++ b/lib/Target/ARM/ARMScheduleSwift.td
@@ -37,1050 +37,13 @@ def SW_FDIV : FuncUnit;
// FIXME: Add preload instruction when it is documented.
// FIXME: Model non-pipelined nature of FP div / sqrt unit.
-def SwiftItineraries : ProcessorItineraries<
- [SW_DIS0, SW_DIS1, SW_DIS2, SW_ALU0, SW_ALU1, SW_LS, SW_IDIV, SW_FDIV], [], [
- //
- // Move instructions, unconditional
- InstrItinData<IIC_iMOVi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [1]>,
- InstrItinData<IIC_iMOVr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [1]>,
- InstrItinData<IIC_iMOVsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [1]>,
- InstrItinData<IIC_iMOVsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [1]>,
- InstrItinData<IIC_iMOVix2 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [2]>,
- InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [3]>,
- InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>,
- InstrStage<1, [SW_LS]>],
- [5]>,
- //
- // MVN instructions
- InstrItinData<IIC_iMVNi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [1]>,
- InstrItinData<IIC_iMVNr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [1]>,
- InstrItinData<IIC_iMVNsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [1]>,
- InstrItinData<IIC_iMVNsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [1]>,
- //
- // No operand cycles
- InstrItinData<IIC_iALUx , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>]>,
- //
- // Binary Instructions that produce a result
- InstrItinData<IIC_iALUi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [1, 1]>,
- InstrItinData<IIC_iALUr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [1, 1, 1]>,
- InstrItinData<IIC_iALUsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [2, 1, 1]>,
- InstrItinData<IIC_iALUsir,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [2, 1, 1]>,
- InstrItinData<IIC_iALUsr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [2, 1, 1, 1]>,
- //
- // Bitwise Instructions that produce a result
- InstrItinData<IIC_iBITi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [1, 1]>,
- InstrItinData<IIC_iBITr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [1, 1, 1]>,
- InstrItinData<IIC_iBITsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [2, 1, 1]>,
- InstrItinData<IIC_iBITsr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [2, 1, 1, 1]>,
- //
- // Unary Instructions that produce a result
-
- // CLZ, RBIT, etc.
- InstrItinData<IIC_iUNAr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [1, 1]>,
-
- // BFC, BFI, UBFX, SBFX
- InstrItinData<IIC_iUNAsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [2, 1]>,
-
- //
- // Zero and sign extension instructions
- InstrItinData<IIC_iEXTr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [1, 1]>,
- InstrItinData<IIC_iEXTAr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [1, 1, 1]>,
- InstrItinData<IIC_iEXTAsr,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [1, 1, 1, 1]>,
- //
- // Compare instructions
- InstrItinData<IIC_iCMPi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [1]>,
- InstrItinData<IIC_iCMPr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [1, 1]>,
- InstrItinData<IIC_iCMPsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<2, [SW_ALU0, SW_ALU1]>],
- [1, 1]>,
- InstrItinData<IIC_iCMPsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<2, [SW_ALU0, SW_ALU1]>],
- [1, 1, 1]>,
- //
- // Test instructions
- InstrItinData<IIC_iTSTi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [1]>,
- InstrItinData<IIC_iTSTr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [1, 1]>,
- InstrItinData<IIC_iTSTsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<2, [SW_ALU0, SW_ALU1]>],
- [1, 1]>,
- InstrItinData<IIC_iTSTsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<2, [SW_ALU0, SW_ALU1]>],
- [1, 1, 1]>,
- //
- // Move instructions, conditional
- // FIXME: Correctly model the extra input dep on the destination.
- InstrItinData<IIC_iCMOVi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [1]>,
- InstrItinData<IIC_iCMOVr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [1, 1]>,
- InstrItinData<IIC_iCMOVsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [1, 1]>,
- InstrItinData<IIC_iCMOVsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [2, 1, 1]>,
- InstrItinData<IIC_iCMOVix2, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [2]>,
-
- // Integer multiply pipeline
- //
- InstrItinData<IIC_iMUL16 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [3, 1, 1]>,
- InstrItinData<IIC_iMAC16 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [3, 1, 1, 1]>,
- InstrItinData<IIC_iMUL32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [4, 1, 1]>,
- InstrItinData<IIC_iMAC32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [4, 1, 1, 1]>,
- InstrItinData<IIC_iMUL64 , [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0], 1>,
- InstrStage<1, [SW_ALU0], 3>,
- InstrStage<1, [SW_ALU0]>],
- [5, 5, 1, 1]>,
- InstrItinData<IIC_iMAC64 , [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0], 1>,
- InstrStage<1, [SW_ALU0], 1>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 3>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [5, 6, 1, 1]>,
- //
- // Integer divide
- InstrItinData<IIC_iDIV , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0], 0>,
- InstrStage<14, [SW_IDIV]>],
- [14, 1, 1]>,
-
- // Integer load pipeline
- // FIXME: The timings are some rough approximations
- //
- // Immediate offset
- InstrItinData<IIC_iLoad_i , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_LS]>],
- [3, 1]>,
- InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_LS]>],
- [3, 1]>,
- InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_LS], 1>,
- InstrStage<1, [SW_LS]>],
- [3, 4, 1]>,
- //
- // Register offset
- InstrItinData<IIC_iLoad_r , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_LS]>],
- [3, 1, 1]>,
- InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_LS]>],
- [3, 1, 1]>,
- InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_DIS2], 0>,
- InstrStage<1, [SW_LS], 1>,
- InstrStage<1, [SW_LS], 3>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [3, 4, 1, 1]>,
- //
- // Scaled register offset
- InstrItinData<IIC_iLoad_si , [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
- InstrStage<1, [SW_LS]>],
- [5, 1, 1]>,
- InstrItinData<IIC_iLoad_bh_si,[InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
- InstrStage<1, [SW_LS]>],
- [5, 1, 1]>,
- //
- // Immediate offset with update
- InstrItinData<IIC_iLoad_iu , [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
- InstrStage<1, [SW_LS]>],
- [3, 1, 1]>,
- InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
- InstrStage<1, [SW_LS]>],
- [3, 1, 1]>,
- //
- // Register offset with update
- InstrItinData<IIC_iLoad_ru , [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_ALU0], 1>,
- InstrStage<1, [SW_LS]>],
- [3, 1, 1, 1]>,
- InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_ALU0], 1>,
- InstrStage<1, [SW_LS]>],
- [3, 1, 1, 1]>,
- InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 0>,
- InstrStage<1, [SW_LS], 3>,
- InstrStage<1, [SW_LS], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [3, 4, 1, 1]>,
- //
- // Scaled register offset with update
- InstrItinData<IIC_iLoad_siu , [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
- InstrStage<1, [SW_LS], 3>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [5, 3, 1, 1]>,
- InstrItinData<IIC_iLoad_bh_siu,[InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
- InstrStage<1, [SW_LS], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [5, 3, 1, 1]>,
- //
- // Load multiple, def is the 5th operand.
- // FIXME: This assumes 3 to 4 registers.
- InstrItinData<IIC_iLoad_m , [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
- InstrStage<1, [SW_LS]>],
- [1, 1, 1, 1, 3], [], -1>, // dynamic uops
-
- //
- // Load multiple + update, defs are the 1st and 5th operands.
- InstrItinData<IIC_iLoad_mu , [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 0>,
- InstrStage<1, [SW_LS], 3>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [2, 1, 1, 1, 3], [], -1>, // dynamic uops
- //
- // Load multiple plus branch
- InstrItinData<IIC_iLoad_mBr, [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
- InstrStage<1, [SW_LS]>],
- [1, 1, 1, 1, 3], [], -1>, // dynamic uops
- //
- // Pop, def is the 3rd operand.
- InstrItinData<IIC_iPop , [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
- InstrStage<1, [SW_LS]>],
- [1, 1, 3], [], -1>, // dynamic uops
- //
- // Pop + branch, def is the 3rd operand.
- InstrItinData<IIC_iPop_Br, [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
- InstrStage<1, [SW_LS]>],
- [1, 1, 3], [], -1>, // dynamic uops
-
- //
- // iLoadi + iALUr for t2LDRpci_pic.
- InstrItinData<IIC_iLoadiALU, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_LS], 3>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [4, 1]>,
-
- // Integer store pipeline
- ///
- // Immediate offset
- InstrItinData<IIC_iStore_i , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_LS]>],
- [1, 1]>,
- InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_LS]>],
- [1, 1]>,
- InstrItinData<IIC_iStore_d_i, [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_DIS2], 0>,
- InstrStage<1, [SW_LS], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
- InstrStage<1, [SW_LS]>],
- [1, 1]>,
- //
- // Register offset
- InstrItinData<IIC_iStore_r , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_LS]>],
- [1, 1, 1]>,
- InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_LS]>],
- [1, 1, 1]>,
- InstrItinData<IIC_iStore_d_r, [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_DIS2], 0>,
- InstrStage<1, [SW_LS], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
- InstrStage<1, [SW_LS]>],
- [1, 1, 1]>,
- //
- // Scaled register offset
- InstrItinData<IIC_iStore_si , [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
- InstrStage<1, [SW_LS]>],
- [1, 1, 1]>,
- InstrItinData<IIC_iStore_bh_si,[InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
- InstrStage<1, [SW_LS]>],
- [1, 1, 1]>,
- //
- // Immediate offset with update
- InstrItinData<IIC_iStore_iu , [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
- InstrStage<1, [SW_LS]>],
- [1, 1, 1]>,
- InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
- InstrStage<1, [SW_LS]>],
- [1, 1, 1]>,
- //
- // Register offset with update
- InstrItinData<IIC_iStore_ru , [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
- InstrStage<1, [SW_LS]>],
- [1, 1, 1, 1]>,
- InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
- InstrStage<1, [SW_LS]>],
- [1, 1, 1, 1]>,
- InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
- InstrStage<1, [SW_LS]>],
- [1, 1, 1, 1]>,
- //
- // Scaled register offset with update
- InstrItinData<IIC_iStore_siu, [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
- InstrStage<1, [SW_LS], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 1>],
- [3, 1, 1, 1]>,
- InstrItinData<IIC_iStore_bh_siu, [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
- InstrStage<1, [SW_LS], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 1>],
- [3, 1, 1, 1]>,
- //
- // Store multiple
- InstrItinData<IIC_iStore_m , [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
- InstrStage<1, [SW_LS], 1>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
- InstrStage<1, [SW_LS], 1>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
- InstrStage<1, [SW_LS]>],
- [], [], -1>, // dynamic uops
- //
- // Store multiple + update
- InstrItinData<IIC_iStore_mu, [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
- InstrStage<1, [SW_LS], 1>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
- InstrStage<1, [SW_LS], 1>,
- InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
- InstrStage<1, [SW_LS]>],
- [2], [], -1>, // dynamic uops
-
- //
- // Preload
- InstrItinData<IIC_Preload, [InstrStage<1, [SW_DIS0], 0>], [1, 1]>,
-
- // Branch
- //
- // no delay slots, so the latency of a branch is unimportant
- InstrItinData<IIC_Br , [InstrStage<1, [SW_DIS0], 0>]>,
-
- // FP Special Register to Integer Register File Move
- InstrItinData<IIC_fpSTAT , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [1]>,
- //
- // Single-precision FP Unary
- //
- // Most floating-point moves get issued on ALU0.
- InstrItinData<IIC_fpUNA32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [2, 1]>,
- //
- // Double-precision FP Unary
- InstrItinData<IIC_fpUNA64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [2, 1]>,
-
- //
- // Single-precision FP Compare
- InstrItinData<IIC_fpCMP32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [1, 1]>,
- //
- // Double-precision FP Compare
- InstrItinData<IIC_fpCMP64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [1, 1]>,
- //
- // Single to Double FP Convert
- InstrItinData<IIC_fpCVTSD , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [4, 1]>,
- //
- // Double to Single FP Convert
- InstrItinData<IIC_fpCVTDS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [4, 1]>,
-
- //
- // Single to Half FP Convert
- InstrItinData<IIC_fpCVTSH , [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_ALU1], 4>,
- InstrStage<1, [SW_ALU1]>],
- [6, 1]>,
- //
- // Half to Single FP Convert
- InstrItinData<IIC_fpCVTHS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [4, 1]>,
-
- //
- // Single-Precision FP to Integer Convert
- InstrItinData<IIC_fpCVTSI , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [4, 1]>,
- //
- // Double-Precision FP to Integer Convert
- InstrItinData<IIC_fpCVTDI , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [4, 1]>,
- //
- // Integer to Single-Precision FP Convert
- InstrItinData<IIC_fpCVTIS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [4, 1]>,
- //
- // Integer to Double-Precision FP Convert
- InstrItinData<IIC_fpCVTID , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [4, 1]>,
- //
- // Single-precision FP ALU
- InstrItinData<IIC_fpALU32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [2, 1, 1]>,
- //
- // Double-precision FP ALU
- InstrItinData<IIC_fpALU64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [2, 1, 1]>,
- //
- // Single-precision FP Multiply
- InstrItinData<IIC_fpMUL32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [4, 1, 1]>,
- //
- // Double-precision FP Multiply
- InstrItinData<IIC_fpMUL64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [6, 1, 1]>,
- //
- // Single-precision FP MAC
- InstrItinData<IIC_fpMAC32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [8, 1, 1]>,
- //
- // Double-precision FP MAC
- InstrItinData<IIC_fpMAC64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [12, 1, 1]>,
- //
- // Single-precision Fused FP MAC
- InstrItinData<IIC_fpFMAC32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [8, 1, 1]>,
- //
- // Double-precision Fused FP MAC
- InstrItinData<IIC_fpFMAC64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [12, 1, 1]>,
- //
- // Single-precision FP DIV
- InstrItinData<IIC_fpDIV32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1], 0>,
- InstrStage<15, [SW_FDIV]>],
- [17, 1, 1]>,
- //
- // Double-precision FP DIV
- InstrItinData<IIC_fpDIV64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1], 0>,
- InstrStage<30, [SW_FDIV]>],
- [32, 1, 1]>,
- //
- // Single-precision FP SQRT
- InstrItinData<IIC_fpSQRT32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1], 0>,
- InstrStage<15, [SW_FDIV]>],
- [17, 1]>,
- //
- // Double-precision FP SQRT
- InstrItinData<IIC_fpSQRT64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1], 0>,
- InstrStage<30, [SW_FDIV]>],
- [32, 1, 1]>,
-
- //
- // Integer to Single-precision Move
- InstrItinData<IIC_fpMOVIS, [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_LS], 4>,
- InstrStage<1, [SW_ALU0]>],
- [6, 1]>,
- //
- // Integer to Double-precision Move
- InstrItinData<IIC_fpMOVID, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_LS]>],
- [4, 1]>,
- //
- // Single-precision to Integer Move
- InstrItinData<IIC_fpMOVSI, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_LS]>],
- [3, 1]>,
- //
- // Double-precision to Integer Move
- InstrItinData<IIC_fpMOVDI, [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_LS], 3>,
- InstrStage<1, [SW_LS]>],
- [3, 4, 1]>,
- //
- // Single-precision FP Load
- InstrItinData<IIC_fpLoad32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_LS]>],
- [4, 1]>,
- //
- // Double-precision FP Load
- InstrItinData<IIC_fpLoad64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_LS]>],
- [4, 1]>,
- //
- // FP Load Multiple
- // FIXME: Assumes a single Q register.
- InstrItinData<IIC_fpLoad_m, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_LS]>],
- [1, 1, 1, 4], [], -1>, // dynamic uops
- //
- // FP Load Multiple + update
- // FIXME: Assumes a single Q register.
- InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_LS], 4>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [2, 1, 1, 1, 4], [], -1>, // dynamic uops
- //
- // Single-precision FP Store
- InstrItinData<IIC_fpStore32,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_LS]>],
- [1, 1]>,
- //
- // Double-precision FP Store
- InstrItinData<IIC_fpStore64,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_LS]>],
- [1, 1]>,
- //
- // FP Store Multiple
- // FIXME: Assumes a single Q register.
- InstrItinData<IIC_fpStore_m,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_LS]>],
- [1, 1, 1], [], -1>, // dynamic uops
- //
- // FP Store Multiple + update
- // FIXME: Assumes a single Q register.
- InstrItinData<IIC_fpStore_mu,[InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_LS], 4>,
- InstrStage<1, [SW_ALU0, SW_ALU1]>],
- [2, 1, 1, 1], [], -1>, // dynamic uops
- // NEON
- //
- // Double-register Integer Unary
- InstrItinData<IIC_VUNAiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [4, 1]>,
- //
- // Quad-register Integer Unary
- InstrItinData<IIC_VUNAiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [4, 1]>,
- //
- // Double-register Integer Q-Unary
- InstrItinData<IIC_VQUNAiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [4, 1]>,
- //
- // Quad-register Integer CountQ-Unary
- InstrItinData<IIC_VQUNAiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [4, 1]>,
- //
- // Double-register Integer Binary
- InstrItinData<IIC_VBINiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [2, 1, 1]>,
- //
- // Quad-register Integer Binary
- InstrItinData<IIC_VBINiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [2, 1, 1]>,
- //
- // Double-register Integer Subtract
- InstrItinData<IIC_VSUBiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [2, 1, 1]>,
- //
- // Quad-register Integer Subtract
- InstrItinData<IIC_VSUBiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [2, 1, 1]>,
- //
- // Double-register Integer Shift
- InstrItinData<IIC_VSHLiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [2, 1, 1]>,
- //
- // Quad-register Integer Shift
- InstrItinData<IIC_VSHLiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [2, 1, 1]>,
- //
- // Double-register Integer Shift (4 cycle)
- InstrItinData<IIC_VSHLi4D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [4, 1, 1]>,
- //
- // Quad-register Integer Shift (4 cycle)
- InstrItinData<IIC_VSHLi4Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [4, 1, 1]>,
- //
- // Double-register Integer Binary (4 cycle)
- InstrItinData<IIC_VBINi4D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [4, 1, 1]>,
- //
- // Quad-register Integer Binary (4 cycle)
- InstrItinData<IIC_VBINi4Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [4, 1, 1]>,
- //
- // Double-register Integer Subtract (4 cycle)
- InstrItinData<IIC_VSUBi4D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [4, 1, 1]>,
- //
- // Quad-register Integer Subtract (4 cycle)
- InstrItinData<IIC_VSUBi4Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [4, 1, 1]>,
-
- //
- // Double-register Integer Count
- InstrItinData<IIC_VCNTiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [2, 1, 1]>,
- //
- // Quad-register Integer Count
- InstrItinData<IIC_VCNTiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [2, 1, 1]>,
- //
- // Double-register Absolute Difference and Accumulate
- InstrItinData<IIC_VABAD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [4, 1, 1, 1]>,
- //
- // Quad-register Absolute Difference and Accumulate
- InstrItinData<IIC_VABAQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [4, 1, 1, 1]>,
- //
- // Double-register Integer Pair Add Long
- InstrItinData<IIC_VPALiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [4, 1, 1]>,
- //
- // Quad-register Integer Pair Add Long
- InstrItinData<IIC_VPALiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [4, 1, 1]>,
-
- //
- // Double-register Integer Multiply (.8, .16)
- InstrItinData<IIC_VMULi16D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [4, 1, 1]>,
- //
- // Quad-register Integer Multiply (.8, .16)
- InstrItinData<IIC_VMULi16Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [4, 1, 1]>,
-
- //
- // Double-register Integer Multiply (.32)
- InstrItinData<IIC_VMULi32D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [4, 1, 1]>,
- //
- // Quad-register Integer Multiply (.32)
- InstrItinData<IIC_VMULi32Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [4, 1, 1]>,
- //
- // Double-register Integer Multiply-Accumulate (.8, .16)
- InstrItinData<IIC_VMACi16D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [4, 1, 1, 1]>,
- //
- // Double-register Integer Multiply-Accumulate (.32)
- InstrItinData<IIC_VMACi32D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [4, 1, 1, 1]>,
- //
- // Quad-register Integer Multiply-Accumulate (.8, .16)
- InstrItinData<IIC_VMACi16Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [4, 1, 1, 1]>,
- //
- // Quad-register Integer Multiply-Accumulate (.32)
- InstrItinData<IIC_VMACi32Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [4, 1, 1, 1]>,
-
- //
- // Move
- InstrItinData<IIC_VMOV, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [2, 1]>,
- //
- // Move Immediate
- InstrItinData<IIC_VMOVImm, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [2]>,
- //
- // Double-register Permute Move
- InstrItinData<IIC_VMOVD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [2, 1]>,
- //
- // Quad-register Permute Move
- InstrItinData<IIC_VMOVQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [2, 1]>,
- //
- // Integer to Single-precision Move
- InstrItinData<IIC_VMOVIS , [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_LS], 4>,
- InstrStage<1, [SW_ALU0]>],
- [6, 1]>,
- //
- // Integer to Double-precision Move
- InstrItinData<IIC_VMOVID , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_LS]>],
- [4, 1, 1]>,
- //
- // Single-precision to Integer Move
- InstrItinData<IIC_VMOVSI , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_LS]>],
- [3, 1]>,
- //
- // Double-precision to Integer Move
- InstrItinData<IIC_VMOVDI , [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_LS], 3>,
- InstrStage<1, [SW_LS]>],
- [3, 4, 1]>,
- //
- // Integer to Lane Move
- // FIXME: I think this is correct, but it is not clear from the tuning guide.
- InstrItinData<IIC_VMOVISL , [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_LS], 4>,
- InstrStage<1, [SW_ALU0]>],
- [6, 1]>,
-
- //
- // Vector narrow move
- InstrItinData<IIC_VMOVN, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [2, 1]>,
- //
- // Double-register FP Unary
- // FIXME: VRECPE / VRSQRTE has a longer latency than VABS, which is used here,
- // and they issue on a different pipeline.
- InstrItinData<IIC_VUNAD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [2, 1]>,
- //
- // Quad-register FP Unary
- // FIXME: VRECPE / VRSQRTE has a longer latency than VABS, which is used here,
- // and they issue on a different pipeline.
- InstrItinData<IIC_VUNAQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [2, 1]>,
- //
- // Double-register FP Binary
- // FIXME: We're using this itin for many instructions.
- InstrItinData<IIC_VBIND, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [4, 1, 1]>,
-
- //
- // VPADD, etc.
- InstrItinData<IIC_VPBIND, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [4, 1, 1]>,
- //
- // Double-register FP VMUL
- InstrItinData<IIC_VFMULD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [4, 1, 1]>,
- //
- // Quad-register FP Binary
- InstrItinData<IIC_VBINQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU0]>],
- [4, 1, 1]>,
- //
- // Quad-register FP VMUL
- InstrItinData<IIC_VFMULQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [4, 1, 1]>,
- //
- // Double-register FP Multiple-Accumulate
- InstrItinData<IIC_VMACD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [8, 1, 1]>,
- //
- // Quad-register FP Multiple-Accumulate
- InstrItinData<IIC_VMACQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [8, 1, 1]>,
- //
- // Double-register Fused FP Multiple-Accumulate
- InstrItinData<IIC_VFMACD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [8, 1, 1]>,
- //
- // Quad-register FusedF P Multiple-Accumulate
- InstrItinData<IIC_VFMACQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [8, 1, 1]>,
- //
- // Double-register Reciprical Step
- InstrItinData<IIC_VRECSD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [8, 1, 1]>,
- //
- // Quad-register Reciprical Step
- InstrItinData<IIC_VRECSQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [8, 1, 1]>,
- //
- // Double-register Permute
- // FIXME: The latencies are unclear from the documentation.
- InstrItinData<IIC_VPERMD, [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1], 2>,
- InstrStage<1, [SW_ALU1], 2>,
- InstrStage<1, [SW_ALU1]>],
- [3, 4, 3, 4]>,
- //
- // Quad-register Permute
- // FIXME: The latencies are unclear from the documentation.
- InstrItinData<IIC_VPERMQ, [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1], 2>,
- InstrStage<1, [SW_ALU1], 2>,
- InstrStage<1, [SW_ALU1]>],
- [3, 4, 3, 4]>,
- //
- // Quad-register Permute (3 cycle issue on A9)
- InstrItinData<IIC_VPERMQ3, [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1], 2>,
- InstrStage<1, [SW_ALU1], 2>,
- InstrStage<1, [SW_ALU1]>],
- [3, 4, 3, 4]>,
-
- //
- // Double-register VEXT
- InstrItinData<IIC_VEXTD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [2, 1, 1]>,
- //
- // Quad-register VEXT
- InstrItinData<IIC_VEXTQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [2, 1, 1]>,
- //
- // VTB
- InstrItinData<IIC_VTB1, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [2, 1, 1]>,
- InstrItinData<IIC_VTB2, [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_ALU1], 2>,
- InstrStage<1, [SW_ALU1]>],
- [4, 1, 3, 3]>,
- InstrItinData<IIC_VTB3, [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1], 2>,
- InstrStage<1, [SW_ALU1], 2>,
- InstrStage<1, [SW_ALU1]>],
- [6, 1, 3, 5, 5]>,
- InstrItinData<IIC_VTB4, [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1], 2>,
- InstrStage<1, [SW_ALU1], 2>,
- InstrStage<1, [SW_ALU1], 2>,
- InstrStage<1, [SW_ALU1]>],
- [8, 1, 3, 5, 7, 7]>,
- //
- // VTBX
- InstrItinData<IIC_VTBX1, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1]>],
- [2, 1, 1]>,
- InstrItinData<IIC_VTBX2, [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_ALU1], 2>,
- InstrStage<1, [SW_ALU1]>],
- [4, 1, 3, 3]>,
- InstrItinData<IIC_VTBX3, [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1], 2>,
- InstrStage<1, [SW_ALU1], 2>,
- InstrStage<1, [SW_ALU1]>],
- [6, 1, 3, 5, 5]>,
- InstrItinData<IIC_VTBX4, [InstrStage<1, [SW_DIS0], 0>,
- InstrStage<1, [SW_DIS1], 0>,
- InstrStage<1, [SW_DIS2], 0>,
- InstrStage<1, [SW_ALU1], 2>,
- InstrStage<1, [SW_ALU1], 2>,
- InstrStage<1, [SW_ALU1], 2>,
- InstrStage<1, [SW_ALU1]>],
- [8, 1, 3, 5, 7, 7]>
-]>;
-
-// ===---------------------------------------------------------------------===//
-// This following definitions describe the simple machine model which
-// will replace itineraries.
-
// Swift machine model for scheduling and other instruction cost heuristics.
def SwiftModel : SchedMachineModel {
let IssueWidth = 3; // 3 micro-ops are dispatched per cycle.
let MicroOpBufferSize = 45; // Based on NEON renamed registers.
let LoadLatency = 3;
let MispredictPenalty = 14; // A branch direction mispredict.
-
- let Itineraries = SwiftItineraries;
+ let CompleteModel = 0; // FIXME: Remove if all instructions are covered.
}
// Swift predicates.
@@ -1558,6 +521,13 @@ let SchedModel = SwiftModel in {
(instregex "STM(IB|IA|DB|DA)_UPD", "(t2|sys|t)STM(IB|IA|DB|DA)_UPD",
"PUSH", "tPUSH")>;
+ // LDRLIT pseudo instructions, they expand to LDR + PICADD
+ def : InstRW<[SwiftWriteP2ThreeCycle, WriteALU],
+ (instregex "t?LDRLIT_ga_abs", "t?LDRLIT_ga_pcrel")>;
+ // LDRLIT_ga_pcrel_ldr expands to LDR + PICLDR
+ def : InstRW<[SwiftWriteP2ThreeCycle, SwiftWriteP2ThreeCycle],
+ (instregex "LDRLIT_ga_pcrel_ldr")>;
+
// 4.2.26 Branch
def : WriteRes<WriteBr, [SwiftUnitP1]> { let Latency = 0; }
def : WriteRes<WriteBrL, [SwiftUnitP1]> { let Latency = 2; }
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index 6cafbbb9f8eb..6fded9c8ab73 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -160,41 +160,39 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
unsigned VTSize = 4;
unsigned i = 0;
// Emit a maximum of 4 loads in Thumb1 since we have fewer registers
- const unsigned MAX_LOADS_IN_LDM = Subtarget.isThumb1Only() ? 4 : 6;
+ const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
SDValue TFOps[6];
SDValue Loads[6];
uint64_t SrcOff = 0, DstOff = 0;
- // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the
- // same number of stores. The loads and stores will get combined into
- // ldm/stm later on.
- while (EmittedNumMemOps < NumMemOps) {
- for (i = 0;
- i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
- Loads[i] = DAG.getLoad(VT, dl, Chain,
- DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
- DAG.getConstant(SrcOff, dl, MVT::i32)),
- SrcPtrInfo.getWithOffset(SrcOff), isVolatile,
- false, false, 0);
- TFOps[i] = Loads[i].getValue(1);
- SrcOff += VTSize;
- }
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- makeArrayRef(TFOps, i));
+ // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
+ // VLDM/VSTM and make this code emit it when appropriate. This would reduce
+ // pressure on the general purpose registers. However this seems harder to map
+ // onto the register allocator's view of the world.
- for (i = 0;
- i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
- TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
- DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
- DAG.getConstant(DstOff, dl, MVT::i32)),
- DstPtrInfo.getWithOffset(DstOff),
- isVolatile, false, 0);
- DstOff += VTSize;
- }
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- makeArrayRef(TFOps, i));
+ // The number of MEMCPY pseudo-instructions to emit. We use up to
+ // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm
+ // later on. This is a lower bound on the number of MEMCPY operations we must
+ // emit.
+ unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
+
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);
+
+ for (unsigned I = 0; I != NumMEMCPYs; ++I) {
+ // Evenly distribute registers among MEMCPY operations to reduce register
+ // pressure.
+ unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
+ unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;
+
+ Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src,
+ DAG.getConstant(NumRegs, dl, MVT::i32));
+ Src = Dst.getValue(1);
+ Chain = Dst.getValue(2);
+
+ DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize);
+ SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize);
- EmittedNumMemOps += i;
+ EmittedNumMemOps = NextEmittedNumMemOps;
}
if (BytesLeft == 0)
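(Illustrative only.) The new lowering above replaces the explicit load/store loops with ARMISD::MEMCPY pseudo nodes. The standalone C++ sketch below reproduces just the arithmetic from the hunk -- the ceiling division that picks NumMEMCPYs and the NumMemOps * (I + 1) / NumMEMCPYs formula that spreads the word copies evenly -- using made-up input values; it does not touch any LLVM APIs.

// memcpy_split.cpp - mirrors the splitting arithmetic only, not the DAG
// construction in EmitTargetCodeForMemcpy.
#include <cstdio>

int main() {
  const unsigned MaxLoadsInLDM = 6; // non-Thumb1 limit from the patch
  const unsigned NumMemOps = 13;    // example: 13 word-sized copies (assumed)

  // Ceiling division: the minimum number of MEMCPY pseudos when each one may
  // use at most MaxLoadsInLDM registers.
  unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;

  unsigned EmittedNumMemOps = 0;
  for (unsigned I = 0; I != NumMEMCPYs; ++I) {
    // Evenly distribute the ops instead of filling each pseudo to the
    // maximum: 13 ops over 3 pseudos becomes 4+4+5 rather than 6+6+1.
    unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
    unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;
    std::printf("MEMCPY %u copies %u words\n", I, NumRegs);
    EmittedNumMemOps = NextEmittedNumMemOps;
  }
  return 0;
}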
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 002c3e9b6291..bb6ae28065bd 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -26,6 +26,7 @@
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOptions.h"
@@ -40,37 +41,9 @@ using namespace llvm;
#include "ARMGenSubtargetInfo.inc"
static cl::opt<bool>
-ReserveR9("arm-reserve-r9", cl::Hidden,
- cl::desc("Reserve R9, making it unavailable as GPR"));
-
-static cl::opt<bool>
-ArmUseMOVT("arm-use-movt", cl::init(true), cl::Hidden);
-
-static cl::opt<bool>
UseFusedMulOps("arm-use-mulops",
cl::init(true), cl::Hidden);
-namespace {
-enum AlignMode {
- DefaultAlign,
- StrictAlign,
- NoStrictAlign
-};
-}
-
-static cl::opt<AlignMode>
-Align(cl::desc("Load/store alignment support"),
- cl::Hidden, cl::init(DefaultAlign),
- cl::values(
- clEnumValN(DefaultAlign, "arm-default-align",
- "Generate unaligned accesses only on hardware/OS "
- "combinations that are known to support them"),
- clEnumValN(StrictAlign, "arm-strict-align",
- "Disallow all unaligned memory accesses"),
- clEnumValN(NoStrictAlign, "arm-no-strict-align",
- "Allow unaligned memory accesses"),
- clEnumValEnd));
-
enum ITMode {
DefaultIT,
RestrictedIT,
@@ -88,6 +61,12 @@ IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT),
"Allow IT blocks based on ARMv7"),
clEnumValEnd));
+/// ForceFastISel - Use the fast-isel, even for subtargets where it is not
+/// currently supported (for testing only).
+static cl::opt<bool>
+ForceFastISel("arm-force-fast-isel",
+ cl::init(false), cl::Hidden);
+
/// initializeSubtargetDependencies - Initializes using a CPU and feature string
/// so that we can use initializer lists for subtarget initialization.
ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU,
@@ -110,8 +89,8 @@ ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const ARMBaseTargetMachine &TM, bool IsLittle)
: ARMGenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others),
- ARMProcClass(None), stackAlignment(4), CPUString(CPU), IsLittle(IsLittle),
- TargetTriple(TT), Options(TM.Options), TM(TM),
+ ARMProcClass(None), ARMArch(ARMv4t), stackAlignment(4), CPUString(CPU),
+ IsLittle(IsLittle), TargetTriple(TT), Options(TM.Options), TM(TM),
FrameLowering(initializeFrameLowering(CPU, FS)),
// At this point initializeSubtargetDependencies has been called so
// we can query directly.
@@ -133,6 +112,7 @@ void ARMSubtarget::initializeEnvironment() {
HasV7Ops = false;
HasV8Ops = false;
HasV8_1aOps = false;
+ HasV8_2aOps = false;
HasVFPv2 = false;
HasVFPv3 = false;
HasVFPv4 = false;
@@ -147,10 +127,11 @@ void ARMSubtarget::initializeEnvironment() {
UseSoftFloat = false;
HasThumb2 = false;
NoARM = false;
- IsR9Reserved = ReserveR9;
- UseMovt = false;
+ ReserveR9 = false;
+ NoMovt = false;
SupportsTailCall = false;
HasFP16 = false;
+ HasFullFP16 = false;
HasD16 = false;
HasHardwareDivide = false;
HasHardwareDivideInARM = false;
@@ -168,20 +149,36 @@ void ARMSubtarget::initializeEnvironment() {
HasCrypto = false;
HasCRC = false;
HasZeroCycleZeroing = false;
- AllowsUnalignedMem = false;
- Thumb2DSP = false;
+ StrictAlign = false;
+ HasDSP = false;
UseNaClTrap = false;
GenLongCalls = false;
UnsafeFPMath = false;
+
+ // MCAsmInfo isn't always present (e.g. in opt) so we can't initialize this
+ // directly from it, but we can try to make sure they're consistent when both
+ // are available.
+ UseSjLjEH = isTargetDarwin() && !isTargetWatchOS();
+ assert((!TM.getMCAsmInfo() ||
+ (TM.getMCAsmInfo()->getExceptionHandlingType() ==
+ ExceptionHandling::SjLj) == UseSjLjEH) &&
+ "inconsistent sjlj choice between CodeGen and MC");
}
void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
if (CPUString.empty()) {
- if (isTargetDarwin() && TargetTriple.getArchName().endswith("v7s"))
- // Default to the Swift CPU when targeting armv7s/thumbv7s.
- CPUString = "swift";
- else
- CPUString = "generic";
+ CPUString = "generic";
+
+ if (isTargetDarwin()) {
+ StringRef ArchName = TargetTriple.getArchName();
+ if (ArchName.endswith("v7s"))
+ // Default to the Swift CPU when targeting armv7s/thumbv7s.
+ CPUString = "swift";
+ else if (ArchName.endswith("v7k"))
+ // Default to the Cortex-a7 CPU when targeting armv7k/thumbv7k.
+ // ARMv7k does not use SjLj exception handling.
+ CPUString = "cortex-a7";
+ }
}
// Insert the architecture feature derived from the target triple into the
@@ -212,44 +209,31 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
if (isAAPCS_ABI())
stackAlignment = 8;
- if (isTargetNaCl())
+ if (isTargetNaCl() || isAAPCS16_ABI())
stackAlignment = 16;
- UseMovt = hasV6T2Ops() && ArmUseMOVT;
-
- if (isTargetMachO()) {
- IsR9Reserved = ReserveR9 || !HasV6Ops;
- SupportsTailCall = !isTargetIOS() || !getTargetTriple().isOSVersionLT(5, 0);
- } else {
- IsR9Reserved = ReserveR9;
- SupportsTailCall = !isThumb1Only();
- }
-
- if (Align == DefaultAlign) {
- // Assume pre-ARMv6 doesn't support unaligned accesses.
- //
- // ARMv6 may or may not support unaligned accesses depending on the
- // SCTLR.U bit, which is architecture-specific. We assume ARMv6
- // Darwin and NetBSD targets support unaligned accesses, and others don't.
- //
- // ARMv7 always has SCTLR.U set to 1, but it has a new SCTLR.A bit
- // which raises an alignment fault on unaligned accesses. Linux
- // defaults this bit to 0 and handles it as a system-wide (not
- // per-process) setting. It is therefore safe to assume that ARMv7+
- // Linux targets support unaligned accesses. The same goes for NaCl.
- //
- // The above behavior is consistent with GCC.
- AllowsUnalignedMem =
- (hasV7Ops() && (isTargetLinux() || isTargetNaCl() ||
- isTargetNetBSD())) ||
- (hasV6Ops() && (isTargetMachO() || isTargetNetBSD()));
- } else {
- AllowsUnalignedMem = !(Align == StrictAlign);
- }
-
- // No v6M core supports unaligned memory access (v6M ARM ARM A3.2)
- if (isV6M())
- AllowsUnalignedMem = false;
+ // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo::
+ // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as
+ // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation
+ // support in the assembler and linker to be used. This would need to be
+ // fixed to fully support tail calls in Thumb1.
+ //
+ // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take
+ // LR. This means if we need to reload LR, it takes an extra instruction,
+ // which outweighs the value of the tail call; but here we don't know yet
+ // whether LR is going to be used. Probably the right approach is to
+ // generate the tail call here and turn it back into CALL/RET in
+ // emitEpilogue if LR is used.
+
+ // Thumb1 PIC calls to external symbols use BX, so they can be tail calls,
+ // but we need to make sure there are enough registers; the only valid
+ // registers are the 4 used for parameters. We don't currently do this
+ // case.
+
+ SupportsTailCall = !isThumb1Only();
+
+ if (isTargetMachO() && isTargetIOS() && getTargetTriple().isOSVersionLT(5, 0))
+ SupportsTailCall = false;
switch (IT) {
case DefaultIT:
@@ -276,9 +260,15 @@ bool ARMSubtarget::isAPCS_ABI() const {
}
bool ARMSubtarget::isAAPCS_ABI() const {
assert(TM.TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN);
- return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS;
+ return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS ||
+ TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16;
+}
+bool ARMSubtarget::isAAPCS16_ABI() const {
+ assert(TM.TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN);
+ return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16;
}
+
/// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol.
bool
ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV,
@@ -321,11 +311,23 @@ unsigned ARMSubtarget::getMispredictionPenalty() const {
}
bool ARMSubtarget::hasSinCos() const {
- return getTargetTriple().isiOS() && !getTargetTriple().isOSVersionLT(7, 0);
+ return isTargetWatchOS() ||
+ (isTargetIOS() && !getTargetTriple().isOSVersionLT(7, 0));
+}
+
+bool ARMSubtarget::enableMachineScheduler() const {
+ // Enable the MachineScheduler before register allocation for out-of-order
+ // architectures where we do not use the PostRA scheduler anymore (for now
+ // restricted to swift).
+ return getSchedModel().isOutOfOrder() && isSwift();
}
// This overrides the PostRAScheduler bit in the SchedModel for any CPU.
bool ARMSubtarget::enablePostRAScheduler() const {
+ // No need for PostRA scheduling on out of order CPUs (for now restricted to
+ // swift).
+ if (getSchedModel().isOutOfOrder() && isSwift())
+ return false;
return (!isThumb() || hasThumb2());
}
@@ -333,15 +335,30 @@ bool ARMSubtarget::enableAtomicExpand() const {
return hasAnyDataBarrier() && !isThumb1Only();
}
+bool ARMSubtarget::useStride4VFPs(const MachineFunction &MF) const {
+ // For general targets, the prologue can grow when VFPs are allocated with
+ // stride 4 (more vpush instructions). But WatchOS uses a compact unwind
+ // format, which is more important to get right.
+ return isTargetWatchOS() || (isSwift() && !MF.getFunction()->optForMinSize());
+}
+
bool ARMSubtarget::useMovt(const MachineFunction &MF) const {
// NOTE Windows on ARM needs to use mov.w/mov.t pairs to materialise 32-bit
// immediates as it is inherently position independent, and may be out of
// range otherwise.
- return UseMovt && (isTargetWindows() ||
- !MF.getFunction()->hasFnAttribute(Attribute::MinSize));
+ return !NoMovt && hasV6T2Ops() &&
+ (isTargetWindows() || !MF.getFunction()->optForMinSize());
}
bool ARMSubtarget::useFastISel() const {
+ // Enable fast-isel for any target, for testing only.
+ if (ForceFastISel)
+ return true;
+
+ // Limit fast-isel to the targets that are or have been tested.
+ if (!hasV6Ops())
+ return false;
+
// Thumb2 support on iOS; ARM support on iOS, Linux and NaCl.
return TM.Options.EnableFastISel &&
((isTargetMachO() && !isThumb1Only()) ||
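(Illustrative only.) The ARMSubtarget.cpp hunk above changes the default CPU selection so that, on Darwin, armv7s/thumbv7s triples default to "swift" and armv7k/thumbv7k to "cortex-a7", with "generic" everywhere else. The sketch below restates that decision in plain C++; defaultCPU and its endsWith helper are stand-ins, not LLVM's StringRef/Triple API.

// default_cpu.cpp - stand-alone restatement of the CPU defaulting logic.
#include <cstdio>
#include <string>

// Hypothetical helper mirroring the isTargetDarwin()/getArchName() checks.
std::string defaultCPU(bool IsDarwin, const std::string &ArchName) {
  auto endsWith = [&](const char *S) {
    std::string Suf(S);
    return ArchName.size() >= Suf.size() &&
           ArchName.compare(ArchName.size() - Suf.size(), Suf.size(), Suf) == 0;
  };
  std::string CPU = "generic";
  if (IsDarwin) {
    if (endsWith("v7s"))
      CPU = "swift";     // armv7s/thumbv7s default to the Swift CPU
    else if (endsWith("v7k"))
      CPU = "cortex-a7"; // armv7k/thumbv7k (watchOS) default to Cortex-A7
  }
  return CPU;
}

int main() {
  std::printf("%s\n", defaultCPU(true, "thumbv7k").c_str()); // cortex-a7
  std::printf("%s\n", defaultCPU(true, "armv7s").c_str());   // swift
  std::printf("%s\n", defaultCPU(false, "armv7a").c_str());  // generic
}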
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index dd101df9b63d..a8b28018f1b2 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -43,11 +43,17 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
protected:
enum ARMProcFamilyEnum {
Others, CortexA5, CortexA7, CortexA8, CortexA9, CortexA12, CortexA15,
- CortexA17, CortexR4, CortexR4F, CortexR5, Swift, CortexA53, CortexA57, Krait,
+ CortexA17, CortexR4, CortexR4F, CortexR5, CortexR7, CortexA35, CortexA53,
+ CortexA57, CortexA72, Krait, Swift
};
enum ARMProcClassEnum {
None, AClass, RClass, MClass
};
+ enum ARMArchEnum {
+ ARMv2, ARMv2a, ARMv3, ARMv3m, ARMv4, ARMv4t, ARMv5, ARMv5t, ARMv5te,
+ ARMv5tej, ARMv6, ARMv6k, ARMv6kz, ARMv6t2, ARMv6m, ARMv6sm, ARMv7a, ARMv7r,
+ ARMv7m, ARMv7em, ARMv8a, ARMv81a, ARMv82a
+ };
/// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others.
ARMProcFamilyEnum ARMProcFamily;
@@ -55,6 +61,9 @@ protected:
/// ARMProcClass - ARM processor class: None, AClass, RClass or MClass.
ARMProcClassEnum ARMProcClass;
+ /// ARMArch - ARM architecture
+ ARMArchEnum ARMArch;
+
/// HasV4TOps, HasV5TOps, HasV5TEOps,
/// HasV6Ops, HasV6MOps, HasV6KOps, HasV6T2Ops, HasV7Ops, HasV8Ops -
/// Specify whether target support specific ARM ISA variants.
@@ -68,6 +77,7 @@ protected:
bool HasV7Ops;
bool HasV8Ops;
bool HasV8_1aOps;
+ bool HasV8_2aOps;
/// HasVFPv2, HasVFPv3, HasVFPv4, HasFPARMv8, HasNEON - Specify what
/// floating point ISAs are supported.
@@ -109,22 +119,24 @@ protected:
/// NoARM - True if subtarget does not support ARM mode execution.
bool NoARM;
- /// IsR9Reserved - True if R9 is a not available as general purpose register.
- bool IsR9Reserved;
+ /// ReserveR9 - True if R9 is not available as a general purpose register.
+ bool ReserveR9;
- /// UseMovt - True if MOVT / MOVW pairs are used for materialization of 32-bit
- /// imms (including global addresses).
- bool UseMovt;
+ /// NoMovt - True if MOVT / MOVW pairs are not used for materialization of
+ /// 32-bit imms (including global addresses).
+ bool NoMovt;
/// SupportsTailCall - True if the OS supports tail call. The dynamic linker
/// must be able to synthesize call stubs for interworking between ARM and
/// Thumb.
bool SupportsTailCall;
- /// HasFP16 - True if subtarget supports half-precision FP (We support VFP+HF
- /// only so far)
+ /// HasFP16 - True if subtarget supports half-precision FP conversions
bool HasFP16;
+ /// HasFullFP16 - True if subtarget supports half-precision FP operations
+ bool HasFullFP16;
+
/// HasD16 - True if subtarget is limited to 16 double precision
/// FP registers for VFPv3.
bool HasD16;
@@ -190,18 +202,18 @@ protected:
/// particularly effective at zeroing a VFP register.
bool HasZeroCycleZeroing;
- /// AllowsUnalignedMem - If true, the subtarget allows unaligned memory
+ /// StrictAlign - If true, the subtarget disallows unaligned memory
/// accesses for some types. For details, see
/// ARMTargetLowering::allowsMisalignedMemoryAccesses().
- bool AllowsUnalignedMem;
+ bool StrictAlign;
/// RestrictIT - If true, the subtarget disallows generation of deprecated IT
/// blocks to conform to ARMv8 rule.
bool RestrictIT;
- /// Thumb2DSP - If true, the subtarget supports the v7 DSP (saturating arith
- /// and such) instructions in Thumb2 code.
- bool Thumb2DSP;
+ /// HasDSP - If true, the subtarget supports the DSP (saturating arith
+ /// and such) instructions.
+ bool HasDSP;
/// NaCl TRAP instruction is generated instead of the regular TRAP.
bool UseNaClTrap;
@@ -212,6 +224,9 @@ protected:
/// Target machine allowed unsafe FP math (such as use of NEON fp)
bool UnsafeFPMath;
+ /// UseSjLjEH - If true, the target uses SjLj exception handling (e.g. iOS).
+ bool UseSjLjEH;
+
/// stackAlignment - The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
unsigned stackAlignment;
@@ -297,6 +312,7 @@ public:
bool hasV7Ops() const { return HasV7Ops; }
bool hasV8Ops() const { return HasV8Ops; }
bool hasV8_1aOps() const { return HasV8_1aOps; }
+ bool hasV8_2aOps() const { return HasV8_2aOps; }
bool isCortexA5() const { return ARMProcFamily == CortexA5; }
bool isCortexA7() const { return ARMProcFamily == CortexA7; }
@@ -343,17 +359,20 @@ public:
bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; }
bool hasRAS() const { return HasRAS; }
bool hasMPExtension() const { return HasMPExtension; }
- bool hasThumb2DSP() const { return Thumb2DSP; }
+ bool hasDSP() const { return HasDSP; }
bool useNaClTrap() const { return UseNaClTrap; }
+ bool useSjLjEH() const { return UseSjLjEH; }
bool genLongCalls() const { return GenLongCalls; }
bool hasFP16() const { return HasFP16; }
bool hasD16() const { return HasD16; }
+ bool hasFullFP16() const { return HasFullFP16; }
const Triple &getTargetTriple() const { return TargetTriple; }
bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
bool isTargetIOS() const { return TargetTriple.isiOS(); }
+ bool isTargetWatchOS() const { return TargetTriple.isWatchOS(); }
bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
bool isTargetNetBSD() const { return TargetTriple.isOSNetBSD(); }
@@ -375,6 +394,11 @@ public:
TargetTriple.getEnvironment() == Triple::EABIHF) &&
!isTargetDarwin() && !isTargetWindows();
}
+ bool isTargetGNUAEABI() const {
+ return (TargetTriple.getEnvironment() == Triple::GNUEABI ||
+ TargetTriple.getEnvironment() == Triple::GNUEABIHF) &&
+ !isTargetDarwin() && !isTargetWindows();
+ }
// ARM Targets that support EHABI exception handling standard
// Darwin uses SjLj. Other targets might need more checks.
@@ -383,7 +407,7 @@ public:
TargetTriple.getEnvironment() == Triple::GNUEABI ||
TargetTriple.getEnvironment() == Triple::EABIHF ||
TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
- TargetTriple.getEnvironment() == Triple::Android) &&
+ isTargetAndroid()) &&
!isTargetDarwin() && !isTargetWindows();
}
@@ -391,14 +415,13 @@ public:
// FIXME: this is invalid for WindowsCE
return TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
TargetTriple.getEnvironment() == Triple::EABIHF ||
- isTargetWindows();
- }
- bool isTargetAndroid() const {
- return TargetTriple.getEnvironment() == Triple::Android;
+ isTargetWindows() || isAAPCS16_ABI();
}
+ bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
bool isAPCS_ABI() const;
bool isAAPCS_ABI() const;
+ bool isAAPCS16_ABI() const;
bool useSoftFloat() const { return UseSoftFloat; }
bool isThumb() const { return InThumbMode; }
@@ -409,17 +432,17 @@ public:
bool isRClass() const { return ARMProcClass == RClass; }
bool isAClass() const { return ARMProcClass == AClass; }
- bool isV6M() const {
- return isThumb1Only() && isMClass();
+ bool isR9Reserved() const {
+ return isTargetMachO() ? (ReserveR9 || !HasV6Ops) : ReserveR9;
}
- bool isR9Reserved() const { return IsR9Reserved; }
+ bool useStride4VFPs(const MachineFunction &MF) const;
bool useMovt(const MachineFunction &MF) const;
bool supportsTailCall() const { return SupportsTailCall; }
- bool allowsUnalignedMem() const { return AllowsUnalignedMem; }
+ bool allowsUnalignedMem() const { return !StrictAlign; }
bool restrictIT() const { return RestrictIT; }
@@ -433,6 +456,9 @@ public:
/// compiler runtime or math libraries.
bool hasSinCos() const;
+ /// Returns true if machine scheduler should be enabled.
+ bool enableMachineScheduler() const override;
+
/// True for some subtargets at > -O0.
bool enablePostRAScheduler() const override;
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 93495d66ae70..fca1901dc57c 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -66,7 +66,9 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
static ARMBaseTargetMachine::ARMABI
computeTargetABI(const Triple &TT, StringRef CPU,
const TargetOptions &Options) {
- if (Options.MCOptions.getABIName().startswith("aapcs"))
+ if (Options.MCOptions.getABIName() == "aapcs16")
+ return ARMBaseTargetMachine::ARM_ABI_AAPCS16;
+ else if (Options.MCOptions.getABIName().startswith("aapcs"))
return ARMBaseTargetMachine::ARM_ABI_AAPCS;
else if (Options.MCOptions.getABIName().startswith("apcs"))
return ARMBaseTargetMachine::ARM_ABI_APCS;
@@ -83,6 +85,8 @@ computeTargetABI(const Triple &TT, StringRef CPU,
(TT.getOS() == llvm::Triple::UnknownOS && TT.isOSBinFormatMachO()) ||
CPU.startswith("cortex-m")) {
TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
+ } else if (TT.isWatchOS()) {
+ TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS16;
} else {
TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS;
}
@@ -106,7 +110,7 @@ computeTargetABI(const Triple &TT, StringRef CPU,
if (TT.isOSNetBSD())
TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS;
else
- TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
+ TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
break;
}
}
@@ -145,7 +149,7 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU,
// to 64. We always try to give them natural alignment.
if (ABI == ARMBaseTargetMachine::ARM_ABI_APCS)
Ret += "-v64:32:64-v128:32:128";
- else
+ else if (ABI != ARMBaseTargetMachine::ARM_ABI_AAPCS16)
Ret += "-v128:64:128";
// Try to align aggregates to 32 bits (the default is 64 bits, which has no
@@ -157,7 +161,7 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU,
// The stack is 128 bit aligned on NaCl, 64 bit aligned on AAPCS and 32 bit
// aligned everywhere else.
- if (TT.isOSNaCl())
+ if (TT.isOSNaCl() || ABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16)
Ret += "-S128";
else if (ABI == ARMBaseTargetMachine::ARM_ABI_AAPCS)
Ret += "-S64";
@@ -184,6 +188,15 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
if (Options.FloatABIType == FloatABI::Default)
this->Options.FloatABIType =
Subtarget.isTargetHardFloat() ? FloatABI::Hard : FloatABI::Soft;
+
+ // Default to triple-appropriate EABI
+ if (Options.EABIVersion == EABI::Default ||
+ Options.EABIVersion == EABI::Unknown) {
+ if (Subtarget.isTargetGNUAEABI())
+ this->Options.EABIVersion = EABI::GNU;
+ else
+ this->Options.EABIVersion = EABI::EABI5;
+ }
}
ARMBaseTargetMachine::~ARMBaseTargetMachine() {}
@@ -225,12 +238,12 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const {
}
TargetIRAnalysis ARMBaseTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis(
- [this](Function &F) { return TargetTransformInfo(ARMTTIImpl(this, F)); });
+ return TargetIRAnalysis([this](const Function &F) {
+ return TargetTransformInfo(ARMTTIImpl(this, F));
+ });
}
-
-void ARMTargetMachine::anchor() { }
+void ARMTargetMachine::anchor() {}
ARMTargetMachine::ARMTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
@@ -244,7 +257,7 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, const Triple &TT,
"support ARM mode execution!");
}
-void ARMLETargetMachine::anchor() { }
+void ARMLETargetMachine::anchor() {}
ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
@@ -253,7 +266,7 @@ ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OL)
: ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
-void ARMBETargetMachine::anchor() { }
+void ARMBETargetMachine::anchor() {}
ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
@@ -262,7 +275,7 @@ ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OL)
: ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
-void ThumbTargetMachine::anchor() { }
+void ThumbTargetMachine::anchor() {}
ThumbTargetMachine::ThumbTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
@@ -273,7 +286,7 @@ ThumbTargetMachine::ThumbTargetMachine(const Target &T, const Triple &TT,
initAsmInfo();
}
-void ThumbLETargetMachine::anchor() { }
+void ThumbLETargetMachine::anchor() {}
ThumbLETargetMachine::ThumbLETargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
@@ -282,7 +295,7 @@ ThumbLETargetMachine::ThumbLETargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OL)
: ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
-void ThumbBETargetMachine::anchor() { }
+void ThumbBETargetMachine::anchor() {}
ThumbBETargetMachine::ThumbBETargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
@@ -348,7 +361,13 @@ bool ARMPassConfig::addPreISel() {
// tricky when doing code gen per function.
bool OnlyOptimizeForSize = (TM->getOptLevel() < CodeGenOpt::Aggressive) &&
(EnableGlobalMerge == cl::BOU_UNSET);
- addPass(createGlobalMergePass(TM, 127, OnlyOptimizeForSize));
+ // Merging of extern globals is enabled by default on non-Mach-O as we
+ // expect it to be generally either beneficial or harmless. On Mach-O it
+ // is disabled as we emit the .subsections_via_symbols directive which
+ // means that merging extern globals is not safe.
+ bool MergeExternalByDefault = !TM->getTargetTriple().isOSBinFormatMachO();
+ addPass(createGlobalMergePass(TM, 127, OnlyOptimizeForSize,
+ MergeExternalByDefault));
}
return false;
@@ -356,9 +375,6 @@ bool ARMPassConfig::addPreISel() {
bool ARMPassConfig::addInstSelector() {
addPass(createARMISelDag(getARMTargetMachine(), getOptLevel()));
-
- if (TM->getTargetTriple().isOSBinFormatELF() && TM->Options.EnableFastISel)
- addPass(createARMGlobalBaseRegPass());
return false;
}
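(Illustrative only.) With the AAPCS16 ABI added above, computeDataLayout emits different vector- and stack-alignment suffixes. The sketch below keeps only that string selection; the enum and the final "-S32" branch are inferred from the surrounding comments rather than shown in the hunk, so treat them as assumptions.

// datalayout_suffix.cpp - just the suffix choice from computeDataLayout.
#include <cstdio>
#include <string>

enum ARMABI { ARM_ABI_APCS, ARM_ABI_AAPCS, ARM_ABI_AAPCS16 };

std::string vectorAndStackSuffix(ARMABI ABI, bool IsNaCl) {
  std::string Ret;
  // APCS keeps 32-bit vector alignment; AAPCS16 drops the 128-bit vector
  // alignment override entirely.
  if (ABI == ARM_ABI_APCS)
    Ret += "-v64:32:64-v128:32:128";
  else if (ABI != ARM_ABI_AAPCS16)
    Ret += "-v128:64:128";
  // Stack: 128-bit aligned on NaCl and AAPCS16, 64-bit on AAPCS, 32-bit
  // everywhere else (per the comment in the hunk).
  if (IsNaCl || ABI == ARM_ABI_AAPCS16)
    Ret += "-S128";
  else if (ABI == ARM_ABI_AAPCS)
    Ret += "-S64";
  else
    Ret += "-S32";
  return Ret;
}

int main() {
  std::printf("%s\n", vectorAndStackSuffix(ARM_ABI_AAPCS16, false).c_str());
  std::printf("%s\n", vectorAndStackSuffix(ARM_ABI_AAPCS, false).c_str());
  std::printf("%s\n", vectorAndStackSuffix(ARM_ABI_APCS, true).c_str());
}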
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
index 8c98e082ce9a..8ad1f3dc2c34 100644
--- a/lib/Target/ARM/ARMTargetMachine.h
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -26,7 +26,8 @@ public:
enum ARMABI {
ARM_ABI_UNKNOWN,
ARM_ABI_APCS,
- ARM_ABI_AAPCS // ARM EABI
+ ARM_ABI_AAPCS, // ARM EABI
+ ARM_ABI_AAPCS16
} TargetABI;
protected:
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 2f194cf7ae06..c1520119ef21 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -15,7 +15,7 @@ using namespace llvm;
#define DEBUG_TYPE "armtti"
-unsigned ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
assert(Ty->isIntegerTy());
unsigned Bits = Ty->getPrimitiveSizeInBits();
@@ -47,12 +47,12 @@ unsigned ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
return 3;
}
-unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
// Single to/from double precision conversions.
- static const CostTblEntry<MVT::SimpleValueType> NEONFltDblTbl[] = {
+ static const CostTblEntry NEONFltDblTbl[] = {
// Vector fptrunc/fpext conversions.
{ ISD::FP_ROUND, MVT::v2f64, 2 },
{ ISD::FP_EXTEND, MVT::v2f32, 2 },
@@ -61,10 +61,9 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND ||
ISD == ISD::FP_EXTEND)) {
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
- int Idx = CostTableLookup(NEONFltDblTbl, ISD, LT.second);
- if (Idx != -1)
- return LT.first * NEONFltDblTbl[Idx].Cost;
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+ if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
+ return LT.first * Entry->Cost;
}
EVT SrcTy = TLI->getValueType(DL, Src);
@@ -76,8 +75,7 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
// Some arithmetic, load and store operations have specific instructions
// to cast up/down their types automatically at no extra cost.
// TODO: Get these tables to know at least what the related operations are.
- static const TypeConversionCostTblEntry<MVT::SimpleValueType>
- NEONVectorConversionTbl[] = {
+ static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
{ ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
@@ -153,15 +151,14 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
};
if (SrcTy.isVector() && ST->hasNEON()) {
- int Idx = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
- DstTy.getSimpleVT(), SrcTy.getSimpleVT());
- if (Idx != -1)
- return NEONVectorConversionTbl[Idx].Cost;
+ if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
+ DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost;
}
// Scalar float to integer conversions.
- static const TypeConversionCostTblEntry<MVT::SimpleValueType>
- NEONFloatConversionTbl[] = {
+ static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
{ ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
{ ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
{ ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
@@ -184,15 +181,14 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
{ ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
};
if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
- int Idx = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
- DstTy.getSimpleVT(), SrcTy.getSimpleVT());
- if (Idx != -1)
- return NEONFloatConversionTbl[Idx].Cost;
+ if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
+ DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost;
}
// Scalar integer to float conversions.
- static const TypeConversionCostTblEntry<MVT::SimpleValueType>
- NEONIntegerConversionTbl[] = {
+ static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
{ ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
{ ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
{ ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
@@ -216,15 +212,14 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
};
if (SrcTy.isInteger() && ST->hasNEON()) {
- int Idx = ConvertCostTableLookup(NEONIntegerConversionTbl, ISD,
- DstTy.getSimpleVT(), SrcTy.getSimpleVT());
- if (Idx != -1)
- return NEONIntegerConversionTbl[Idx].Cost;
+ if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
+ ISD, DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost;
}
// Scalar integer conversion costs.
- static const TypeConversionCostTblEntry<MVT::SimpleValueType>
- ARMIntegerConversionTbl[] = {
+ static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
// i16 -> i64 requires two dependent operations.
{ ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
@@ -236,17 +231,17 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
};
if (SrcTy.isInteger()) {
- int Idx = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
- DstTy.getSimpleVT(), SrcTy.getSimpleVT());
- if (Idx != -1)
- return ARMIntegerConversionTbl[Idx].Cost;
+ if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
+ DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost;
}
return BaseT::getCastInstrCost(Opcode, Dst, Src);
}
-unsigned ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
- unsigned Index) {
+int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+ unsigned Index) {
// Penalize inserting into a D-subregister. We end up with a three times
// lower estimated throughput on swift.
if (ST->isSwift() &&
@@ -255,28 +250,30 @@ unsigned ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
ValTy->getScalarSizeInBits() <= 32)
return 3;
- // Cross-class copies are expensive on many microarchitectures,
- // so assume they are expensive by default.
if ((Opcode == Instruction::InsertElement ||
- Opcode == Instruction::ExtractElement) &&
- ValTy->getVectorElementType()->isIntegerTy())
- return 3;
+ Opcode == Instruction::ExtractElement)) {
+ // Cross-class copies are expensive on many microarchitectures,
+ // so assume they are expensive by default.
+ if (ValTy->getVectorElementType()->isIntegerTy())
+ return 3;
+
+ // Even if it's not a cross class copy, this likely leads to mixing
+ // of NEON and VFP code and should be therefore penalized.
+ if (ValTy->isVectorTy() &&
+ ValTy->getScalarSizeInBits() <= 32)
+ return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
+ }
return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
}
-unsigned ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy) {
+int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
// On NEON a vector select gets lowered to vbsl.
if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
// Lowering of some vector selects is currently far from perfect.
- static const TypeConversionCostTblEntry<MVT::SimpleValueType>
- NEONVectorSelectTbl[] = {
- { ISD::SELECT, MVT::v16i1, MVT::v16i16, 2*16 + 1 + 3*1 + 4*1 },
- { ISD::SELECT, MVT::v8i1, MVT::v8i32, 4*8 + 1*3 + 1*4 + 1*2 },
- { ISD::SELECT, MVT::v16i1, MVT::v16i32, 4*16 + 1*6 + 1*8 + 1*4 },
+ static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
{ ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
{ ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
{ ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
@@ -285,21 +282,20 @@ unsigned ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
EVT SelCondTy = TLI->getValueType(DL, CondTy);
EVT SelValTy = TLI->getValueType(DL, ValTy);
if (SelCondTy.isSimple() && SelValTy.isSimple()) {
- int Idx = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
- SelCondTy.getSimpleVT(),
- SelValTy.getSimpleVT());
- if (Idx != -1)
- return NEONVectorSelectTbl[Idx].Cost;
+ if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
+ SelCondTy.getSimpleVT(),
+ SelValTy.getSimpleVT()))
+ return Entry->Cost;
}
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
return LT.first;
}
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}
-unsigned ARMTTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+int ARMTTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
// Address computations in vectorized code with non-consecutive addresses will
// likely result in more instructions compared to scalar code where the
// computation can more often be merged into the index mode. The resulting
@@ -314,7 +310,7 @@ unsigned ARMTTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
return 1;
}
-unsigned ARMTTIImpl::getFPOpCost(Type *Ty) {
+int ARMTTIImpl::getFPOpCost(Type *Ty) {
// Use similar logic that's in ARMISelLowering:
// Any ARM CPU with VFP2 has floating point, but Thumb1 didn't have access
// to VFP.
@@ -333,14 +329,14 @@ unsigned ARMTTIImpl::getFPOpCost(Type *Ty) {
return TargetTransformInfo::TCC_Expensive;
}
-unsigned ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
- Type *SubTp) {
+int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+ Type *SubTp) {
// We only handle costs of reverse and alternate shuffles for now.
if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate)
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
if (Kind == TTI::SK_Reverse) {
- static const CostTblEntry<MVT::SimpleValueType> NEONShuffleTbl[] = {
+ static const CostTblEntry NEONShuffleTbl[] = {
// Reverse shuffle cost one instruction if we are shuffling within a
// double word (vrev) or two if we shuffle a quad word (vrev, vext).
{ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
@@ -353,16 +349,16 @@ unsigned ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
{ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
- int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
- if (Idx == -1)
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+ if (const auto *Entry = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE,
+ LT.second))
+ return LT.first * Entry->Cost;
- return LT.first * NEONShuffleTbl[Idx].Cost;
+ return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}
if (Kind == TTI::SK_Alternate) {
- static const CostTblEntry<MVT::SimpleValueType> NEONAltShuffleTbl[] = {
+ static const CostTblEntry NEONAltShuffleTbl[] = {
// Alt shuffle cost table for ARM. Cost is the number of instructions
// required to create the shuffled vector.
@@ -379,27 +375,26 @@ unsigned ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
- int Idx =
- CostTableLookup(NEONAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
- if (Idx == -1)
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
- return LT.first * NEONAltShuffleTbl[Idx].Cost;
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ if (const auto *Entry = CostTableLookup(NEONAltShuffleTbl,
+ ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
+ return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}
-unsigned ARMTTIImpl::getArithmeticInstrCost(
+int ARMTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo) {
int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
const unsigned FunctionCallDivCost = 20;
const unsigned ReciprocalDivCost = 10;
- static const CostTblEntry<MVT::SimpleValueType> CostTbl[] = {
+ static const CostTblEntry CostTbl[] = {
// Division.
// These costs are somewhat random. Choose a cost of 20 to indicate that
// vectorizing division (added function call) is going to be very expensive.
@@ -440,16 +435,12 @@ unsigned ARMTTIImpl::getArithmeticInstrCost(
// Multiplication.
};
- int Idx = -1;
-
if (ST->hasNEON())
- Idx = CostTableLookup(CostTbl, ISDOpcode, LT.second);
-
- if (Idx != -1)
- return LT.first * CostTbl[Idx].Cost;
+ if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
+ return LT.first * Entry->Cost;
- unsigned Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
- Opd1PropInfo, Opd2PropInfo);
+ int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+ Opd1PropInfo, Opd2PropInfo);
// This is somewhat of a hack. The problem that we are facing is that SROA
// creates a sequence of shift, and, or instructions to construct values.
@@ -465,10 +456,9 @@ unsigned ARMTTIImpl::getArithmeticInstrCost(
return Cost;
}
-unsigned ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
- unsigned Alignment,
- unsigned AddressSpace) {
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace) {
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
if (Src->isVectorTy() && Alignment != 16 &&
Src->getVectorElementType()->isDoubleTy()) {
@@ -479,21 +469,21 @@ unsigned ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
return LT.first;
}
-unsigned ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
- unsigned Factor,
- ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace) {
+int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace) {
assert(Factor >= 2 && "Invalid interleave factor");
assert(isa<VectorType>(VecTy) && "Expect a vector type");
// vldN/vstN doesn't support vector types of i64/f64 element.
- bool EltIs64Bits = DL.getTypeAllocSizeInBits(VecTy->getScalarType()) == 64;
+ bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) {
unsigned NumElts = VecTy->getVectorNumElements();
Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
- unsigned SubVecSize = DL.getTypeAllocSize(SubVecTy);
+ unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
// vldN/vstN only support legal vector types of size 64 or 128 in bits.
if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
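(Illustrative only.) The TTI changes above switch from index-returning CostTableLookup/ConvertCostTableLookup to overloads that return a pointer to the matching entry, or null on a miss, so every caller collapses to "if (const auto *Entry = ...) return LT.first * Entry->Cost;". The toy version below shows that lookup pattern with a made-up entry struct, not LLVM's CostTblEntry.

// costtable_lookup.cpp - toy pointer-returning table lookup.
#include <cstdio>

struct Entry {
  int ISD;   // stand-in for the ISD opcode
  int Type;  // stand-in for the simple value type
  int Cost;
};

// Return the first matching entry or nullptr, letting callers test and
// dereference in one expression.
template <unsigned N>
const Entry *lookup(const Entry (&Tbl)[N], int ISD, int Type) {
  for (const Entry &E : Tbl)
    if (E.ISD == ISD && E.Type == Type)
      return &E;
  return nullptr;
}

int main() {
  static const Entry Tbl[] = {
      {1, 10, 2}, // e.g. FP_ROUND on a 2 x double vector costs 2
      {2, 11, 2}, // e.g. FP_EXTEND on a 2 x float vector costs 2
  };
  int LTFirst = 1; // legalization factor, as in LT.first
  if (const auto *E = lookup(Tbl, 1, 10))
    std::printf("cost = %d\n", LTFirst * E->Cost);
  else
    std::printf("fall back to the base implementation\n");
}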
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h
index 84f256f73722..7d8d2381c983 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -41,7 +41,7 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
const ARMTargetLowering *getTLI() const { return TLI; }
public:
- explicit ARMTTIImpl(const ARMBaseTargetMachine *TM, Function &F)
+ explicit ARMTTIImpl(const ARMBaseTargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
@@ -52,11 +52,13 @@ public:
: BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
TLI(std::move(Arg.TLI)) {}
+ bool enableInterleavedAccessVectorization() { return true; }
+
/// \name Scalar TTI Implementations
/// @{
using BaseT::getIntImmCost;
- unsigned getIntImmCost(const APInt &Imm, Type *Ty);
+ int getIntImmCost(const APInt &Imm, Type *Ty);
/// @}
@@ -92,34 +94,31 @@ public:
return 1;
}
- unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
- Type *SubTp);
+ int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
- unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
- unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
- unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+ int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
- unsigned getAddressComputationCost(Type *Val, bool IsComplex);
+ int getAddressComputationCost(Type *Val, bool IsComplex);
- unsigned getFPOpCost(Type *Ty);
+ int getFPOpCost(Type *Ty);
- unsigned getArithmeticInstrCost(
+ int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::OperandValueKind Op1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Op2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
- unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace);
+ int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace);
- unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
- unsigned Factor,
- ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace);
+ int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+ ArrayRef<unsigned> Indices, unsigned Alignment,
+ unsigned AddressSpace);
/// @}
};
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index cf6b8929f311..c69a741244cf 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -129,7 +129,6 @@ public:
};
class ARMAsmParser : public MCTargetAsmParser {
- MCSubtargetInfo &STI;
const MCInstrInfo &MII;
const MCRegisterInfo *MRI;
UnwindContext UC;
@@ -247,48 +246,49 @@ class ARMAsmParser : public MCTargetAsmParser {
OperandVector &Operands);
bool isThumb() const {
// FIXME: Can tablegen auto-generate this?
- return STI.getFeatureBits()[ARM::ModeThumb];
+ return getSTI().getFeatureBits()[ARM::ModeThumb];
}
bool isThumbOne() const {
- return isThumb() && !STI.getFeatureBits()[ARM::FeatureThumb2];
+ return isThumb() && !getSTI().getFeatureBits()[ARM::FeatureThumb2];
}
bool isThumbTwo() const {
- return isThumb() && STI.getFeatureBits()[ARM::FeatureThumb2];
+ return isThumb() && getSTI().getFeatureBits()[ARM::FeatureThumb2];
}
bool hasThumb() const {
- return STI.getFeatureBits()[ARM::HasV4TOps];
+ return getSTI().getFeatureBits()[ARM::HasV4TOps];
}
bool hasV6Ops() const {
- return STI.getFeatureBits()[ARM::HasV6Ops];
+ return getSTI().getFeatureBits()[ARM::HasV6Ops];
}
bool hasV6MOps() const {
- return STI.getFeatureBits()[ARM::HasV6MOps];
+ return getSTI().getFeatureBits()[ARM::HasV6MOps];
}
bool hasV7Ops() const {
- return STI.getFeatureBits()[ARM::HasV7Ops];
+ return getSTI().getFeatureBits()[ARM::HasV7Ops];
}
bool hasV8Ops() const {
- return STI.getFeatureBits()[ARM::HasV8Ops];
+ return getSTI().getFeatureBits()[ARM::HasV8Ops];
}
bool hasARM() const {
- return !STI.getFeatureBits()[ARM::FeatureNoARM];
+ return !getSTI().getFeatureBits()[ARM::FeatureNoARM];
}
- bool hasThumb2DSP() const {
- return STI.getFeatureBits()[ARM::FeatureDSPThumb2];
+ bool hasDSP() const {
+ return getSTI().getFeatureBits()[ARM::FeatureDSP];
}
bool hasD16() const {
- return STI.getFeatureBits()[ARM::FeatureD16];
+ return getSTI().getFeatureBits()[ARM::FeatureD16];
}
bool hasV8_1aOps() const {
- return STI.getFeatureBits()[ARM::HasV8_1aOps];
+ return getSTI().getFeatureBits()[ARM::HasV8_1aOps];
}
void SwitchMode() {
+ MCSubtargetInfo &STI = copySTI();
uint64_t FB = ComputeAvailableFeatures(STI.ToggleFeature(ARM::ModeThumb));
setAvailableFeatures(FB);
}
bool isMClass() const {
- return STI.getFeatureBits()[ARM::FeatureMClass];
+ return getSTI().getFeatureBits()[ARM::FeatureMClass];
}
/// @name Auto-generated Match Functions
@@ -343,14 +343,15 @@ public:
Match_RequiresNotITBlock,
Match_RequiresV6,
Match_RequiresThumb2,
+ Match_RequiresV8,
#define GET_OPERAND_DIAGNOSTIC_TYPES
#include "ARMGenAsmMatcher.inc"
};
- ARMAsmParser(MCSubtargetInfo &STI, MCAsmParser &Parser,
+ ARMAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
const MCInstrInfo &MII, const MCTargetOptions &Options)
- : STI(STI), MII(MII), UC(Parser) {
+ : MCTargetAsmParser(Options, STI), MII(MII), UC(Parser) {
MCAsmParserExtension::Initialize(Parser);
// Cache the MCRegisterInfo.
@@ -564,87 +565,6 @@ class ARMOperand : public MCParsedAsmOperand {
public:
ARMOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
- ARMOperand(const ARMOperand &o) : MCParsedAsmOperand() {
- Kind = o.Kind;
- StartLoc = o.StartLoc;
- EndLoc = o.EndLoc;
- switch (Kind) {
- case k_CondCode:
- CC = o.CC;
- break;
- case k_ITCondMask:
- ITMask = o.ITMask;
- break;
- case k_Token:
- Tok = o.Tok;
- break;
- case k_CCOut:
- case k_Register:
- Reg = o.Reg;
- break;
- case k_RegisterList:
- case k_DPRRegisterList:
- case k_SPRRegisterList:
- Registers = o.Registers;
- break;
- case k_VectorList:
- case k_VectorListAllLanes:
- case k_VectorListIndexed:
- VectorList = o.VectorList;
- break;
- case k_CoprocNum:
- case k_CoprocReg:
- Cop = o.Cop;
- break;
- case k_CoprocOption:
- CoprocOption = o.CoprocOption;
- break;
- case k_Immediate:
- Imm = o.Imm;
- break;
- case k_MemBarrierOpt:
- MBOpt = o.MBOpt;
- break;
- case k_InstSyncBarrierOpt:
- ISBOpt = o.ISBOpt;
- case k_Memory:
- Memory = o.Memory;
- break;
- case k_PostIndexRegister:
- PostIdxReg = o.PostIdxReg;
- break;
- case k_MSRMask:
- MMask = o.MMask;
- break;
- case k_BankedReg:
- BankedReg = o.BankedReg;
- break;
- case k_ProcIFlags:
- IFlags = o.IFlags;
- break;
- case k_ShifterImmediate:
- ShifterImm = o.ShifterImm;
- break;
- case k_ShiftedRegister:
- RegShiftedReg = o.RegShiftedReg;
- break;
- case k_ShiftedImmediate:
- RegShiftedImm = o.RegShiftedImm;
- break;
- case k_RotateImmediate:
- RotImm = o.RotImm;
- break;
- case k_ModifiedImmediate:
- ModImm = o.ModImm;
- break;
- case k_BitfieldDescriptor:
- Bitfield = o.Bitfield;
- break;
- case k_VectorIndex:
- VectorIndex = o.VectorIndex;
- break;
- }
- }
/// getStartLoc - Get the location of the first token of this operand.
SMLoc getStartLoc() const override { return StartLoc; }
@@ -4054,7 +3974,7 @@ ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) {
if (FlagsVal == ~0U)
return MatchOperand_NoMatch;
- if (!hasThumb2DSP() && (FlagsVal & 0x400))
+ if (!hasDSP() && (FlagsVal & 0x400))
// The _g and _nzcvqg versions are only valid if the DSP extension is
// available.
return MatchOperand_NoMatch;
@@ -5202,6 +5122,7 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
// FALLTHROUGH
}
case AsmToken::Colon: {
+ S = Parser.getTok().getLoc();
// ":lower16:" and ":upper16:" expression prefixes
// FIXME: Check it's an expression prefix,
// e.g. (FOO - :lower16:BAR) isn't legal.
@@ -5220,8 +5141,9 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
return false;
}
case AsmToken::Equal: {
+ S = Parser.getTok().getLoc();
if (Mnemonic != "ldr") // only parse for ldr pseudo (e.g. ldr r0, =val)
- return Error(Parser.getTok().getLoc(), "unexpected token in operand");
+ return Error(S, "unexpected token in operand");
Parser.Lex(); // Eat '='
const MCExpr *SubExprVal;
@@ -5229,7 +5151,8 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
return true;
E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
- const MCExpr *CPLoc = getTargetStreamer().addConstantPoolEntry(SubExprVal);
+ const MCExpr *CPLoc =
+ getTargetStreamer().addConstantPoolEntry(SubExprVal, S);
Operands.push_back(ARMOperand::CreateImm(CPLoc, S, E));
return false;
}
@@ -5682,9 +5605,11 @@ bool ARMAsmParser::shouldOmitPredicateOperand(StringRef Mnemonic,
// VRINT{Z, R, X} have a predicate operand in VFP, but not in NEON
unsigned RegIdx = 3;
if ((Mnemonic == "vrintz" || Mnemonic == "vrintx" || Mnemonic == "vrintr") &&
- static_cast<ARMOperand &>(*Operands[2]).getToken() == ".f32") {
+ (static_cast<ARMOperand &>(*Operands[2]).getToken() == ".f32" ||
+ static_cast<ARMOperand &>(*Operands[2]).getToken() == ".f16")) {
if (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
- static_cast<ARMOperand &>(*Operands[3]).getToken() == ".f32")
+ (static_cast<ARMOperand &>(*Operands[3]).getToken() == ".f32" ||
+ static_cast<ARMOperand &>(*Operands[3]).getToken() == ".f16"))
RegIdx = 4;
if (static_cast<ARMOperand &>(*Operands[RegIdx]).isReg() &&
@@ -8610,18 +8535,29 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
if (isThumbTwo() && Inst.getOperand(OpNo).getReg() == ARM::CPSR &&
inITBlock())
return Match_RequiresNotITBlock;
+ } else if (isThumbOne()) {
+ // Some high-register supporting Thumb1 encodings only allow both registers
+ // to be from r0-r7 when in Thumb2.
+ if (Opc == ARM::tADDhirr && !hasV6MOps() &&
+ isARMLowRegister(Inst.getOperand(1).getReg()) &&
+ isARMLowRegister(Inst.getOperand(2).getReg()))
+ return Match_RequiresThumb2;
+ // Others only require ARMv6 or later.
+ else if (Opc == ARM::tMOVr && !hasV6Ops() &&
+ isARMLowRegister(Inst.getOperand(0).getReg()) &&
+ isARMLowRegister(Inst.getOperand(1).getReg()))
+ return Match_RequiresV6;
}
- // Some high-register supporting Thumb1 encodings only allow both registers
- // to be from r0-r7 when in Thumb2.
- else if (Opc == ARM::tADDhirr && isThumbOne() && !hasV6MOps() &&
- isARMLowRegister(Inst.getOperand(1).getReg()) &&
- isARMLowRegister(Inst.getOperand(2).getReg()))
- return Match_RequiresThumb2;
- // Others only require ARMv6 or later.
- else if (Opc == ARM::tMOVr && isThumbOne() && !hasV6Ops() &&
- isARMLowRegister(Inst.getOperand(0).getReg()) &&
- isARMLowRegister(Inst.getOperand(1).getReg()))
- return Match_RequiresV6;
+
+ for (unsigned I = 0; I < MCID.NumOperands; ++I)
+ if (MCID.OpInfo[I].RegClass == ARM::rGPRRegClassID) {
+ // rGPRRegClass excludes PC, and also excluded SP before ARMv8
+ if ((Inst.getOperand(I).getReg() == ARM::SP) && !hasV8Ops())
+ return Match_RequiresV8;
+ else if (Inst.getOperand(I).getReg() == ARM::PC)
+ return Match_InvalidOperand;
+ }
+
return Match_Success;
}
@@ -8680,7 +8616,7 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return false;
Inst.setLoc(IDLoc);
- Out.EmitInstruction(Inst, STI);
+ Out.EmitInstruction(Inst, getSTI());
return false;
case Match_MissingFeature: {
assert(ErrorInfo && "Unknown missing feature!");
@@ -8720,6 +8656,8 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return Error(IDLoc, "instruction variant requires ARMv6 or later");
case Match_RequiresThumb2:
return Error(IDLoc, "instruction variant requires Thumb2");
+ case Match_RequiresV8:
+ return Error(IDLoc, "instruction variant requires ARMv8 or later");
case Match_ImmRange0_15: {
SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc();
if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
@@ -8868,7 +8806,7 @@ bool ARMAsmParser::parseLiteralValues(unsigned Size, SMLoc L) {
return false;
}
- getParser().getStreamer().EmitValue(Value, Size);
+ getParser().getStreamer().EmitValue(Value, Size, L);
if (getLexer().is(AsmToken::EndOfStatement))
break;
@@ -9098,7 +9036,7 @@ bool ARMAsmParser::parseDirectiveUnreq(SMLoc L) {
bool ARMAsmParser::parseDirectiveArch(SMLoc L) {
StringRef Arch = getParser().parseStringToEndOfStatement().trim();
- unsigned ID = ARMTargetParser::parseArch(Arch);
+ unsigned ID = ARM::parseArch(Arch);
if (ID == ARM::AK_INVALID) {
Error(L, "Unknown arch name");
@@ -9106,7 +9044,8 @@ bool ARMAsmParser::parseDirectiveArch(SMLoc L) {
}
Triple T;
- STI.setDefaultFeatures(T.getARMCPUForArch(Arch));
+ MCSubtargetInfo &STI = copySTI();
+ STI.setDefaultFeatures("", ("+" + ARM::getArchName(ID)).str());
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
getTargetStreamer().emitArch(ID);
@@ -9233,12 +9172,13 @@ bool ARMAsmParser::parseDirectiveCPU(SMLoc L) {
// FIXME: This is using table-gen data, but should be moved to
// ARMTargetParser once that is table-gen'd.
- if (!STI.isCPUStringValid(CPU)) {
+ if (!getSTI().isCPUStringValid(CPU)) {
Error(L, "Unknown CPU name");
return false;
}
- STI.setDefaultFeatures(CPU);
+ MCSubtargetInfo &STI = copySTI();
+ STI.setDefaultFeatures(CPU, "");
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
return false;
@@ -9249,13 +9189,14 @@ bool ARMAsmParser::parseDirectiveFPU(SMLoc L) {
SMLoc FPUNameLoc = getTok().getLoc();
StringRef FPU = getParser().parseStringToEndOfStatement().trim();
- unsigned ID = ARMTargetParser::parseFPU(FPU);
+ unsigned ID = ARM::parseFPU(FPU);
std::vector<const char *> Features;
- if (!ARMTargetParser::getFPUFeatures(ID, Features)) {
+ if (!ARM::getFPUFeatures(ID, Features)) {
Error(FPUNameLoc, "Unknown FPU name");
return false;
}
+ MCSubtargetInfo &STI = copySTI();
for (auto Feature : Features)
STI.ApplyFeatureFlag(Feature);
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
@@ -9895,7 +9836,7 @@ bool ARMAsmParser::parseDirectiveObjectArch(SMLoc L) {
SMLoc ArchLoc = Parser.getTok().getLoc();
getLexer().Lex();
- unsigned ID = ARMTargetParser::parseArch(Arch);
+ unsigned ID = ARM::parseArch(Arch);
if (ID == ARM::AK_INVALID) {
Error(ArchLoc, "unknown architecture '" + Arch + "'");
@@ -9976,22 +9917,22 @@ extern "C" void LLVMInitializeARMAsmParser() {
// when we start to table-generate them, and we can use the ARM
// flags below, that were generated by table-gen.
static const struct {
- const ARM::ArchExtKind Kind;
- const unsigned ArchCheck;
+ const unsigned Kind;
+ const uint64_t ArchCheck;
const FeatureBitset Features;
} Extensions[] = {
{ ARM::AEK_CRC, Feature_HasV8, {ARM::FeatureCRC} },
{ ARM::AEK_CRYPTO, Feature_HasV8,
{ARM::FeatureCrypto, ARM::FeatureNEON, ARM::FeatureFPARMv8} },
{ ARM::AEK_FP, Feature_HasV8, {ARM::FeatureFPARMv8} },
- { ARM::AEK_HWDIV, Feature_HasV7 | Feature_IsNotMClass,
+ { (ARM::AEK_HWDIV | ARM::AEK_HWDIVARM), Feature_HasV7 | Feature_IsNotMClass,
{ARM::FeatureHWDiv, ARM::FeatureHWDivARM} },
{ ARM::AEK_MP, Feature_HasV7 | Feature_IsNotMClass, {ARM::FeatureMP} },
{ ARM::AEK_SIMD, Feature_HasV8, {ARM::FeatureNEON, ARM::FeatureFPARMv8} },
- // FIXME: Also available in ARMv6-K
- { ARM::AEK_SEC, Feature_HasV7, {ARM::FeatureTrustZone} },
+ { ARM::AEK_SEC, Feature_HasV6K, {ARM::FeatureTrustZone} },
// FIXME: Only available in A-class, isel not predicated
{ ARM::AEK_VIRT, Feature_HasV7, {ARM::FeatureVirtualization} },
+ { ARM::AEK_FP16, Feature_HasV8_2a, {ARM::FeatureFPARMv8, ARM::FeatureFullFP16} },
// FIXME: Unsupported extensions.
{ ARM::AEK_OS, Feature_None, {} },
{ ARM::AEK_IWMMXT, Feature_None, {} },
@@ -10020,7 +9961,7 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) {
EnableFeature = false;
Name = Name.substr(2);
}
- unsigned FeatureKind = ARMTargetParser::parseArchExt(Name);
+ unsigned FeatureKind = ARM::parseArchExt(Name);
if (FeatureKind == ARM::AEK_INVALID)
Error(ExtLoc, "unknown architectural extension: " + Name);
@@ -10037,6 +9978,7 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) {
return false;
}
+ MCSubtargetInfo &STI = copySTI();
FeatureBitset ToggleFeatures = EnableFeature
? (~STI.getFeatureBits() & Extension.Features)
: ( STI.getFeatureBits() & Extension.Features);
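
The ToggleFeatures expression above is easy to misread, so here is a small standalone illustration of why it works: when enabling an extension, only the currently-clear extension bits are selected, and when disabling, only the currently-set ones, so XOR-ing the result in flips exactly those bits and leaves unrelated features alone. std::bitset stands in for FeatureBitset and the bit patterns are invented:

    #include <bitset>
    #include <cassert>
    int main() {
      std::bitset<8> Have("00000110"); // features currently enabled
      std::bitset<8> Ext ("00001100"); // bits belonging to the extension
      std::bitset<8> EnableToggle  = ~Have & Ext; // 00001000: only the missing bit
      std::bitset<8> DisableToggle =  Have & Ext; // 00000100: only the present bit
      assert((Have ^ EnableToggle)  == std::bitset<8>("00001110")); // extension fully on
      assert((Have ^ DisableToggle) == std::bitset<8>("00000010")); // extension fully off
      return 0;
    }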
@@ -10078,6 +10020,10 @@ unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
"expression value must be representable in 32 bits");
}
break;
+ case MCK_rGPR:
+ if (hasV8Ops() && Op.isReg() && Op.getReg() == ARM::SP)
+ return Match_Success;
+ break;
case MCK_GPRPair:
if (Op.isReg() &&
MRI->getRegClass(ARM::GPRRegClassID).contains(Op.getReg()))
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 097ec04e7052..e63defed2288 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -59,7 +59,7 @@ namespace {
}
// Called when decoding an IT instruction. Sets the IT state for the following
- // instructions that for the IT block. Firstcond and Mask correspond to the
+ // instructions that form the IT block. Firstcond and Mask correspond to the
// fields in the IT instruction encoding.
void setITState(char Firstcond, char Mask) {
// (3 - the number of trailing zeros) is the number of then / else.
@@ -459,21 +459,18 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// VFP and NEON instructions, similarly, are shared between ARM
// and Thumb modes.
- MI.clear();
Result = decodeInstruction(DecoderTableVFP32, MI, Insn, Address, this, STI);
if (Result != MCDisassembler::Fail) {
Size = 4;
return Result;
}
- MI.clear();
Result = decodeInstruction(DecoderTableVFPV832, MI, Insn, Address, this, STI);
if (Result != MCDisassembler::Fail) {
Size = 4;
return Result;
}
- MI.clear();
Result =
decodeInstruction(DecoderTableNEONData32, MI, Insn, Address, this, STI);
if (Result != MCDisassembler::Fail) {
@@ -485,7 +482,6 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
return Result;
}
- MI.clear();
Result = decodeInstruction(DecoderTableNEONLoadStore32, MI, Insn, Address,
this, STI);
if (Result != MCDisassembler::Fail) {
@@ -497,7 +493,6 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
return Result;
}
- MI.clear();
Result =
decodeInstruction(DecoderTableNEONDup32, MI, Insn, Address, this, STI);
if (Result != MCDisassembler::Fail) {
@@ -509,7 +504,6 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
return Result;
}
- MI.clear();
Result =
decodeInstruction(DecoderTablev8NEON32, MI, Insn, Address, this, STI);
if (Result != MCDisassembler::Fail) {
@@ -517,7 +511,6 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
return Result;
}
- MI.clear();
Result =
decodeInstruction(DecoderTablev8Crypto32, MI, Insn, Address, this, STI);
if (Result != MCDisassembler::Fail) {
@@ -525,7 +518,6 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
return Result;
}
- MI.clear();
Size = 0;
return MCDisassembler::Fail;
}
@@ -718,7 +710,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
return Result;
}
- MI.clear();
Result = decodeInstruction(DecoderTableThumbSBit16, MI, Insn16, Address, this,
STI);
if (Result) {
@@ -729,7 +720,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
return Result;
}
- MI.clear();
Result =
decodeInstruction(DecoderTableThumb216, MI, Insn16, Address, this, STI);
if (Result != MCDisassembler::Fail) {
@@ -763,7 +753,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
uint32_t Insn32 =
(Bytes[3] << 8) | (Bytes[2] << 0) | (Bytes[1] << 24) | (Bytes[0] << 16);
- MI.clear();
Result =
decodeInstruction(DecoderTableThumb32, MI, Insn32, Address, this, STI);
if (Result != MCDisassembler::Fail) {
@@ -774,7 +763,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
return Result;
}
- MI.clear();
Result =
decodeInstruction(DecoderTableThumb232, MI, Insn32, Address, this, STI);
if (Result != MCDisassembler::Fail) {
@@ -784,7 +772,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
if (fieldFromInstruction(Insn32, 28, 4) == 0xE) {
- MI.clear();
Result =
decodeInstruction(DecoderTableVFP32, MI, Insn32, Address, this, STI);
if (Result != MCDisassembler::Fail) {
@@ -794,7 +781,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
}
- MI.clear();
Result =
decodeInstruction(DecoderTableVFPV832, MI, Insn32, Address, this, STI);
if (Result != MCDisassembler::Fail) {
@@ -803,7 +789,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
if (fieldFromInstruction(Insn32, 28, 4) == 0xE) {
- MI.clear();
Result = decodeInstruction(DecoderTableNEONDup32, MI, Insn32, Address, this,
STI);
if (Result != MCDisassembler::Fail) {
@@ -814,7 +799,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
if (fieldFromInstruction(Insn32, 24, 8) == 0xF9) {
- MI.clear();
uint32_t NEONLdStInsn = Insn32;
NEONLdStInsn &= 0xF0FFFFFF;
NEONLdStInsn |= 0x04000000;
@@ -828,7 +812,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
if (fieldFromInstruction(Insn32, 24, 4) == 0xF) {
- MI.clear();
uint32_t NEONDataInsn = Insn32;
NEONDataInsn &= 0xF0FFFFFF; // Clear bits 27-24
NEONDataInsn |= (NEONDataInsn & 0x10000000) >> 4; // Move bit 28 to bit 24
@@ -841,7 +824,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
return Result;
}
- MI.clear();
uint32_t NEONCryptoInsn = Insn32;
NEONCryptoInsn &= 0xF0FFFFFF; // Clear bits 27-24
NEONCryptoInsn |= (NEONCryptoInsn & 0x10000000) >> 4; // Move bit 28 to bit 24
@@ -853,7 +835,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
return Result;
}
- MI.clear();
uint32_t NEONv8Insn = Insn32;
NEONv8Insn &= 0xF3FFFFFF; // Clear bits 27-26
Result = decodeInstruction(DecoderTablev8NEON32, MI, NEONv8Insn, Address,
@@ -864,7 +845,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
}
- MI.clear();
Size = 0;
return MCDisassembler::Fail;
}
@@ -902,7 +882,7 @@ static DecodeStatus
DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
-
+
if (RegNo == 15)
S = MCDisassembler::SoftFail;
@@ -986,8 +966,13 @@ static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
- if (RegNo == 13 || RegNo == 15)
+
+ const FeatureBitset &featureBits =
+ ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
+
+ if ((RegNo == 13 && !featureBits[ARM::HasV8Ops]) || RegNo == 15)
S = MCDisassembler::SoftFail;
+
Check(S, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder));
return S;
}
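
This mirrors the assembler-side change earlier in the patch (MCK_rGPR accepting SP when hasV8Ops() is true): ARMv8 relaxes the SP restriction for these Thumb register slots, so the decoder now soft-fails r13 only on pre-v8 subtargets, while r15 remains a soft-fail everywhere. Reduced to a sketch (FeatureBitset indexing as used above; not self-contained, LLVM types assumed):

    static bool softFailRGPR(unsigned RegNo, const FeatureBitset &Bits) {
      bool HasV8 = Bits[ARM::HasV8Ops];
      return (RegNo == 13 && !HasV8) || RegNo == 15;
    }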
@@ -1147,7 +1132,7 @@ static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Val,
unsigned imm = fieldFromInstruction(Val, 7, 5);
// Register-immediate
- if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder)))
return MCDisassembler::Fail;
ARM_AM::ShiftOpc Shift = ARM_AM::lsl;
@@ -1658,7 +1643,7 @@ DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn,
case ARM::STRD_POST:
if (P == 0 && W == 1)
S = MCDisassembler::SoftFail;
-
+
if (writeback && (Rn == 15 || Rn == Rt || Rn == Rt2))
S = MCDisassembler::SoftFail;
if (type && Rm == 15)
@@ -4131,7 +4116,7 @@ static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val,
// indicates the move for the GE{3:0} bits, the mask{0} bit can be set
// only if the processor includes the DSP extension.
if (Mask == 0 || (Mask != 2 && ValLow > 3) ||
- (!(FeatureBits[ARM::FeatureDSPThumb2]) && (Mask & 1)))
+ (!(FeatureBits[ARM::FeatureDSP]) && (Mask & 1)))
S = MCDisassembler::SoftFail;
}
}
@@ -5065,6 +5050,10 @@ static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn,
static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
+ const FeatureBitset &featureBits =
+ ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits();
+ bool hasFullFP16 = featureBits[ARM::FeatureFullFP16];
+
unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 0);
Vd |= (fieldFromInstruction(Insn, 22, 1) << 4);
unsigned Vm = (fieldFromInstruction(Insn, 0, 4) << 0);
@@ -5075,10 +5064,35 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn,
DecodeStatus S = MCDisassembler::Success;
- // VMOVv2f32 is ambiguous with these decodings.
- if (!(imm & 0x38) && cmode == 0xF) {
- if (op == 1) return MCDisassembler::Fail;
- Inst.setOpcode(ARM::VMOVv2f32);
+ // If the top 3 bits of imm are clear, this is a VMOV (immediate)
+ if (!(imm & 0x38)) {
+ if (cmode == 0xF) {
+ if (op == 1) return MCDisassembler::Fail;
+ Inst.setOpcode(ARM::VMOVv2f32);
+ }
+ if (hasFullFP16) {
+ if (cmode == 0xE) {
+ if (op == 1) {
+ Inst.setOpcode(ARM::VMOVv1i64);
+ } else {
+ Inst.setOpcode(ARM::VMOVv8i8);
+ }
+ }
+ if (cmode == 0xD) {
+ if (op == 1) {
+ Inst.setOpcode(ARM::VMVNv2i32);
+ } else {
+ Inst.setOpcode(ARM::VMOVv2i32);
+ }
+ }
+ if (cmode == 0xC) {
+ if (op == 1) {
+ Inst.setOpcode(ARM::VMVNv2i32);
+ } else {
+ Inst.setOpcode(ARM::VMOVv2i32);
+ }
+ }
+ }
return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder);
}
@@ -5095,6 +5109,10 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn,
static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
+ const FeatureBitset &featureBits =
+ ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits();
+ bool hasFullFP16 = featureBits[ARM::FeatureFullFP16];
+
unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 0);
Vd |= (fieldFromInstruction(Insn, 22, 1) << 4);
unsigned Vm = (fieldFromInstruction(Insn, 0, 4) << 0);
@@ -5105,10 +5123,35 @@ static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn,
DecodeStatus S = MCDisassembler::Success;
- // VMOVv4f32 is ambiguous with these decodings.
- if (!(imm & 0x38) && cmode == 0xF) {
- if (op == 1) return MCDisassembler::Fail;
- Inst.setOpcode(ARM::VMOVv4f32);
+ // If the top 3 bits of imm are clear, this is a VMOV (immediate)
+ if (!(imm & 0x38)) {
+ if (cmode == 0xF) {
+ if (op == 1) return MCDisassembler::Fail;
+ Inst.setOpcode(ARM::VMOVv4f32);
+ }
+ if (hasFullFP16) {
+ if (cmode == 0xE) {
+ if (op == 1) {
+ Inst.setOpcode(ARM::VMOVv2i64);
+ } else {
+ Inst.setOpcode(ARM::VMOVv16i8);
+ }
+ }
+ if (cmode == 0xD) {
+ if (op == 1) {
+ Inst.setOpcode(ARM::VMVNv4i32);
+ } else {
+ Inst.setOpcode(ARM::VMOVv4i32);
+ }
+ }
+ if (cmode == 0xC) {
+ if (op == 1) {
+ Inst.setOpcode(ARM::VMVNv4i32);
+ } else {
+ Inst.setOpcode(ARM::VMOVv4i32);
+ }
+ }
+ }
return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder);
}
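
Both hunks above (the 64-bit and 128-bit variants) implement the same disambiguation: when the top three bits of imm are clear, the encoding is really a NEON modified-immediate VMOV/VMVN rather than a fixed-point VCVT, and full FP16 support makes two further cmode values reachable. A compact restatement of the opcode selection, kept as a comment table because it only summarises the code above:

    // cmode  op   D variant (VCVTD)   Q variant (VCVTQ)
    // 0xF    0    VMOVv2f32           VMOVv4f32
    // 0xF    1    decode fails        decode fails
    // 0xE    0    VMOVv8i8            VMOVv16i8          (FullFP16 only)
    // 0xE    1    VMOVv1i64           VMOVv2i64          (FullFP16 only)
    // 0xD/C  0    VMOVv2i32           VMOVv4i32          (FullFP16 only)
    // 0xD/C  1    VMVNv2i32           VMVNv4i32          (FullFP16 only)
    // All of these are handed on to DecodeNEONModImmInstruction; encodings with
    // (imm & 0x38) != 0 keep decoding as the fixed-point VCVT.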
@@ -5132,7 +5175,7 @@ static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val,
unsigned Rm = fieldFromInstruction(Val, 0, 4);
Rm |= (fieldFromInstruction(Val, 23, 1) << 4);
unsigned Cond = fieldFromInstruction(Val, 28, 4);
-
+
if (fieldFromInstruction(Val, 8, 4) != 0 || Rn == Rt)
S = MCDisassembler::SoftFail;
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index 0bff52141da5..33fc85af9b19 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -19,6 +19,7 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -804,7 +805,7 @@ void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum,
unsigned Opcode = MI->getOpcode();
// For writes, handle extended mask bits if the DSP extension is present.
- if (Opcode == ARM::t2MSR_M && FeatureBits[ARM::FeatureDSPThumb2]) {
+ if (Opcode == ARM::t2MSR_M && FeatureBits[ARM::FeatureDSP]) {
switch (SYSm) {
case 0x400:
O << "apsr_g";
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
index 3927c9f8bfd3..52f7115f0558 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
@@ -15,12 +15,9 @@
#define LLVM_LIB_TARGET_ARM_INSTPRINTER_ARMINSTPRINTER_H
#include "llvm/MC/MCInstPrinter.h"
-#include "llvm/MC/MCSubtargetInfo.h"
namespace llvm {
-class MCOperand;
-
class ARMInstPrinter : public MCInstPrinter {
public:
ARMInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index 111463588565..fa52c9354c17 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -25,13 +25,17 @@
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCMachObjectWriter.h"
#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
#include "llvm/Support/MachO.h"
+#include "llvm/Support/TargetParser.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -180,9 +184,8 @@ bool ARMAsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
return false;
}
-bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
- const MCRelaxableFragment *DF,
- const MCAsmLayout &Layout) const {
+const char *ARMAsmBackend::reasonForFixupRelaxation(const MCFixup &Fixup,
+ uint64_t Value) const {
switch ((unsigned)Fixup.getKind()) {
case ARM::fixup_arm_thumb_br: {
// Relaxing tB to t2B. tB has a signed 12-bit displacement with the
@@ -192,7 +195,9 @@ bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
//
// Relax if the value is too big for a (signed) i8.
int64_t Offset = int64_t(Value) - 4;
- return Offset > 2046 || Offset < -2048;
+ if (Offset > 2046 || Offset < -2048)
+ return "out of range pc-relative fixup value";
+ break;
}
case ARM::fixup_arm_thumb_bcc: {
// Relaxing tBcc to t2Bcc. tBcc has a signed 9-bit displacement with the
@@ -202,23 +207,40 @@ bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
//
// Relax if the value is too big for a (signed) i8.
int64_t Offset = int64_t(Value) - 4;
- return Offset > 254 || Offset < -256;
+ if (Offset > 254 || Offset < -256)
+ return "out of range pc-relative fixup value";
+ break;
}
case ARM::fixup_thumb_adr_pcrel_10:
case ARM::fixup_arm_thumb_cp: {
// If the immediate is negative, greater than 1020, or not a multiple
// of four, the wide version of the instruction must be used.
int64_t Offset = int64_t(Value) - 4;
- return Offset > 1020 || Offset < 0 || Offset & 3;
+ if (Offset & 3)
+ return "misaligned pc-relative fixup value";
+ else if (Offset > 1020 || Offset < 0)
+ return "out of range pc-relative fixup value";
+ break;
}
- case ARM::fixup_arm_thumb_cb:
+ case ARM::fixup_arm_thumb_cb: {
// If we have a Thumb CBZ or CBNZ instruction and its target is the next
// instruction, it is actually out of range for the instruction.
// It will be changed to a NOP.
int64_t Offset = (Value & ~1);
- return Offset == 2;
+ if (Offset == 2)
+ return "will be converted to nop";
+ break;
}
- llvm_unreachable("Unexpected fixup kind in fixupNeedsRelaxation()!");
+ default:
+ llvm_unreachable("Unexpected fixup kind in reasonForFixupRelaxation()!");
+ }
+ return nullptr;
+}
+
+bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const {
+ return reasonForFixupRelaxation(Fixup, Value);
}
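
To make the ranges in reasonForFixupRelaxation concrete: the 16-bit Thumb B (tB) carries a signed 12-bit displacement measured from PC, which reads as the fixup address plus 4, so the fixup value must land in [-2044, 2050] to stay in the narrow encoding. A self-contained check of the boundary arithmetic, with the constants copied from the code above:

    #include <cassert>
    #include <cstdint>
    // Mirrors the tB case: relax to t2B when the offset leaves [-2048, 2046].
    static bool tBNeedsRelax(uint64_t Value) {
      int64_t Offset = int64_t(Value) - 4;
      return Offset > 2046 || Offset < -2048;
    }
    int main() {
      assert(!tBNeedsRelax(2050));              // offset 2046: still fits in tB
      assert(tBNeedsRelax(2052));               // offset 2048: needs t2B
      assert(!tBNeedsRelax(uint64_t(-2044)));   // offset -2048: lower bound still fits
      return 0;
    }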
void ARMAsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const {
@@ -317,9 +339,10 @@ static uint32_t joinHalfWords(uint32_t FirstHalf, uint32_t SecondHalf,
return Value;
}
-static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
- bool IsPCRel, MCContext *Ctx,
- bool IsLittleEndian) {
+unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
+ bool IsPCRel, MCContext *Ctx,
+ bool IsLittleEndian,
+ bool IsResolved) const {
unsigned Kind = Fixup.getKind();
switch (Kind) {
default:
@@ -372,8 +395,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
Value = -Value;
isAdd = false;
}
- if (Ctx && Value >= 4096)
- Ctx->reportFatalError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ if (Ctx && Value >= 4096) {
+ Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ return 0;
+ }
Value |= isAdd << 23;
// Same addressing mode as fixup_arm_pcrel_10,
@@ -383,8 +408,6 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
return Value;
}
- case ARM::fixup_thumb_adr_pcrel_10:
- return ((Value - 4) >> 2) & 0xff;
case ARM::fixup_arm_adr_pcrel_12: {
// ARM PC-relative values are offset by 8.
Value -= 8;
@@ -393,8 +416,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
Value = -Value;
opc = 2; // 0b0010
}
- if (Ctx && ARM_AM::getSOImmVal(Value) == -1)
- Ctx->reportFatalError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ if (Ctx && ARM_AM::getSOImmVal(Value) == -1) {
+ Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ return 0;
+ }
// Encode the immediate and shift the opcode into place.
return ARM_AM::getSOImmVal(Value) | (opc << 21);
}
@@ -517,21 +542,44 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
((uint16_t)imm10LBits) << 1);
return joinHalfWords(FirstHalf, SecondHalf, IsLittleEndian);
}
+ case ARM::fixup_thumb_adr_pcrel_10:
case ARM::fixup_arm_thumb_cp:
- // Offset by 4, and don't encode the low two bits. Two bytes of that
- // 'off by 4' is implicitly handled by the half-word ordering of the
- // Thumb encoding, so we only need to adjust by 2 here.
- return ((Value - 2) >> 2) & 0xff;
+ // On CPUs supporting Thumb2, this will be relaxed to an ldr.w, otherwise we
+ // could have an error on our hands.
+ if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2] && IsResolved) {
+ const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
+ if (FixupDiagnostic) {
+ Ctx->reportError(Fixup.getLoc(), FixupDiagnostic);
+ return 0;
+ }
+ }
+ // Offset by 4, and don't encode the low two bits.
+ return ((Value - 4) >> 2) & 0xff;
case ARM::fixup_arm_thumb_cb: {
// Offset by 4 and don't encode the lower bit, which is always 0.
+ // FIXME: diagnose if no Thumb2
uint32_t Binary = (Value - 4) >> 1;
return ((Binary & 0x20) << 4) | ((Binary & 0x1f) << 3);
}
case ARM::fixup_arm_thumb_br:
// Offset by 4 and don't encode the lower bit, which is always 0.
+ if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2]) {
+ const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
+ if (FixupDiagnostic) {
+ Ctx->reportError(Fixup.getLoc(), FixupDiagnostic);
+ return 0;
+ }
+ }
return ((Value - 4) >> 1) & 0x7ff;
case ARM::fixup_arm_thumb_bcc:
// Offset by 4 and don't encode the lower bit, which is always 0.
+ if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2]) {
+ const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
+ if (FixupDiagnostic) {
+ Ctx->reportError(Fixup.getLoc(), FixupDiagnostic);
+ return 0;
+ }
+ }
return ((Value - 4) >> 1) & 0xff;
case ARM::fixup_arm_pcrel_10_unscaled: {
Value = Value - 8; // ARM fixups offset by an additional word and don't
@@ -542,8 +590,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
isAdd = false;
}
// The value has the low 4 bits encoded in [3:0] and the high 4 in [11:8].
- if (Ctx && Value >= 256)
- Ctx->reportFatalError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ if (Ctx && Value >= 256) {
+ Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ return 0;
+ }
Value = (Value & 0xf) | ((Value & 0xf0) << 4);
return Value | (isAdd << 23);
}
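
The fixup_arm_thumb_cp handling above now shares its case with fixup_thumb_adr_pcrel_10, reports misaligned or out-of-range values as errors on Thumb1-only targets, and encodes with a plain 4-byte PC bias instead of the old '-2' adjustment. A quick worked example of the final field value (a standalone sketch; the distances are invented):

    #include <cassert>
    #include <cstdint>
    // Mirrors "((Value - 4) >> 2) & 0xff": subtract the PC bias, count in words.
    static unsigned encodeThumbCP(uint64_t Value) {
      return ((Value - 4) >> 2) & 0xff;
    }
    int main() {
      assert(encodeThumbCP(68) == 16);    // literal 68 bytes ahead -> imm8 = 16
      assert(encodeThumbCP(1024) == 255); // largest in-range distance -> imm8 = 255
      return 0;
    }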
@@ -561,8 +611,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
}
// These values don't encode the low two bits since they're always zero.
Value >>= 2;
- if (Ctx && Value >= 256)
- Ctx->reportFatalError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ if (Ctx && Value >= 256) {
+ Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ return 0;
+ }
Value |= isAdd << 23;
// Same addressing mode as fixup_arm_pcrel_10, but with 16-bit halfwords
@@ -582,6 +634,7 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
const MCValue &Target, uint64_t &Value,
bool &IsResolved) {
const MCSymbolRefExpr *A = Target.getSymA();
+ const MCSymbol *Sym = A ? &A->getSymbol() : nullptr;
// Some fixups to thumb function symbols need the low bit (thumb bit)
// twiddled.
if ((unsigned)Fixup.getKind() != ARM::fixup_arm_ldst_pcrel_12 &&
@@ -590,18 +643,21 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
(unsigned)Fixup.getKind() != ARM::fixup_thumb_adr_pcrel_10 &&
(unsigned)Fixup.getKind() != ARM::fixup_t2_adr_pcrel_12 &&
(unsigned)Fixup.getKind() != ARM::fixup_arm_thumb_cp) {
- if (A) {
- const MCSymbol &Sym = A->getSymbol();
- if (Asm.isThumbFunc(&Sym))
+ if (Sym) {
+ if (Asm.isThumbFunc(Sym))
Value |= 1;
}
}
- // For Thumb1 BL instruction, it is possible to be a long jump between
- // the basic blocks of the same function. Thus, we would like to resolve
- // the offset when the destination has the same MCFragment.
- if (A && (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) {
- const MCSymbol &Sym = A->getSymbol();
- IsResolved = (Sym.getFragment() == DF);
+ if (IsResolved && (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) {
+ assert(Sym && "How did we resolve this?");
+
+ // If the symbol is external the linker will handle it.
+ // FIXME: Should we handle it as an optimization?
+
+ // If the symbol is out of range, produce a relocation and hope the
+ // linker can handle it. GNU AS produces an error in this case.
+ if (Sym->isExternal() || Value >= 0x400004)
+ IsResolved = false;
}
// We must always generate a relocation for BL/BLX instructions if we have
// a symbol to reference, as the linker relies on knowing the destination
@@ -616,7 +672,7 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
// the instruction. This allows adjustFixupValue() to issue a diagnostic
// if the value is invalid.
(void)adjustFixupValue(Fixup, Value, false, &Asm.getContext(),
- IsLittleEndian);
+ IsLittleEndian, IsResolved);
}
/// getFixupKindNumBytes - The number of bytes the fixup may change.
@@ -719,7 +775,8 @@ void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
unsigned DataSize, uint64_t Value,
bool IsPCRel) const {
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
- Value = adjustFixupValue(Fixup, Value, IsPCRel, nullptr, IsLittleEndian);
+ Value =
+ adjustFixupValue(Fixup, Value, IsPCRel, nullptr, IsLittleEndian, true);
if (!Value)
return; // Doesn't change encoding.
@@ -743,6 +800,249 @@ void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
}
}
+namespace CU {
+
+/// \brief Compact unwind encoding values.
+enum CompactUnwindEncodings {
+ UNWIND_ARM_MODE_MASK = 0x0F000000,
+ UNWIND_ARM_MODE_FRAME = 0x01000000,
+ UNWIND_ARM_MODE_FRAME_D = 0x02000000,
+ UNWIND_ARM_MODE_DWARF = 0x04000000,
+
+ UNWIND_ARM_FRAME_STACK_ADJUST_MASK = 0x00C00000,
+
+ UNWIND_ARM_FRAME_FIRST_PUSH_R4 = 0x00000001,
+ UNWIND_ARM_FRAME_FIRST_PUSH_R5 = 0x00000002,
+ UNWIND_ARM_FRAME_FIRST_PUSH_R6 = 0x00000004,
+
+ UNWIND_ARM_FRAME_SECOND_PUSH_R8 = 0x00000008,
+ UNWIND_ARM_FRAME_SECOND_PUSH_R9 = 0x00000010,
+ UNWIND_ARM_FRAME_SECOND_PUSH_R10 = 0x00000020,
+ UNWIND_ARM_FRAME_SECOND_PUSH_R11 = 0x00000040,
+ UNWIND_ARM_FRAME_SECOND_PUSH_R12 = 0x00000080,
+
+ UNWIND_ARM_FRAME_D_REG_COUNT_MASK = 0x00000F00,
+
+ UNWIND_ARM_DWARF_SECTION_OFFSET = 0x00FFFFFF
+};
+
+} // end CU namespace
+
+/// Generate compact unwind encoding for the function based on the CFI
+/// instructions. If the CFI instructions describe a frame that cannot be
+/// encoded in compact unwind, the method returns UNWIND_ARM_MODE_DWARF which
+/// tells the runtime to fall back and unwind using DWARF.
+uint32_t ARMAsmBackendDarwin::generateCompactUnwindEncoding(
+ ArrayRef<MCCFIInstruction> Instrs) const {
+ DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() << "generateCU()\n");
+ // Only armv7k uses CFI based unwinding.
+ if (Subtype != MachO::CPU_SUBTYPE_ARM_V7K)
+ return 0;
+ // No .cfi directives means no frame.
+ if (Instrs.empty())
+ return 0;
+ // Start off assuming CFA is at SP+0.
+ int CFARegister = ARM::SP;
+ int CFARegisterOffset = 0;
+ // Mark savable registers as initially unsaved
+ DenseMap<unsigned, int> RegOffsets;
+ int FloatRegCount = 0;
+ // Process each .cfi directive and build up compact unwind info.
+ for (size_t i = 0, e = Instrs.size(); i != e; ++i) {
+ int Reg;
+ const MCCFIInstruction &Inst = Instrs[i];
+ switch (Inst.getOperation()) {
+ case MCCFIInstruction::OpDefCfa: // DW_CFA_def_cfa
+ CFARegisterOffset = -Inst.getOffset();
+ CFARegister = MRI.getLLVMRegNum(Inst.getRegister(), true);
+ break;
+ case MCCFIInstruction::OpDefCfaOffset: // DW_CFA_def_cfa_offset
+ CFARegisterOffset = -Inst.getOffset();
+ break;
+ case MCCFIInstruction::OpDefCfaRegister: // DW_CFA_def_cfa_register
+ CFARegister = MRI.getLLVMRegNum(Inst.getRegister(), true);
+ break;
+ case MCCFIInstruction::OpOffset: // DW_CFA_offset
+ Reg = MRI.getLLVMRegNum(Inst.getRegister(), true);
+ if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg))
+ RegOffsets[Reg] = Inst.getOffset();
+ else if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) {
+ RegOffsets[Reg] = Inst.getOffset();
+ ++FloatRegCount;
+ } else {
+ DEBUG_WITH_TYPE("compact-unwind",
+ llvm::dbgs() << ".cfi_offset on unknown register="
+ << Inst.getRegister() << "\n");
+ return CU::UNWIND_ARM_MODE_DWARF;
+ }
+ break;
+ case MCCFIInstruction::OpRelOffset: // DW_CFA_advance_loc
+ // Ignore
+ break;
+ default:
+ // Directive not convertible to compact unwind, bail out.
+ DEBUG_WITH_TYPE("compact-unwind",
+ llvm::dbgs()
+ << "CFI directive not compatible with compact "
+ "unwind encoding, opcode=" << Inst.getOperation()
+ << "\n");
+ return CU::UNWIND_ARM_MODE_DWARF;
+ break;
+ }
+ }
+
+ // If no frame set up, return no unwind info.
+ if ((CFARegister == ARM::SP) && (CFARegisterOffset == 0))
+ return 0;
+
+ // Verify standard frame (lr/r7) was used.
+ if (CFARegister != ARM::R7) {
+ DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() << "frame register is "
+ << CFARegister
+ << " instead of r7\n");
+ return CU::UNWIND_ARM_MODE_DWARF;
+ }
+ int StackAdjust = CFARegisterOffset - 8;
+ if (RegOffsets.lookup(ARM::LR) != (-4 - StackAdjust)) {
+ DEBUG_WITH_TYPE("compact-unwind",
+ llvm::dbgs()
+ << "LR not saved as standard frame, StackAdjust="
+ << StackAdjust
+ << ", CFARegisterOffset=" << CFARegisterOffset
+ << ", lr save at offset=" << RegOffsets[14] << "\n");
+ return CU::UNWIND_ARM_MODE_DWARF;
+ }
+ if (RegOffsets.lookup(ARM::R7) != (-8 - StackAdjust)) {
+ DEBUG_WITH_TYPE("compact-unwind",
+ llvm::dbgs() << "r7 not saved as standard frame\n");
+ return CU::UNWIND_ARM_MODE_DWARF;
+ }
+ uint32_t CompactUnwindEncoding = CU::UNWIND_ARM_MODE_FRAME;
+
+ // If var-args are used, there may be a stack adjust required.
+ switch (StackAdjust) {
+ case 0:
+ break;
+ case 4:
+ CompactUnwindEncoding |= 0x00400000;
+ break;
+ case 8:
+ CompactUnwindEncoding |= 0x00800000;
+ break;
+ case 12:
+ CompactUnwindEncoding |= 0x00C00000;
+ break;
+ default:
+ DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs()
+ << ".cfi_def_cfa stack adjust ("
+ << StackAdjust << ") out of range\n");
+ return CU::UNWIND_ARM_MODE_DWARF;
+ }
+
+ // If r6 is saved, it must be right below r7.
+ static struct {
+ unsigned Reg;
+ unsigned Encoding;
+ } GPRCSRegs[] = {{ARM::R6, CU::UNWIND_ARM_FRAME_FIRST_PUSH_R6},
+ {ARM::R5, CU::UNWIND_ARM_FRAME_FIRST_PUSH_R5},
+ {ARM::R4, CU::UNWIND_ARM_FRAME_FIRST_PUSH_R4},
+ {ARM::R12, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R12},
+ {ARM::R11, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R11},
+ {ARM::R10, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R10},
+ {ARM::R9, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R9},
+ {ARM::R8, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R8}};
+
+ int CurOffset = -8 - StackAdjust;
+ for (auto CSReg : GPRCSRegs) {
+ auto Offset = RegOffsets.find(CSReg.Reg);
+ if (Offset == RegOffsets.end())
+ continue;
+
+ int RegOffset = Offset->second;
+ if (RegOffset != CurOffset - 4) {
+ DEBUG_WITH_TYPE("compact-unwind",
+ llvm::dbgs() << MRI.getName(CSReg.Reg) << " saved at "
+ << RegOffset << " but only supported at "
+ << CurOffset << "\n");
+ return CU::UNWIND_ARM_MODE_DWARF;
+ }
+ CompactUnwindEncoding |= CSReg.Encoding;
+ CurOffset -= 4;
+ }
+
+ // If no floats saved, we are done.
+ if (FloatRegCount == 0)
+ return CompactUnwindEncoding;
+
+ // Switch mode to include D register saving.
+ CompactUnwindEncoding &= ~CU::UNWIND_ARM_MODE_MASK;
+ CompactUnwindEncoding |= CU::UNWIND_ARM_MODE_FRAME_D;
+
+ // FIXME: supporting more than 4 saved D-registers compactly would be trivial,
+ // but needs coordination with the linker and libunwind.
+ if (FloatRegCount > 4) {
+ DEBUG_WITH_TYPE("compact-unwind",
+ llvm::dbgs() << "unsupported number of D registers saved ("
+ << FloatRegCount << ")\n");
+ return CU::UNWIND_ARM_MODE_DWARF;
+ }
+
+ // Floating point registers must either be saved sequentially, or we defer to
+ // DWARF. No gaps allowed here so check that each saved d-register is
+ // precisely where it should be.
+ static unsigned FPRCSRegs[] = { ARM::D8, ARM::D10, ARM::D12, ARM::D14 };
+ for (int Idx = FloatRegCount - 1; Idx >= 0; --Idx) {
+ auto Offset = RegOffsets.find(FPRCSRegs[Idx]);
+ if (Offset == RegOffsets.end()) {
+ DEBUG_WITH_TYPE("compact-unwind",
+ llvm::dbgs() << FloatRegCount << " D-regs saved, but "
+ << MRI.getName(FPRCSRegs[Idx])
+ << " not saved\n");
+ return CU::UNWIND_ARM_MODE_DWARF;
+ } else if (Offset->second != CurOffset - 8) {
+ DEBUG_WITH_TYPE("compact-unwind",
+ llvm::dbgs() << FloatRegCount << " D-regs saved, but "
+ << MRI.getName(FPRCSRegs[Idx])
+ << " saved at " << Offset->second
+ << ", expected at " << CurOffset - 8
+ << "\n");
+ return CU::UNWIND_ARM_MODE_DWARF;
+ }
+ CurOffset -= 8;
+ }
+
+ return CompactUnwindEncoding | ((FloatRegCount - 1) << 8);
+}
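
To see what the new encoder produces in the common case, consider the standard armv7k frame "push {r4, r5, r7, lr}; add r7, sp, #8", whose CFI describes lr at -4, r7 at -8, r5 at -12 and r4 at -16 with the CFA at r7+8 (so StackAdjust is 0 and no D-registers are saved). A minimal standalone check of the expected encoding, with the constants copied from the CU enum above; this is an illustration, not a test from the patch:

    #include <cassert>
    #include <cstdint>
    int main() {
      const uint32_t UNWIND_ARM_MODE_FRAME          = 0x01000000;
      const uint32_t UNWIND_ARM_FRAME_FIRST_PUSH_R4 = 0x00000001;
      const uint32_t UNWIND_ARM_FRAME_FIRST_PUSH_R5 = 0x00000002;
      uint32_t Expected = UNWIND_ARM_MODE_FRAME |
                          UNWIND_ARM_FRAME_FIRST_PUSH_R5 |
                          UNWIND_ARM_FRAME_FIRST_PUSH_R4;
      assert(Expected == 0x01000003); // no stack adjust, no saved D-registers
      return 0;
    }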
+
+static MachO::CPUSubTypeARM getMachOSubTypeFromArch(StringRef Arch) {
+ unsigned AK = ARM::parseArch(Arch);
+ switch (AK) {
+ default:
+ return MachO::CPU_SUBTYPE_ARM_V7;
+ case ARM::AK_ARMV4T:
+ return MachO::CPU_SUBTYPE_ARM_V4T;
+ case ARM::AK_ARMV5T:
+ case ARM::AK_ARMV5TE:
+ case ARM::AK_ARMV5TEJ:
+ return MachO::CPU_SUBTYPE_ARM_V5;
+ case ARM::AK_ARMV6:
+ case ARM::AK_ARMV6K:
+ return MachO::CPU_SUBTYPE_ARM_V6;
+ case ARM::AK_ARMV7A:
+ return MachO::CPU_SUBTYPE_ARM_V7;
+ case ARM::AK_ARMV7S:
+ return MachO::CPU_SUBTYPE_ARM_V7S;
+ case ARM::AK_ARMV7K:
+ return MachO::CPU_SUBTYPE_ARM_V7K;
+ case ARM::AK_ARMV6M:
+ return MachO::CPU_SUBTYPE_ARM_V6M;
+ case ARM::AK_ARMV7M:
+ return MachO::CPU_SUBTYPE_ARM_V7M;
+ case ARM::AK_ARMV7EM:
+ return MachO::CPU_SUBTYPE_ARM_V7EM;
+ }
+}
+
MCAsmBackend *llvm::createARMAsmBackend(const Target &T,
const MCRegisterInfo &MRI,
const Triple &TheTriple, StringRef CPU,
@@ -751,19 +1051,8 @@ MCAsmBackend *llvm::createARMAsmBackend(const Target &T,
default:
llvm_unreachable("unsupported object format");
case Triple::MachO: {
- MachO::CPUSubTypeARM CS =
- StringSwitch<MachO::CPUSubTypeARM>(TheTriple.getArchName())
- .Cases("armv4t", "thumbv4t", MachO::CPU_SUBTYPE_ARM_V4T)
- .Cases("armv5e", "thumbv5e", MachO::CPU_SUBTYPE_ARM_V5TEJ)
- .Cases("armv6", "thumbv6", MachO::CPU_SUBTYPE_ARM_V6)
- .Cases("armv6m", "thumbv6m", MachO::CPU_SUBTYPE_ARM_V6M)
- .Cases("armv7em", "thumbv7em", MachO::CPU_SUBTYPE_ARM_V7EM)
- .Cases("armv7k", "thumbv7k", MachO::CPU_SUBTYPE_ARM_V7K)
- .Cases("armv7m", "thumbv7m", MachO::CPU_SUBTYPE_ARM_V7M)
- .Cases("armv7s", "thumbv7s", MachO::CPU_SUBTYPE_ARM_V7S)
- .Default(MachO::CPU_SUBTYPE_ARM_V7);
-
- return new ARMAsmBackendDarwin(T, TheTriple, CS);
+ MachO::CPUSubTypeARM CS = getMachOSubTypeFromArch(TheTriple.getArchName());
+ return new ARMAsmBackendDarwin(T, TheTriple, MRI, CS);
}
case Triple::COFF:
assert(TheTriple.isOSWindows() && "non-Windows ARM COFF is not supported");
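
The Mach-O subtype is now derived from the canonical architecture name instead of the hard-coded StringSwitch, so the thumb-prefixed spellings the old table listed explicitly are expected to canonicalise through ARM::parseArch as well. A short usage sketch of the helper above (results are what the switch should yield; not a verbatim excerpt from the patch):

    MachO::CPUSubTypeARM A = getMachOSubTypeFromArch("armv7k");   // CPU_SUBTYPE_ARM_V7K
    MachO::CPUSubTypeARM B = getMachOSubTypeFromArch("armv4t");   // CPU_SUBTYPE_ARM_V4T
    MachO::CPUSubTypeARM C = getMachOSubTypeFromArch("armv9999"); // unknown -> CPU_SUBTYPE_ARM_V7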
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
index 6b4abd5898eb..28a62132a419 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
@@ -45,6 +45,10 @@ public:
const MCValue &Target, uint64_t &Value,
bool &IsResolved) override;
+ unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, bool IsPCRel,
+ MCContext *Ctx, bool IsLittleEndian,
+ bool IsResolved) const;
+
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
uint64_t Value, bool IsPCRel) const override;
@@ -52,6 +56,9 @@ public:
bool mayNeedRelaxation(const MCInst &Inst) const override;
+ const char *reasonForFixupRelaxation(const MCFixup &Fixup,
+ uint64_t Value) const;
+
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override;
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
index a6206e3d9585..995dd0fe08ee 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
@@ -16,11 +16,12 @@ using namespace llvm;
namespace {
class ARMAsmBackendDarwin : public ARMAsmBackend {
+ const MCRegisterInfo &MRI;
public:
const MachO::CPUSubTypeARM Subtype;
ARMAsmBackendDarwin(const Target &T, const Triple &TT,
- MachO::CPUSubTypeARM st)
- : ARMAsmBackend(T, TT, /* IsLittleEndian */ true), Subtype(st) {
+ const MCRegisterInfo &MRI, MachO::CPUSubTypeARM st)
+ : ARMAsmBackend(T, TT, /* IsLittleEndian */ true), MRI(MRI), Subtype(st) {
HasDataInCodeSupport = true;
}
@@ -28,6 +29,9 @@ public:
return createARMMachObjectWriter(OS, /*Is64Bit=*/false, MachO::CPU_TYPE_ARM,
Subtype);
}
+
+ uint32_t generateCompactUnwindEncoding(
+ ArrayRef<MCCFIInstruction> Instrs) const override;
};
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index 804d3534096a..52eba8be288f 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -95,7 +95,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case MCSymbolRefExpr::VK_GOTTPOFF:
Type = ELF::R_ARM_TLS_IE32;
break;
- case MCSymbolRefExpr::VK_GOTPCREL:
+ case MCSymbolRefExpr::VK_ARM_GOT_PREL:
Type = ELF::R_ARM_GOT_PREL;
break;
}
@@ -192,7 +192,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case MCSymbolRefExpr::VK_GOTOFF:
Type = ELF::R_ARM_GOTOFF32;
break;
- case MCSymbolRefExpr::VK_GOTPCREL:
+ case MCSymbolRefExpr::VK_ARM_GOT_PREL:
Type = ELF::R_ARM_GOT_PREL;
break;
case MCSymbolRefExpr::VK_ARM_TARGET1:
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index d17fdb95dbdf..6084f22c8470 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -79,7 +79,7 @@ class ARMTargetAsmStreamer : public ARMTargetStreamer {
void emitAttribute(unsigned Attribute, unsigned Value) override;
void emitTextAttribute(unsigned Attribute, StringRef String) override;
void emitIntTextAttribute(unsigned Attribute, unsigned IntValue,
- StringRef StrinValue) override;
+ StringRef StringValue) override;
void emitArch(unsigned Arch) override;
void emitArchExtension(unsigned ArchExt) override;
void emitObjectArch(unsigned Arch) override;
@@ -195,16 +195,16 @@ void ARMTargetAsmStreamer::emitIntTextAttribute(unsigned Attribute,
OS << "\n";
}
void ARMTargetAsmStreamer::emitArch(unsigned Arch) {
- OS << "\t.arch\t" << ARMTargetParser::getArchName(Arch) << "\n";
+ OS << "\t.arch\t" << ARM::getArchName(Arch) << "\n";
}
void ARMTargetAsmStreamer::emitArchExtension(unsigned ArchExt) {
- OS << "\t.arch_extension\t" << ARMTargetParser::getArchExtName(ArchExt) << "\n";
+ OS << "\t.arch_extension\t" << ARM::getArchExtName(ArchExt) << "\n";
}
void ARMTargetAsmStreamer::emitObjectArch(unsigned Arch) {
- OS << "\t.object_arch\t" << ARMTargetParser::getArchName(Arch) << '\n';
+ OS << "\t.object_arch\t" << ARM::getArchName(Arch) << '\n';
}
void ARMTargetAsmStreamer::emitFPU(unsigned FPU) {
- OS << "\t.fpu\t" << ARMTargetParser::getFPUName(FPU) << "\n";
+ OS << "\t.fpu\t" << ARM::getFPUName(FPU) << "\n";
}
void ARMTargetAsmStreamer::finishAttributeSection() {
}
@@ -243,7 +243,7 @@ void ARMTargetAsmStreamer::emitUnwindRaw(int64_t Offset,
class ARMTargetELFStreamer : public ARMTargetStreamer {
private:
// This structure holds all attributes, accounting for
- // their string/numeric value, so we can later emmit them
+ // their string/numeric value, so we can later emit them
// in declaration order, keeping all in the same vector
struct AttributeItem {
enum {
@@ -254,7 +254,7 @@ private:
} Type;
unsigned Tag;
unsigned IntValue;
- StringRef StringValue;
+ std::string StringValue;
static bool LessTag(const AttributeItem &LHS, const AttributeItem &RHS) {
// The conformance tag must be emitted first when serialised
@@ -507,14 +507,15 @@ public:
/// This is one of the functions used to emit data into an ELF section, so the
/// ARM streamer overrides it to add the appropriate mapping symbol ($d) if
/// necessary.
- void EmitValueImpl(const MCExpr *Value, unsigned Size,
- const SMLoc &Loc) override {
+ void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override {
if (const MCSymbolRefExpr *SRE = dyn_cast_or_null<MCSymbolRefExpr>(Value))
- if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_SBREL && !(Size == 4))
- getContext().reportFatalError(Loc, "relocated expression must be 32-bit");
+ if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_SBREL && !(Size == 4)) {
+ getContext().reportError(Loc, "relocated expression must be 32-bit");
+ return;
+ }
EmitDataMappingSymbol();
- MCELFStreamer::EmitValueImpl(Value, Size);
+ MCELFStreamer::EmitValueImpl(Value, Size, Loc);
}
void EmitAssemblerFlag(MCAssemblerFlag Flag) override {
@@ -684,16 +685,16 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() {
using namespace ARMBuildAttrs;
setAttributeItem(CPU_name,
- ARMTargetParser::getCPUAttr(Arch),
+ ARM::getCPUAttr(Arch),
false);
if (EmittedArch == ARM::AK_INVALID)
setAttributeItem(CPU_arch,
- ARMTargetParser::getArchAttr(Arch),
+ ARM::getArchAttr(Arch),
false);
else
setAttributeItem(CPU_arch,
- ARMTargetParser::getArchAttr(EmittedArch),
+ ARM::getArchAttr(EmittedArch),
false);
switch (Arch) {
@@ -702,7 +703,6 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() {
case ARM::AK_ARMV3:
case ARM::AK_ARMV3M:
case ARM::AK_ARMV4:
- case ARM::AK_ARMV5:
setAttributeItem(ARM_ISA_use, Allowed, false);
break;
@@ -710,7 +710,6 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() {
case ARM::AK_ARMV5T:
case ARM::AK_ARMV5TE:
case ARM::AK_ARMV6:
- case ARM::AK_ARMV6J:
setAttributeItem(ARM_ISA_use, Allowed, false);
setAttributeItem(THUMB_ISA_use, Allowed, false);
break;
@@ -721,8 +720,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() {
break;
case ARM::AK_ARMV6K:
- case ARM::AK_ARMV6Z:
- case ARM::AK_ARMV6ZK:
+ case ARM::AK_ARMV6KZ:
setAttributeItem(ARM_ISA_use, Allowed, false);
setAttributeItem(THUMB_ISA_use, Allowed, false);
setAttributeItem(Virtualization_use, AllowTZ, false);
@@ -732,10 +730,6 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() {
setAttributeItem(THUMB_ISA_use, Allowed, false);
break;
- case ARM::AK_ARMV7:
- setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
- break;
-
case ARM::AK_ARMV7A:
setAttributeItem(CPU_arch_profile, ApplicationProfile, false);
setAttributeItem(ARM_ISA_use, Allowed, false);
@@ -755,6 +749,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() {
case ARM::AK_ARMV8A:
case ARM::AK_ARMV8_1A:
+ case ARM::AK_ARMV8_2A:
setAttributeItem(CPU_arch_profile, ApplicationProfile, false);
setAttributeItem(ARM_ISA_use, Allowed, false);
setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
@@ -1084,19 +1079,14 @@ inline void ARMELFStreamer::SwitchToEHSection(const char *Prefix,
}
inline void ARMELFStreamer::SwitchToExTabSection(const MCSymbol &FnStart) {
- SwitchToEHSection(".ARM.extab",
- ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC,
- SectionKind::getDataRel(),
- FnStart);
+ SwitchToEHSection(".ARM.extab", ELF::SHT_PROGBITS, ELF::SHF_ALLOC,
+ SectionKind::getData(), FnStart);
}
inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) {
- SwitchToEHSection(".ARM.exidx",
- ELF::SHT_ARM_EXIDX,
+ SwitchToEHSection(".ARM.exidx", ELF::SHT_ARM_EXIDX,
ELF::SHF_ALLOC | ELF::SHF_LINK_ORDER,
- SectionKind::getDataRel(),
- FnStart);
+ SectionKind::getData(), FnStart);
}
void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) {
MCDataFragment *Frag = getOrCreateDataFragment();
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index 1ac08159bd3d..bda37f6616a8 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -33,7 +33,9 @@ ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin(const Triple &TheTriple) {
SupportsDebugInformation = true;
// Exceptions handling
- ExceptionsType = ExceptionHandling::SjLj;
+ ExceptionsType = TheTriple.isOSDarwin() && !TheTriple.isWatchOS()
+ ? ExceptionHandling::SjLj
+ : ExceptionHandling::DwarfCFI;
UseIntegratedAssembler = true;
}
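
In effect, watchOS becomes the only Darwin configuration that takes the DWARF CFI branch here, which lines up with the armv7k compact-unwind support added to the asm backend earlier in this patch. An illustration of the selection (triples invented, results as the condition above should evaluate):

    // armv7-apple-ios         -> isOSDarwin() && !isWatchOS() -> ExceptionHandling::SjLj
    // thumbv7k-apple-watchos  -> isWatchOS()                  -> ExceptionHandling::DwarfCFI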
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
index 99a5fff5ec27..5e548162bec6 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
@@ -19,34 +19,37 @@
#include "llvm/MC/MCAsmInfoELF.h"
namespace llvm {
- class Triple;
-
- class ARMMCAsmInfoDarwin : public MCAsmInfoDarwin {
- virtual void anchor();
-
- public:
- explicit ARMMCAsmInfoDarwin(const Triple &TheTriple);
- };
-
- class ARMELFMCAsmInfo : public MCAsmInfoELF {
- void anchor() override;
- public:
- explicit ARMELFMCAsmInfo(const Triple &TT);
-
- void setUseIntegratedAssembler(bool Value) override;
- };
-
- class ARMCOFFMCAsmInfoMicrosoft : public MCAsmInfoMicrosoft {
- void anchor() override;
- public:
- explicit ARMCOFFMCAsmInfoMicrosoft();
- };
-
- class ARMCOFFMCAsmInfoGNU : public MCAsmInfoGNUCOFF {
- void anchor() override;
- public:
- explicit ARMCOFFMCAsmInfoGNU();
- };
+class Triple;
+
+class ARMMCAsmInfoDarwin : public MCAsmInfoDarwin {
+ virtual void anchor();
+
+public:
+ explicit ARMMCAsmInfoDarwin(const Triple &TheTriple);
+};
+
+class ARMELFMCAsmInfo : public MCAsmInfoELF {
+ void anchor() override;
+
+public:
+ explicit ARMELFMCAsmInfo(const Triple &TT);
+
+ void setUseIntegratedAssembler(bool Value) override;
+};
+
+class ARMCOFFMCAsmInfoMicrosoft : public MCAsmInfoMicrosoft {
+ void anchor() override;
+
+public:
+ explicit ARMCOFFMCAsmInfoMicrosoft();
+};
+
+class ARMCOFFMCAsmInfoGNU : public MCAsmInfoGNUCOFF {
+ void anchor() override;
+
+public:
+ explicit ARMCOFFMCAsmInfoGNU();
+};
} // namespace llvm
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
index 9146d4def75a..75dde8008fca 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
@@ -63,8 +63,8 @@ public:
return false;
}
void visitUsedExpr(MCStreamer &Streamer) const override;
- MCSection *findAssociatedSection() const override {
- return getSubExpr()->findAssociatedSection();
+ MCFragment *findAssociatedFragment() const override {
+ return getSubExpr()->findAssociatedFragment();
}
// There are no TLS ARMMCExprs at the moment.
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 21c9fc1e58b2..8c8c249addb5 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -24,6 +24,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetParser.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -134,101 +135,11 @@ std::string ARM_MC::ParseARMTriple(const Triple &TT, StringRef CPU) {
bool isThumb =
TT.getArch() == Triple::thumb || TT.getArch() == Triple::thumbeb;
- bool NoCPU = CPU == "generic" || CPU.empty();
std::string ARMArchFeature;
- switch (TT.getSubArch()) {
- default:
- llvm_unreachable("invalid sub-architecture for ARM");
- case Triple::ARMSubArch_v8:
- if (NoCPU)
- // v8a: FeatureDB, FeatureFPARMv8, FeatureNEON, FeatureDSPThumb2,
- // FeatureMP, FeatureHWDiv, FeatureHWDivARM, FeatureTrustZone,
- // FeatureT2XtPk, FeatureCrypto, FeatureCRC
- ARMArchFeature = "+v8,+db,+fp-armv8,+neon,+t2dsp,+mp,+hwdiv,+hwdiv-arm,"
- "+trustzone,+t2xtpk,+crypto,+crc";
- else
- // Use CPU to figure out the exact features
- ARMArchFeature = "+v8";
- break;
- case Triple::ARMSubArch_v8_1a:
- if (NoCPU)
- // v8.1a: FeatureDB, FeatureFPARMv8, FeatureNEON, FeatureDSPThumb2,
- // FeatureMP, FeatureHWDiv, FeatureHWDivARM, FeatureTrustZone,
- // FeatureT2XtPk, FeatureCrypto, FeatureCRC, FeatureV8_1a
- ARMArchFeature = "+v8.1a,+db,+fp-armv8,+neon,+t2dsp,+mp,+hwdiv,+hwdiv-arm,"
- "+trustzone,+t2xtpk,+crypto,+crc";
- else
- // Use CPU to figure out the exact features
- ARMArchFeature = "+v8.1a";
- break;
- case Triple::ARMSubArch_v7m:
- isThumb = true;
- if (NoCPU)
- // v7m: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureMClass
- ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+mclass";
- else
- // Use CPU to figure out the exact features.
- ARMArchFeature = "+v7";
- break;
- case Triple::ARMSubArch_v7em:
- if (NoCPU)
- // v7em: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureDSPThumb2,
- // FeatureT2XtPk, FeatureMClass
- ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+t2dsp,+t2xtpk,+mclass";
- else
- // Use CPU to figure out the exact features.
- ARMArchFeature = "+v7";
- break;
- case Triple::ARMSubArch_v7s:
- if (NoCPU)
- // v7s: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureHasRAS
- // Swift
- ARMArchFeature = "+v7,+swift,+neon,+db,+t2dsp,+ras";
- else
- // Use CPU to figure out the exact features.
- ARMArchFeature = "+v7";
- break;
- case Triple::ARMSubArch_v7:
- // v7 CPUs have lots of different feature sets. If no CPU is specified,
- // then assume v7a (e.g. cortex-a8) feature set. Otherwise, return
- // the "minimum" feature set and use CPU string to figure out the exact
- // features.
- if (NoCPU)
- // v7a: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureT2XtPk
- ARMArchFeature = "+v7,+neon,+db,+t2dsp,+t2xtpk";
- else
- // Use CPU to figure out the exact features.
- ARMArchFeature = "+v7";
- break;
- case Triple::ARMSubArch_v6t2:
- ARMArchFeature = "+v6t2";
- break;
- case Triple::ARMSubArch_v6k:
- ARMArchFeature = "+v6k";
- break;
- case Triple::ARMSubArch_v6m:
- isThumb = true;
- if (NoCPU)
- // v6m: FeatureNoARM, FeatureMClass
- ARMArchFeature = "+v6m,+noarm,+mclass";
- else
- ARMArchFeature = "+v6";
- break;
- case Triple::ARMSubArch_v6:
- ARMArchFeature = "+v6";
- break;
- case Triple::ARMSubArch_v5te:
- ARMArchFeature = "+v5te";
- break;
- case Triple::ARMSubArch_v5:
- ARMArchFeature = "+v5t";
- break;
- case Triple::ARMSubArch_v4t:
- ARMArchFeature = "+v4t";
- break;
- case Triple::NoSubArch:
- break;
- }
+
+ unsigned ArchID = ARM::parseArch(TT.getArchName());
+ if (ArchID != ARM::AK_INVALID && (CPU.empty() || CPU == "generic"))
+ ARMArchFeature = (ARMArchFeature + "+" + ARM::getArchName(ArchID)).str();
if (isThumb) {
if (ARMArchFeature.empty())
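
With the per-SubArch switch gone, any recognised architecture paired with a generic or empty CPU now gets a feature string of just "+" followed by the canonical architecture name, and everything else is left to the CPU's own feature set; the trailing context lines above show where the Thumb handling continues. A hedged illustration of the strings this builds (spellings follow ARM::getArchName; whether a given name is honoured downstream is not claimed here):

    // Triple arch "armv7a", CPU "generic"   -> ARMArchFeature = "+armv7-a"
    // Triple arch "thumbv8", CPU ""         -> ARMArchFeature = "+armv8-a" (Thumb handled below)
    // Unrecognised arch, or an explicit CPU -> ARMArchFeature stays ""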
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index fd30623d79af..c2bbc8e828c4 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -86,7 +86,8 @@ MCAsmBackend *createThumbBEAsmBackend(const Target &T,
// object file.
MCStreamer *createARMWinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB,
raw_pwrite_stream &OS,
- MCCodeEmitter *Emitter, bool RelaxAll);
+ MCCodeEmitter *Emitter, bool RelaxAll,
+ bool IncrementalLinkerCompatible);
/// Construct an ELF Mach-O object writer.
MCObjectWriter *createARMELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI,
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index 95d7ea7c04a3..cfd504e533af 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -150,10 +150,12 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
// See <reloc.h>.
const MCSymbol *A = &Target.getSymA()->getSymbol();
- if (!A->getFragment())
- Asm.getContext().reportFatalError(Fixup.getLoc(),
+ if (!A->getFragment()) {
+ Asm.getContext().reportError(Fixup.getLoc(),
"symbol '" + A->getName() +
"' can not be undefined in a subtraction expression");
+ return;
+ }
uint32_t Value = Writer->getSymbolAddress(*A, Layout);
uint32_t Value2 = 0;
@@ -163,10 +165,12 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
if (const MCSymbolRefExpr *B = Target.getSymB()) {
const MCSymbol *SB = &B->getSymbol();
- if (!SB->getFragment())
- Asm.getContext().reportFatalError(Fixup.getLoc(),
+ if (!SB->getFragment()) {
+ Asm.getContext().reportError(Fixup.getLoc(),
"symbol '" + B->getSymbol().getName() +
"' can not be undefined in a subtraction expression");
+ return;
+ }
// Select the appropriate difference relocation type.
Type = MachO::ARM_RELOC_HALF_SECTDIFF;
@@ -251,10 +255,12 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
// See <reloc.h>.
const MCSymbol *A = &Target.getSymA()->getSymbol();
- if (!A->getFragment())
- Asm.getContext().reportFatalError(Fixup.getLoc(),
+ if (!A->getFragment()) {
+ Asm.getContext().reportError(Fixup.getLoc(),
"symbol '" + A->getName() +
"' can not be undefined in a subtraction expression");
+ return;
+ }
uint32_t Value = Writer->getSymbolAddress(*A, Layout);
uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent());
@@ -265,10 +271,12 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
assert(Type == MachO::ARM_RELOC_VANILLA && "invalid reloc for 2 symbols");
const MCSymbol *SB = &B->getSymbol();
- if (!SB->getFragment())
- Asm.getContext().reportFatalError(Fixup.getLoc(),
+ if (!SB->getFragment()) {
+ Asm.getContext().reportError(Fixup.getLoc(),
"symbol '" + B->getSymbol().getName() +
"' can not be undefined in a subtraction expression");
+ return;
+ }
// Select the appropriate difference relocation type.
Type = MachO::ARM_RELOC_SECTDIFF;
@@ -346,13 +354,15 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer,
unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
unsigned Log2Size;
unsigned RelocType = MachO::ARM_RELOC_VANILLA;
- if (!getARMFixupKindMachOInfo(Fixup.getKind(), RelocType, Log2Size))
+ if (!getARMFixupKindMachOInfo(Fixup.getKind(), RelocType, Log2Size)) {
// If we failed to get fixup kind info, it's because there's no legal
// relocation type for the fixup kind. This happens when it's a fixup that's
// expected to always be resolvable at assembly time and not have any
// relocations needed.
- Asm.getContext().reportFatalError(Fixup.getLoc(),
- "unsupported relocation on symbol");
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "unsupported relocation on symbol");
+ return;
+ }
// If this is a difference or a defined symbol plus an offset, then we need a
// scattered relocation entry. Differences always require scattered
diff --git a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
index b680db5c3a78..dad50f2834ee 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
@@ -27,8 +27,8 @@ ARMTargetStreamer::~ARMTargetStreamer() {}
// The constant pool handling is shared by all ARMTargetStreamer
// implementations.
-const MCExpr *ARMTargetStreamer::addConstantPoolEntry(const MCExpr *Expr) {
- return ConstantPools->addEntry(Streamer, Expr, 4);
+const MCExpr *ARMTargetStreamer::addConstantPoolEntry(const MCExpr *Expr, SMLoc Loc) {
+ return ConstantPools->addEntry(Streamer, Expr, 4, Loc);
}
void ARMTargetStreamer::emitCurrentConstantPool() {
diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
index b993b1be4847..83fa084e60c7 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
@@ -37,11 +37,11 @@ void ARMWinCOFFStreamer::EmitThumbFunc(MCSymbol *Symbol) {
}
}
-MCStreamer *llvm::createARMWinCOFFStreamer(MCContext &Context,
- MCAsmBackend &MAB,
- raw_pwrite_stream &OS,
- MCCodeEmitter *Emitter,
- bool RelaxAll) {
- return new ARMWinCOFFStreamer(Context, MAB, *Emitter, OS);
+MCStreamer *llvm::createARMWinCOFFStreamer(
+ MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter, bool RelaxAll, bool IncrementalLinkerCompatible) {
+ auto *S = new ARMWinCOFFStreamer(Context, MAB, *Emitter, OS);
+ S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible);
+ return S;
}
diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt
index 090a003424a4..5acb2d46f3e7 100644
--- a/lib/Target/ARM/README.txt
+++ b/lib/Target/ARM/README.txt
@@ -711,7 +711,6 @@ this in a target-independent way: we should probably fold that (when using
"undefined at zero" semantics) to set the "defined at zero" bit and have
the code generator expand out the right code.
-
//===---------------------------------------------------------------------===//
Clean up the test/MC/ARM files to have more robust register choices.
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index 3b4358b5d9bf..93e0ac4aa320 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -13,6 +13,7 @@
#include "Thumb1FrameLowering.h"
#include "ARMMachineFunctionInfo.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -84,7 +85,6 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- assert(&MBB == &MF.front() && "Shrink-wrapping not yet implemented");
MachineBasicBlock::iterator MBBI = MBB.begin();
MachineFrameInfo *MFI = MF.getFrameInfo();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
@@ -100,7 +100,11 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
assert(NumBytes >= ArgRegsSaveSize &&
"ArgRegsSaveSize is included in NumBytes");
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
- DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+ // Debug location must be unknown since the first debug location is used
+ // to determine the end of the prologue.
+ DebugLoc dl;
+
unsigned FramePtr = RegInfo->getFrameRegister(MF);
unsigned BasePtr = RegInfo->getBaseRegister();
int CFAOffset = 0;
@@ -168,8 +172,6 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH) {
++MBBI;
- if (MBBI != MBB.end())
- dl = MBBI->getDebugLoc();
}
// Determine starting offsets of spill areas.
@@ -232,11 +234,10 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
}
}
-
// Adjust FP so it points to the stack slot that contains the previous FP.
if (HasFP) {
- FramePtrOffsetInBlock += MFI->getObjectOffset(FramePtrSpillFI)
- + GPRCS1Size + ArgRegsSaveSize;
+ FramePtrOffsetInBlock +=
+ MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size + ArgRegsSaveSize;
AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
.addReg(ARM::SP).addImm(FramePtrOffsetInBlock / 4)
.setMIFlags(MachineInstr::FrameSetup));
@@ -321,11 +322,8 @@ static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) {
void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
- assert((MBBI->getOpcode() == ARM::tBX_RET ||
- MBBI->getOpcode() == ARM::tPOP_RET) &&
- "Can only insert epilog into returning blocks");
- DebugLoc dl = MBBI->getDebugLoc();
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+ DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
MachineFrameInfo *MFI = MF.getFrameInfo();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
const ThumbRegisterInfo *RegInfo =
@@ -377,9 +375,8 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
ARM::SP)
.addReg(FramePtr));
} else {
- if (MBBI->getOpcode() == ARM::tBX_RET &&
- &MBB.front() != MBBI &&
- std::prev(MBBI)->getOpcode() == ARM::tPOP) {
+ if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tBX_RET &&
+ &MBB.front() != MBBI && std::prev(MBBI)->getOpcode() == ARM::tPOP) {
MachineBasicBlock::iterator PMBBI = std::prev(MBBI);
if (!tryFoldSPUpdateIntoPushPop(STI, MF, PMBBI, NumBytes))
emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes);
@@ -388,66 +385,189 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
}
}
- bool IsV4PopReturn = false;
- for (const CalleeSavedInfo &CSI : MFI->getCalleeSavedInfo())
+ if (needPopSpecialFixUp(MF)) {
+ bool Done = emitPopSpecialFixUp(MBB, /* DoIt */ true);
+ (void)Done;
+ assert(Done && "Emission of the special fixup failed!?");
+ }
+}
+
+bool Thumb1FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
+ if (!needPopSpecialFixUp(*MBB.getParent()))
+ return true;
+
+ MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
+ return emitPopSpecialFixUp(*TmpMBB, /* DoIt */ false);
+}
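The canUseAsEpilogue/emitPopSpecialFixUp pair above uses a dry-run flag: the same routine answers "would the fixup succeed?" when DoIt is false and performs the rewrite when DoIt is true, which lets shrink-wrapping probe candidate blocks without mutating them. Below is a minimal, self-contained sketch of that pattern; Block, planFixUp and emitFixUp are hypothetical stand-ins, not LLVM API.

#include <cstdio>
#include <vector>

struct Block { std::vector<int> Insts; }; // stand-in for a basic block

// Analysis only: decide whether the fixup is possible and where it would go.
static bool planFixUp(const Block &B, size_t &InsertPos) {
  if (B.Insts.empty())
    return false;                  // nothing to anchor the fixup on
  InsertPos = B.Insts.size() - 1;  // insert just before the final instruction
  return true;
}

// DoIt == false: feasibility query, no side effects. DoIt == true: do the edit.
static bool emitFixUp(Block &B, bool DoIt) {
  size_t Pos;
  if (!planFixUp(B, Pos))
    return false;
  if (DoIt)
    B.Insts.insert(B.Insts.begin() + Pos, 99); // 99 = dummy fixup instruction
  return true;
}

int main() {
  Block B{{1, 2, 3}};
  std::printf("possible: %d\n", emitFixUp(B, /*DoIt=*/false)); // query only
  emitFixUp(B, /*DoIt=*/true);                                 // now rewrite
  std::printf("insts after: %zu\n", B.Insts.size());           // prints 4
  return 0;
}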
+
+bool Thumb1FrameLowering::needPopSpecialFixUp(const MachineFunction &MF) const {
+ ARMFunctionInfo *AFI =
+ const_cast<MachineFunction *>(&MF)->getInfo<ARMFunctionInfo>();
+ if (AFI->getArgRegsSaveSize())
+ return true;
+
+ // LR cannot be encoded with Thumb1, i.e., it requires a special fix-up.
+ for (const CalleeSavedInfo &CSI : MF.getFrameInfo()->getCalleeSavedInfo())
if (CSI.getReg() == ARM::LR)
- IsV4PopReturn = true;
- IsV4PopReturn &= STI.hasV4TOps() && !STI.hasV5TOps();
-
- // Unlike T2 and ARM mode, the T1 pop instruction cannot restore
- // to LR, and we can't pop the value directly to the PC since
- // we need to update the SP after popping the value. So instead
- // we have to emit:
- // POP {r3}
- // ADD sp, #offset
- // BX r3
- // If this would clobber a return value, then generate this sequence instead:
- // MOV ip, r3
- // POP {r3}
- // ADD sp, #offset
- // MOV lr, r3
- // MOV r3, ip
- // BX lr
- if (ArgRegsSaveSize || IsV4PopReturn) {
- // Get the last instruction, tBX_RET
- MBBI = MBB.getLastNonDebugInstr();
- assert (MBBI->getOpcode() == ARM::tBX_RET);
- DebugLoc dl = MBBI->getDebugLoc();
-
- if (AFI->getReturnRegsCount() <= 3) {
- // Epilogue: pop saved LR to R3 and branch off it.
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
- .addReg(ARM::R3, RegState::Define);
-
- emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize);
-
- MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX))
- .addReg(ARM::R3, RegState::Kill);
- AddDefaultPred(MIB);
- MIB.copyImplicitOps(&*MBBI);
- // erase the old tBX_RET instruction
- MBB.erase(MBBI);
- } else {
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
- .addReg(ARM::R12, RegState::Define)
- .addReg(ARM::R3, RegState::Kill));
+ return true;
+
+ return false;
+}
+
+bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
+ bool DoIt) const {
+ MachineFunction &MF = *MBB.getParent();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
+ const TargetInstrInfo &TII = *STI.getInstrInfo();
+ const ThumbRegisterInfo *RegInfo =
+ static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo());
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
- .addReg(ARM::R3, RegState::Define);
+ // If MBBI is a return instruction, or is a tPOP followed by a return
+ // instruction in the successor BB, we may be able to directly restore
+ // LR in the PC.
+ // This is only possible with v5T ops (v4T can't change the Thumb bit via
+ // a POP PC instruction), and only if we do not need to emit any SP update.
+ // Otherwise, we need a temporary register to pop the value
+ // and copy that value into LR.
+ auto MBBI = MBB.getFirstTerminator();
+ bool CanRestoreDirectly = STI.hasV5TOps() && !ArgRegsSaveSize;
+ if (CanRestoreDirectly) {
+ if (MBBI != MBB.end() && MBBI->getOpcode() != ARM::tB)
+ CanRestoreDirectly = (MBBI->getOpcode() == ARM::tBX_RET ||
+ MBBI->getOpcode() == ARM::tPOP_RET);
+ else {
+ auto MBBI_prev = MBBI;
+ MBBI_prev--;
+ assert(MBBI_prev->getOpcode() == ARM::tPOP);
+ assert(MBB.succ_size() == 1);
+ if ((*MBB.succ_begin())->begin()->getOpcode() == ARM::tBX_RET)
+ MBBI = MBBI_prev; // Replace the final tPOP with a tPOP_RET.
+ else
+ CanRestoreDirectly = false;
+ }
+ }
- emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize);
+ if (CanRestoreDirectly) {
+ if (!DoIt || MBBI->getOpcode() == ARM::tPOP_RET)
+ return true;
+ MachineInstrBuilder MIB =
+ AddDefaultPred(
+ BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP_RET)));
+ // Copy implicit ops and popped registers, if any.
+ for (auto MO: MBBI->operands())
+ if (MO.isReg() && (MO.isImplicit() || MO.isDef()))
+ MIB.addOperand(MO);
+ MIB.addReg(ARM::PC, RegState::Define);
+ // Erase the old instruction (tBX_RET or tPOP).
+ MBB.erase(MBBI);
+ return true;
+ }
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
- .addReg(ARM::LR, RegState::Define)
- .addReg(ARM::R3, RegState::Kill));
+ // Look for a temporary register to use.
+ // First, compute the liveness information.
+ LivePhysRegs UsedRegs(STI.getRegisterInfo());
+ UsedRegs.addLiveOuts(&MBB, /*AddPristines*/ true);
+ // The semantics of pristines changed recently: the callee-saved
+ // registers that are touched in the function are not part of the
+ // pristines set anymore.
+ // Add those callee-saved registers now.
+ const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+ const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
+ for (unsigned i = 0; CSRegs[i]; ++i)
+ UsedRegs.addReg(CSRegs[i]);
+
+ DebugLoc dl = DebugLoc();
+ if (MBBI != MBB.end()) {
+ dl = MBBI->getDebugLoc();
+ auto InstUpToMBBI = MBB.end();
+ while (InstUpToMBBI != MBBI)
+ // The pre-decrement is on purpose here.
+ // We want to have the liveness right before MBBI.
+ UsedRegs.stepBackward(*--InstUpToMBBI);
+ }
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
- .addReg(ARM::R3, RegState::Define)
- .addReg(ARM::R12, RegState::Kill));
- // Keep the tBX_RET instruction
+ // Look for a register that can be directly used in the POP.
+ unsigned PopReg = 0;
+ // And some temporary register, just in case.
+ unsigned TemporaryReg = 0;
+ BitVector PopFriendly =
+ TRI->getAllocatableSet(MF, TRI->getRegClass(ARM::tGPRRegClassID));
+ assert(PopFriendly.any() && "No allocatable pop-friendly register?!");
+ // Rebuild the GPRs from the high registers because they are removed
+ // from the GPR reg class for Thumb1.
+ BitVector GPRsNoLRSP =
+ TRI->getAllocatableSet(MF, TRI->getRegClass(ARM::hGPRRegClassID));
+ GPRsNoLRSP |= PopFriendly;
+ GPRsNoLRSP.reset(ARM::LR);
+ GPRsNoLRSP.reset(ARM::SP);
+ GPRsNoLRSP.reset(ARM::PC);
+ for (int Register = GPRsNoLRSP.find_first(); Register != -1;
+ Register = GPRsNoLRSP.find_next(Register)) {
+ if (!UsedRegs.contains(Register)) {
+ // Remember the first pop-friendly register and exit.
+ if (PopFriendly.test(Register)) {
+ PopReg = Register;
+ TemporaryReg = 0;
+ break;
+ }
+ // Otherwise, remember that the register will be available to
+ // save a pop-friendly register.
+ TemporaryReg = Register;
}
}
+
+ if (!DoIt && !PopReg && !TemporaryReg)
+ return false;
+
+ assert((PopReg || TemporaryReg) && "Cannot get LR");
+
+ if (TemporaryReg) {
+ assert(!PopReg && "Unnecessary MOV is about to be inserted");
+ PopReg = PopFriendly.find_first();
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+ .addReg(TemporaryReg, RegState::Define)
+ .addReg(PopReg, RegState::Kill));
+ }
+
+ if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPOP_RET) {
+ // We couldn't use the direct restoration above, so
+ // perform the opposite conversion: tPOP_RET to tPOP.
+ MachineInstrBuilder MIB =
+ AddDefaultPred(
+ BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP)));
+ bool Popped = false;
+ for (auto MO: MBBI->operands())
+ if (MO.isReg() && (MO.isImplicit() || MO.isDef()) &&
+ MO.getReg() != ARM::PC) {
+ MIB.addOperand(MO);
+ if (!MO.isImplicit())
+ Popped = true;
+ }
+ // Is there anything left to pop?
+ if (!Popped)
+ MBB.erase(MIB.getInstr());
+ // Erase the old instruction.
+ MBB.erase(MBBI);
+ MBBI = AddDefaultPred(BuildMI(MBB, MBB.end(), dl, TII.get(ARM::tBX_RET)));
+ }
+
+ assert(PopReg && "Do not know how to get LR");
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
+ .addReg(PopReg, RegState::Define);
+
+ emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize);
+
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+ .addReg(ARM::LR, RegState::Define)
+ .addReg(PopReg, RegState::Kill));
+
+ if (TemporaryReg)
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+ .addReg(PopReg, RegState::Define)
+ .addReg(TemporaryReg, RegState::Kill));
+
+ return true;
}
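The search above computes liveness by stepping backwards from the block's live-outs to the insertion point and then looks for a free register, preferring one the Thumb1 POP can encode and otherwise keeping a fallback that can be MOVed through. A condensed, self-contained sketch of that idea, using a made-up 16-register numbering rather than the real ARM register file:

#include <bitset>
#include <cstdio>
#include <vector>

constexpr unsigned NumRegs = 16;
using RegSet = std::bitset<NumRegs>;

struct Inst { RegSet Defs, Uses; }; // toy instruction: which regs it writes/reads

// Liveness at the top of the block: step backwards over every instruction,
// clearing definitions and adding uses (the real pass stops the walk at MBBI).
static RegSet liveBefore(const std::vector<Inst> &Block, RegSet Live) {
  for (auto I = Block.rbegin(); I != Block.rend(); ++I) {
    Live &= ~I->Defs; // a def kills the incoming value
    Live |= I->Uses;  // a use makes the register live
  }
  return Live;
}

// Prefer a free pop-friendly (low) register; otherwise remember any other free
// register so a pop-friendly one can be saved and restored around the POP.
static int pickScratch(const RegSet &Live, const RegSet &PopFriendly) {
  int Temporary = -1;
  for (unsigned R = 0; R < NumRegs; ++R) {
    if (Live.test(R))
      continue;
    if (PopFriendly.test(R))
      return (int)R;       // usable directly in the POP
    if (Temporary < 0)
      Temporary = (int)R;  // fallback: free, but needs an extra MOV
  }
  return Temporary;        // -1 means no free register at all
}

int main() {
  std::vector<Inst> Block(1);
  Block[0].Uses.set(0);                                 // last instruction reads r0
  RegSet LiveOut;                                       // nothing live out
  RegSet PopFriendly;
  for (unsigned R = 0; R < 8; ++R) PopFriendly.set(R);  // "low" registers
  std::printf("scratch = r%d\n",
              pickScratch(liveBefore(Block, LiveOut), PopFriendly)); // r1
  return 0;
}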
bool Thumb1FrameLowering::
@@ -461,8 +581,6 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
DebugLoc DL;
const TargetInstrInfo &TII = *STI.getInstrInfo();
- if (MI != MBB.end()) DL = MI->getDebugLoc();
-
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(ARM::tPUSH));
AddDefaultPred(MIB);
for (unsigned i = CSI.size(); i != 0; --i) {
@@ -501,31 +619,38 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
const TargetInstrInfo &TII = *STI.getInstrInfo();
bool isVarArg = AFI->getArgRegsSaveSize() > 0;
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
MachineInstrBuilder MIB = BuildMI(MF, DL, TII.get(ARM::tPOP));
AddDefaultPred(MIB);
- bool NumRegs = false;
+ bool NeedsPop = false;
for (unsigned i = CSI.size(); i != 0; --i) {
unsigned Reg = CSI[i-1].getReg();
if (Reg == ARM::LR) {
- // Special epilogue for vararg functions. See emitEpilogue
- if (isVarArg)
- continue;
- // ARMv4T requires BX, see emitEpilogue
- if (STI.hasV4TOps() && !STI.hasV5TOps())
+ if (MBB.succ_empty()) {
+ // Special epilogue for vararg functions. See emitEpilogue
+ if (isVarArg)
+ continue;
+ // ARMv4T requires BX, see emitEpilogue
+ if (!STI.hasV5TOps())
+ continue;
+ Reg = ARM::PC;
+ (*MIB).setDesc(TII.get(ARM::tPOP_RET));
+ if (MI != MBB.end())
+ MIB.copyImplicitOps(&*MI);
+ MI = MBB.erase(MI);
+ } else
+ // LR may only be popped into PC as part of a return sequence.
+ // If this isn't the return sequence, we'll need emitPopSpecialFixUp
+ // to restore LR the hard way.
continue;
- Reg = ARM::PC;
- (*MIB).setDesc(TII.get(ARM::tPOP_RET));
- MIB.copyImplicitOps(&*MI);
- MI = MBB.erase(MI);
}
MIB.addReg(Reg, getDefRegState(true));
- NumRegs = true;
+ NeedsPop = true;
}
// It's illegal to emit pop instruction without operands.
- if (NumRegs)
+ if (NeedsPop)
MBB.insert(MI, &*MIB);
else
MF.DeleteMachineInstr(MIB);
diff --git a/lib/Target/ARM/Thumb1FrameLowering.h b/lib/Target/ARM/Thumb1FrameLowering.h
index 31d57325ebd6..812f9830824d 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.h
+++ b/lib/Target/ARM/Thumb1FrameLowering.h
@@ -45,6 +45,42 @@ public:
eliminateCallFramePseudoInstr(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const override;
+
+ /// Check whether or not the given \p MBB can be used as an epilogue
+ /// for the target.
+ /// The epilogue will be inserted before the first terminator of that block.
+ /// This method is used by the shrink-wrapping pass to decide if
+ /// \p MBB will be correctly handled by the target.
+ bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;
+
+private:
+ /// Check if the frame lowering of \p MF needs a special fixup
+ /// code sequence for the epilogue.
+ /// Unlike T2 and ARM mode, the T1 pop instruction cannot restore
+ /// to LR, and we can't pop the value directly to the PC when
+ /// we need to update the SP after popping the value. So instead
+ /// we have to emit:
+ /// POP {r3}
+ /// ADD sp, #offset
+ /// BX r3
+ /// If this would clobber a return value, then generate this sequence instead:
+ /// MOV ip, r3
+ /// POP {r3}
+ /// ADD sp, #offset
+ /// MOV lr, r3
+ /// MOV r3, ip
+ /// BX lr
+ bool needPopSpecialFixUp(const MachineFunction &MF) const;
+
+ /// Emit the special fixup code sequence for the epilogue.
+ /// \see needPopSpecialFixUp for more details.
+ /// \p DoIt tells this method whether or not to actually insert
+ /// the code sequence in \p MBB. I.e., when \p DoIt is false,
+ /// \p MBB is left untouched.
+ /// \returns For \p DoIt == true: true when the emission succeeded,
+ /// false otherwise. For \p DoIt == false: true when the emission
+ /// would have been possible, false otherwise.
+ bool emitPopSpecialFixUp(MachineBasicBlock &MBB, bool DoIt) const;
};
} // End llvm namespace
diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp
index 216e776932be..530e1d33839a 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -84,11 +84,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = *MF.getFrameInfo();
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
- MachineMemOperand::MOStore,
- MFI.getObjectSize(FI),
- MFI.getObjectAlignment(FI));
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
+ MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tSTRspi))
.addReg(SrcReg, getKillRegState(isKill))
.addFrameIndex(FI).addImm(0).addMemOperand(MMO));
@@ -112,11 +110,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = *MF.getFrameInfo();
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
- MachineMemOperand::MOLoad,
- MFI.getObjectSize(FI),
- MFI.getObjectAlignment(FI));
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
+ MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tLDRspi), DestReg)
.addFrameIndex(FI).addImm(0).addMemOperand(MMO));
}
diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
index 68736bc1decd..bf0498dfda69 100644
--- a/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -256,8 +256,8 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) {
LastITMI->findRegisterUseOperand(ARM::ITSTATE)->setIsKill();
// Finalize the bundle.
- MachineBasicBlock::instr_iterator LI = LastITMI;
- finalizeBundle(MBB, InsertPos.getInstrIterator(), std::next(LI));
+ finalizeBundle(MBB, InsertPos.getInstrIterator(),
+ ++LastITMI->getIterator());
Modified = true;
++NumITs;
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index dc74f4e38ff8..4da769f23280 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -131,11 +131,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = *MF.getFrameInfo();
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
- MachineMemOperand::MOStore,
- MFI.getObjectSize(FI),
- MFI.getObjectAlignment(FI));
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
+ MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass ||
RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass ||
@@ -171,11 +169,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = *MF.getFrameInfo();
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
- MachineMemOperand::MOLoad,
- MFI.getObjectSize(FI),
- MFI.getObjectAlignment(FI));
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
+ MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
DebugLoc DL;
if (I != MBB.end()) DL = I->getDebugLoc();
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index d9ab824995c1..bcd0e5751258 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -125,7 +125,10 @@ namespace {
{ ARM::t2LDMIA, ARM::tLDMIA, 0, 0, 0, 1, 1, 1,1, 0,1,0 },
{ ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 0,1,0 },
{ ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0, 0, 1, 1, 1,1, 0,1,0 },
- // ARM::t2STM (with no basereg writeback) has no Thumb1 equivalent
+ // ARM::t2STMIA (with no basereg writeback) has no Thumb1 equivalent.
+ // Using tSTMIA_UPD changes the semantics, so it can only be used when the
+ // base register is killed. This difference is handled correctly elsewhere.
+ { ARM::t2STMIA, ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 },
{ ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 },
{ ARM::t2STMDB_UPD, 0, ARM::tPUSH, 0, 0, 1, 1, 1,1, 0,1,0 }
};
@@ -210,12 +213,12 @@ Thumb2SizeReduce::Thumb2SizeReduce(std::function<bool(const Function &)> Ftor)
for (unsigned i = 0, e = array_lengthof(ReduceTable); i != e; ++i) {
unsigned FromOpc = ReduceTable[i].WideOpc;
if (!ReduceOpcodeMap.insert(std::make_pair(FromOpc, i)).second)
- assert(false && "Duplicated entries?");
+ llvm_unreachable("Duplicated entries?");
}
}
static bool HasImplicitCPSRDef(const MCInstrDesc &MCID) {
- for (const uint16_t *Regs = MCID.getImplicitDefs(); *Regs; ++Regs)
+ for (const MCPhysReg *Regs = MCID.getImplicitDefs(); *Regs; ++Regs)
if (*Regs == ARM::CPSR)
return true;
return false;
@@ -435,6 +438,14 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
isLdStMul = true;
break;
}
+ case ARM::t2STMIA: {
+ // If the base register is killed, we don't care what its value is after the
+ // instruction, so we can use an updating STMIA.
+ if (!MI->getOperand(0).isKill())
+ return false;
+
+ break;
+ }
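Narrowing t2STMIA is only legal because Thumb1 has just the updating tSTMIA_UPD form: clobbering the base register with the writeback must be unobservable, which is exactly what the kill flag checked above guarantees. A tiny illustrative model of that guard (Operand is a hypothetical stand-in, not the MachineOperand API):

// The updating form writes the post-increment address back into the base
// register, so it only matches the non-updating wide instruction when nothing
// reads the base afterwards (the operand is killed). The narrowed instruction
// then marks its base definition as dead, mirroring the code added further down.
struct Operand { unsigned Reg; bool IsKill; bool IsDeadDef; };

static bool narrowStoreMultiple(Operand &Base) {
  if (!Base.IsKill)
    return false;         // the old base value is still needed; keep t2STMIA
  Base.IsDeadDef = true;  // tSTMIA_UPD defines the base, but nobody uses it
  return true;
}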
case ARM::t2LDMIA_RET: {
unsigned BaseReg = MI->getOperand(1).getReg();
if (BaseReg != ARM::SP)
@@ -492,6 +503,12 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
// Add the 16-bit load / store instruction.
DebugLoc dl = MI->getDebugLoc();
MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, TII->get(Opc));
+
+ // tSTMIA_UPD takes a defining register operand. We've already checked that
+ // the register is killed, so mark it as dead here.
+ if (Entry.WideOpc == ARM::t2STMIA)
+ MIB.addReg(MI->getOperand(0).getReg(), RegState::Define | RegState::Dead);
+
if (!isLdStMul) {
MIB.addOperand(MI->getOperand(0));
MIB.addOperand(MI->getOperand(1));
@@ -633,10 +650,9 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr))
return false;
- if (!MinimizeSize && !OptimizeSize && Entry.AvoidMovs &&
- STI->avoidMOVsShifterOperand())
+ if (!OptimizeSize && Entry.AvoidMovs && STI->avoidMOVsShifterOperand())
// Don't issue movs with shifter operand for some CPUs unless we
- // are optimizing / minimizing for size.
+ // are optimizing for size.
return false;
unsigned Reg0 = MI->getOperand(0).getReg();
@@ -660,11 +676,13 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
}
} else if (Reg0 != Reg1) {
// Try to commute the operands to make it a 2-address instruction.
- unsigned CommOpIdx1, CommOpIdx2;
+ unsigned CommOpIdx1 = 1;
+ unsigned CommOpIdx2 = TargetInstrInfo::CommuteAnyOperandIndex;
if (!TII->findCommutedOpIndices(MI, CommOpIdx1, CommOpIdx2) ||
- CommOpIdx1 != 1 || MI->getOperand(CommOpIdx2).getReg() != Reg0)
+ MI->getOperand(CommOpIdx2).getReg() != Reg0)
return false;
- MachineInstr *CommutedMI = TII->commuteInstruction(MI);
+ MachineInstr *CommutedMI =
+ TII->commuteInstruction(MI, false, CommOpIdx1, CommOpIdx2);
if (!CommutedMI)
return false;
}
@@ -750,10 +768,9 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit))
return false;
- if (!MinimizeSize && !OptimizeSize && Entry.AvoidMovs &&
- STI->avoidMOVsShifterOperand())
+ if (!OptimizeSize && Entry.AvoidMovs && STI->avoidMOVsShifterOperand())
// Don't issue movs with shifter operand for some CPUs unless we
- // are optimizing / minimizing for size.
+ // are optimizing for size.
return false;
unsigned Limit = ~0U;
@@ -1012,9 +1029,9 @@ bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) {
TII = static_cast<const Thumb2InstrInfo *>(STI->getInstrInfo());
- // Optimizing / minimizing size?
- OptimizeSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
- MinimizeSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize);
+ // Optimizing / minimizing size? Minimizing size implies optimizing for size.
+ OptimizeSize = MF.getFunction()->optForSize();
+ MinimizeSize = MF.getFunction()->optForMinSize();
BlockInfo.clear();
BlockInfo.resize(MF.getNumBlockIDs());
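The switch above from the raw attribute checks to optForSize()/optForMinSize() relies on the stated rule that minimizing size implies optimizing for size. A simplified stand-in for that relationship (not the llvm::Function implementation, just the intended semantics):

// MinSize is the stronger request, so the weaker predicate also reports true
// for minsize functions; code that only needs "some" size optimization can
// therefore test optForSize() alone, as the reduced checks above now do.
struct FuncAttrsSketch {
  bool HasOptSize = false; // stands in for the optsize attribute
  bool HasMinSize = false; // stands in for the minsize attribute
  bool optForMinSize() const { return HasMinSize; }
  bool optForSize() const { return HasOptSize || optForMinSize(); }
};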
diff --git a/lib/Target/AVR/AVR.td b/lib/Target/AVR/AVR.td
new file mode 100644
index 000000000000..9e80717cd680
--- /dev/null
+++ b/lib/Target/AVR/AVR.td
@@ -0,0 +1,563 @@
+//===-- AVR.td - Describe the AVR Target Machine ----------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+// This is the top level entry point for the AVR target.
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===---------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===---------------------------------------------------------------------===//
+// AVR Subtarget Features.
+//===---------------------------------------------------------------------===//
+
+// :TODO: Implement the skip errata, see `gcc/config/avr/avr-arch.h` for details
+// :TODO: We define all devices with SRAM to have all variants of LD/ST/LDD/STD.
+// In reality, avr1 (no SRAM) has one variant each of `LD` and `ST`.
+// avr2 (with SRAM) adds the rest of the variants.
+// :TODO: s/AVRTiny/Tiny
+
+
+// A feature set aggregates features, grouping them. We don't want to create a
+// new member in AVRSubtarget (to store a value) for each set because we do not
+// care if the set is supported, only the subfeatures inside the set. We fix
+// this by simply setting the same dummy member for all feature sets, which is
+// then ignored.
+class FeatureSet<string name, string desc, list<SubtargetFeature> i>
+ : SubtargetFeature<name, "m_FeatureSetDummy", "true", desc, i>;
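The comment above explains why every feature set writes the same throwaway member: only the sub-features are ever queried. A rough C++ picture of what that means for the subtarget state (a hand-written sketch, not TableGen output; the member and family names follow definitions elsewhere in this file):

struct AVRFeatureStateSketch {
  bool m_hasSRAM = false;
  bool m_hasLPM = false;
  bool m_hasIJMPCALL = false;
  bool m_hasADDSUBIW = false;
  bool m_FeatureSetDummy = false; // shared by every Family/FeatureSet, never read
};

// Hypothetical expansion of Family<"avr2", ...> (defined later in this file):
// the set itself only flips the dummy, while the real capability bits come from
// the sub-features it pulls in.
static void applyFamilyAVR2(AVRFeatureStateSketch &F) {
  F.m_FeatureSetDummy = true; // the set carries no information of its own
  F.m_hasLPM = true;          // via FamilyAVR1
  F.m_hasIJMPCALL = true;     // FeatureIJMPCALL
  F.m_hasADDSUBIW = true;     // FeatureADDSUBIW
  F.m_hasSRAM = true;         // FeatureSRAM
}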
+
+// A family of microcontrollers, defining a set of supported features.
+class Family<string name, list<SubtargetFeature> i>
+ : FeatureSet<name, !strconcat("The device is a part of the ",
+ name, " family"), i>;
+
+// The device has SRAM, and supports the bare minimum of
+// SRAM-relevant instructions.
+//
+// These are:
+// LD - all 9 variants
+// ST - all 9 variants
+// LDD - two variants for Y and Z
+// STD - two variants for Y and Z
+// `LDS Rd, K`
+// `STS k, Rr`
+// `PUSH`/`POP`
+def FeatureSRAM : SubtargetFeature<"sram", "m_hasSRAM", "true",
+ "The device has random access memory">;
+
+// The device supports the `JMP k` and `CALL k` instructions.
+def FeatureJMPCALL : SubtargetFeature<"jmpcall", "m_hasJMPCALL", "true",
+ "The device supports the `JMP` and "
+ "`CALL` instructions">;
+
+
+// The device supports the indirect branches `IJMP` and `ICALL`.
+def FeatureIJMPCALL : SubtargetFeature<"ijmpcall", "m_hasIJMPCALL",
+ "true",
+ "The device supports `IJMP`/`ICALL`"
+ "instructions">;
+
+// The device supports the extended indirect branches `EIJMP` and `EICALL`.
+def FeatureEIJMPCALL : SubtargetFeature<"eijmpcall", "m_hasEIJMPCALL",
+ "true", "The device supports the "
+ "`EIJMP`/`EICALL` instructions">;
+
+// The device supports the 16-bit `ADIW Rd, K` and `SBIW Rd, K` instructions.
+def FeatureADDSUBIW : SubtargetFeature<"addsubiw", "m_hasADDSUBIW",
+ "true", "Enable 16-bit register-immediate "
+ "addition and subtraction instructions">;
+
+// The device has an 8-bit stack pointer (SP) register.
+def FeatureSmallStack : SubtargetFeature<"smallstack", "m_hasSmallStack",
+ "true", "The device has an 8-bit "
+ "stack pointer">;
+
+// The device supports the 16-bit GPR pair MOVW instruction.
+def FeatureMOVW : SubtargetFeature<"movw", "m_hasMOVW", "true",
+ "The device supports the 16-bit MOVW "
+ "instruction">;
+
+// The device supports the `LPM` instruction, with implied destination being r0.
+def FeatureLPM : SubtargetFeature<"lpm", "m_hasLPM", "true",
+ "The device supports the `LPM` instruction">;
+
+// The device supports the `LPM Rd, Z[+]` instruction.
+def FeatureLPMX : SubtargetFeature<"lpmx", "m_hasLPMX", "true",
+ "The device supports the `LPM Rd, Z[+]` "
+ "instruction">;
+
+// The device supports the `ELPM` instruction.
+def FeatureELPM : SubtargetFeature<"elpm", "m_hasELPM", "true",
+ "The device supports the ELPM instruction">;
+
+// The device supports the `ELPM Rd, Z[+]` instructions.
+def FeatureELPMX : SubtargetFeature<"elpmx", "m_hasELPMX", "true",
+ "The device supports the `ELPM Rd, Z[+]` "
+ "instructions">;
+
+// The device supports the `SPM` instruction.
+def FeatureSPM : SubtargetFeature<"spm", "m_hasSPM", "true",
+ "The device supports the `SPM` instruction">;
+
+// The device supports the `SPM Z+` instruction.
+def FeatureSPMX : SubtargetFeature<"spmx", "m_hasSPMX", "true",
+ "The device supports the `SPM Z+` "
+ "instruction">;
+
+// The device supports the `DES k` instruction.
+def FeatureDES : SubtargetFeature<"des", "m_hasDES", "true",
+ "The device supports the `DES k` encryption "
+ "instruction">;
+
+// The device supports the Read-Write-Modify instructions
+// XCH, LAS, LAC, and LAT.
+def FeatureRMW : SubtargetFeature<"rmw", "m_supportsRMW", "true",
+ "The device supports the read-write-modify "
+ "instructions: XCH, LAS, LAC, LAT">;
+
+// The device supports the `[F]MUL[S][U]` family of instructions.
+def FeatureMultiplication : SubtargetFeature<"mul", "m_supportsMultiplication",
+ "true", "The device supports the "
+ "multiplication instructions">;
+
+// The device supports the `BREAK` instruction.
+def FeatureBREAK : SubtargetFeature<"break", "m_hasBREAK", "true",
+ "The device supports the `BREAK` debugging "
+ "instruction">;
+
+// The device has instruction encodings specific to the Tiny core.
+def FeatureTinyEncoding : SubtargetFeature<"tinyencoding",
+ "m_hasTinyEncoding", "true",
+ "The device has Tiny core specific "
+ "instruction encodings">;
+
+class ELFArch<string name> : SubtargetFeature<"", "ELFArch",
+ !strconcat("ELF::",name), "">;
+
+// ELF e_flags architecture values
+def ELFArchAVR1 : ELFArch<"EF_AVR_ARCH_AVR1">;
+def ELFArchAVR2 : ELFArch<"EF_AVR_ARCH_AVR2">;
+def ELFArchAVR25 : ELFArch<"EF_AVR_ARCH_AVR25">;
+def ELFArchAVR3 : ELFArch<"EF_AVR_ARCH_AVR3">;
+def ELFArchAVR31 : ELFArch<"EF_AVR_ARCH_AVR31">;
+def ELFArchAVR35 : ELFArch<"EF_AVR_ARCH_AVR35">;
+def ELFArchAVR4 : ELFArch<"EF_AVR_ARCH_AVR4">;
+def ELFArchAVR5 : ELFArch<"EF_AVR_ARCH_AVR5">;
+def ELFArchAVR51 : ELFArch<"EF_AVR_ARCH_AVR51">;
+def ELFArchAVR6 : ELFArch<"EF_AVR_ARCH_AVR6">;
+def ELFArchAVRTiny : ELFArch<"EF_AVR_ARCH_AVRTINY">;
+def ELFArchXMEGA1 : ELFArch<"EF_AVR_ARCH_XMEGA1">;
+def ELFArchXMEGA2 : ELFArch<"EF_AVR_ARCH_XMEGA2">;
+def ELFArchXMEGA3 : ELFArch<"EF_AVR_ARCH_XMEGA3">;
+def ELFArchXMEGA4 : ELFArch<"EF_AVR_ARCH_XMEGA4">;
+def ELFArchXMEGA5 : ELFArch<"EF_AVR_ARCH_XMEGA5">;
+def ELFArchXMEGA6 : ELFArch<"EF_AVR_ARCH_XMEGA6">;
+def ELFArchXMEGA7 : ELFArch<"EF_AVR_ARCH_XMEGA7">;
+
+//===---------------------------------------------------------------------===//
+// AVR Families
+//===---------------------------------------------------------------------===//
+
+// The device has at least the bare minimum that **every** single AVR
+// device should have.
+def FamilyAVR0 : Family<"avr0", []>;
+
+def FamilyAVR1 : Family<"avr1", [FamilyAVR0, FeatureLPM]>;
+
+def FamilyAVR2 : Family<"avr2",
+ [FamilyAVR1, FeatureIJMPCALL, FeatureADDSUBIW,
+ FeatureSRAM]>;
+
+def FamilyAVR25 : Family<"avr25",
+ [FamilyAVR2, FeatureMOVW, FeatureLPMX,
+ FeatureSPM, FeatureBREAK]>;
+
+def FamilyAVR3 : Family<"avr3",
+ [FamilyAVR2, FeatureJMPCALL]>;
+
+def FamilyAVR31 : Family<"avr31",
+ [FamilyAVR3, FeatureELPM]>;
+
+def FamilyAVR35 : Family<"avr35",
+ [FamilyAVR3, FeatureMOVW, FeatureLPMX,
+ FeatureSPM, FeatureBREAK]>;
+
+def FamilyAVR4 : Family<"avr4",
+ [FamilyAVR2, FeatureMultiplication,
+ FeatureMOVW, FeatureLPMX, FeatureSPM,
+ FeatureBREAK]>;
+
+def FamilyAVR5 : Family<"avr5",
+ [FamilyAVR3, FeatureMultiplication,
+ FeatureMOVW, FeatureLPMX, FeatureSPM,
+ FeatureBREAK]>;
+
+def FamilyAVR51 : Family<"avr51",
+ [FamilyAVR5, FeatureELPM, FeatureELPMX]>;
+
+def FamilyAVR6 : Family<"avr6",
+ [FamilyAVR51]>;
+
+def FamilyAVRTiny : Family<"avrtiny",
+ [FamilyAVR0, FeatureBREAK, FeatureSRAM,
+ FeatureTinyEncoding]>;
+
+def FamilyXMEGA : Family<"xmega",
+ [FamilyAVR51, FeatureEIJMPCALL, FeatureSPMX,
+ FeatureDES]>;
+
+def FamilyXMEGAU : Family<"xmegau",
+ [FamilyXMEGA, FeatureRMW]>;
+
+def FeatureSetSpecial : FeatureSet<"special",
+ "Enable use of the entire instruction "
+ "set - used for debugging",
+ [FeatureSRAM, FeatureJMPCALL,
+ FeatureIJMPCALL, FeatureEIJMPCALL,
+ FeatureADDSUBIW, FeatureMOVW,
+ FeatureLPM, FeatureLPMX, FeatureELPM,
+ FeatureELPMX, FeatureSPM, FeatureSPMX,
+ FeatureDES, FeatureRMW,
+ FeatureMultiplication, FeatureBREAK]>;
+
+//===---------------------------------------------------------------------===//
+// AVR microcontrollers supported.
+//===---------------------------------------------------------------------===//
+
+class Device<string Name, Family Fam, ELFArch Arch,
+ list<SubtargetFeature> ExtraFeatures = []>
+ : Processor<Name, NoItineraries, !listconcat([Fam,Arch],ExtraFeatures)>;
+
+// Generic MCUs
+// Note that several versions of GCC have strange ELF architecture
+// settings for backwards compatibility - see `gas/config/tc-avr.c`
+// in AVR binutils. We do not replicate this.
+def : Device<"avr1", FamilyAVR1, ELFArchAVR1>;
+def : Device<"avr2", FamilyAVR2, ELFArchAVR2>;
+def : Device<"avr25", FamilyAVR25, ELFArchAVR25>;
+def : Device<"avr3", FamilyAVR3, ELFArchAVR3>;
+def : Device<"avr31", FamilyAVR31, ELFArchAVR31>;
+def : Device<"avr35", FamilyAVR35, ELFArchAVR35>;
+def : Device<"avr4", FamilyAVR4, ELFArchAVR4>;
+def : Device<"avr5", FamilyAVR5, ELFArchAVR5>;
+def : Device<"avr51", FamilyAVR51, ELFArchAVR51>;
+def : Device<"avr6", FamilyAVR6, ELFArchAVR6>;
+def : Device<"avrxmega1", FamilyXMEGA, ELFArchXMEGA1>;
+def : Device<"avrxmega2", FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"avrxmega3", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"avrxmega4", FamilyXMEGA, ELFArchXMEGA4>;
+def : Device<"avrxmega5", FamilyXMEGA, ELFArchXMEGA5>;
+def : Device<"avrxmega6", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"avrxmega7", FamilyXMEGA, ELFArchXMEGA7>;
+def : Device<"avrtiny", FamilyAVRTiny, ELFArchAVRTiny>;
+
+// Specific MCUs
+def : Device<"at90s1200", FamilyAVR0, ELFArchAVR1>;
+def : Device<"attiny11", FamilyAVR1, ELFArchAVR1>;
+def : Device<"attiny12", FamilyAVR1, ELFArchAVR1>;
+def : Device<"attiny15", FamilyAVR1, ELFArchAVR1>;
+def : Device<"attiny28", FamilyAVR1, ELFArchAVR1>;
+def : Device<"at90s2313", FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s2323", FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s2333", FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s2343", FamilyAVR2, ELFArchAVR2>;
+def : Device<"attiny22", FamilyAVR2, ELFArchAVR2>;
+def : Device<"attiny26", FamilyAVR2, ELFArchAVR2, [FeatureLPMX]>;
+def : Device<"at86rf401", FamilyAVR2, ELFArchAVR25,
+ [FeatureMOVW, FeatureLPMX]>;
+def : Device<"at90s4414", FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s4433", FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s4434", FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s8515", FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90c8534", FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s8535", FamilyAVR2, ELFArchAVR2>;
+def : Device<"ata5272", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny13", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny13a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny2313", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny2313a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny24", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny24a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny4313", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny44", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny44a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny84", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny84a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny25", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny45", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny85", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny261", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny261a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny461", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny461a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny861", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny861a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny87", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny43u", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny48", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny88", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny828", FamilyAVR25, ELFArchAVR25>;
+def : Device<"at43usb355", FamilyAVR3, ELFArchAVR3>;
+def : Device<"at76c711", FamilyAVR3, ELFArchAVR3>;
+def : Device<"atmega103", FamilyAVR31, ELFArchAVR31>;
+def : Device<"at43usb320", FamilyAVR31, ELFArchAVR31>;
+def : Device<"attiny167", FamilyAVR35, ELFArchAVR35>;
+def : Device<"at90usb82", FamilyAVR35, ELFArchAVR35>;
+def : Device<"at90usb162", FamilyAVR35, ELFArchAVR35>;
+def : Device<"ata5505", FamilyAVR35, ELFArchAVR35>;
+def : Device<"atmega8u2", FamilyAVR35, ELFArchAVR35>;
+def : Device<"atmega16u2", FamilyAVR35, ELFArchAVR35>;
+def : Device<"atmega32u2", FamilyAVR35, ELFArchAVR35>;
+def : Device<"attiny1634", FamilyAVR35, ELFArchAVR35>;
+def : Device<"atmega8", FamilyAVR4, ELFArchAVR4>; // FIXME: family may be wrong
+def : Device<"ata6289", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega8a", FamilyAVR4, ELFArchAVR4>;
+def : Device<"ata6285", FamilyAVR4, ELFArchAVR4>;
+def : Device<"ata6286", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega48", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega48a", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega48pa", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega48p", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega88", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega88a", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega88p", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega88pa", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega8515", FamilyAVR2, ELFArchAVR4,
+ [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>;
+def : Device<"atmega8535", FamilyAVR2, ELFArchAVR4,
+ [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>;
+def : Device<"atmega8hva", FamilyAVR4, ELFArchAVR4>;
+def : Device<"at90pwm1", FamilyAVR4, ELFArchAVR4>;
+def : Device<"at90pwm2", FamilyAVR4, ELFArchAVR4>;
+def : Device<"at90pwm2b", FamilyAVR4, ELFArchAVR4>;
+def : Device<"at90pwm3", FamilyAVR4, ELFArchAVR4>;
+def : Device<"at90pwm3b", FamilyAVR4, ELFArchAVR4>;
+def : Device<"at90pwm81", FamilyAVR4, ELFArchAVR4>;
+def : Device<"ata5790", FamilyAVR5, ELFArchAVR5>;
+def : Device<"ata5795", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega16", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega16a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega161", FamilyAVR3, ELFArchAVR5,
+ [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>;
+def : Device<"atmega162", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega163", FamilyAVR3, ELFArchAVR5,
+ [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>;
+def : Device<"atmega164a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega164p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega164pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega165", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega165a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega165p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega165pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega168", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega168a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega168p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega168pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega169", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega169a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega169p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega169pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega32", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega32a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega323", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega324a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega324p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega324pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega325", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega325a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega325p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega325pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega3250", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega3250a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega3250p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega3250pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega328", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega328p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega329", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega329a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega329p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega329pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega3290", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega3290a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega3290p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega3290pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega406", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega64", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega64a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega640", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega644", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega644a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega644p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega644pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega645", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega645a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega645p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega649", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega649a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega649p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega6450", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega6450a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega6450p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega6490", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega6490a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega6490p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega64rfr2", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega644rfr2", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega16hva", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega16hva2", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega16hvb", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega16hvbrevb", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega32hvb", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega32hvbrevb", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega64hve", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at90can32", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at90can64", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at90pwm161", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at90pwm216", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at90pwm316", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega32c1", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega64c1", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega16m1", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega32m1", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega64m1", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega16u4", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega32u4", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega32u6", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at90usb646", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at90usb647", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at90scr100", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at94k", FamilyAVR3, ELFArchAVR5,
+ [FeatureMultiplication, FeatureMOVW, FeatureLPMX]>;
+def : Device<"m3000", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega128", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega128a", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega1280", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega1281", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega1284", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega1284p", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega128rfa1", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega128rfr2", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega1284rfr2", FamilyAVR51, ELFArchAVR51>;
+def : Device<"at90can128", FamilyAVR51, ELFArchAVR51>;
+def : Device<"at90usb1286", FamilyAVR51, ELFArchAVR51>;
+def : Device<"at90usb1287", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega2560", FamilyAVR6, ELFArchAVR6>;
+def : Device<"atmega2561", FamilyAVR6, ELFArchAVR6>;
+def : Device<"atmega256rfr2", FamilyAVR6, ELFArchAVR6>;
+def : Device<"atmega2564rfr2", FamilyAVR6, ELFArchAVR6>;
+def : Device<"atxmega16a4", FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega16a4u", FamilyXMEGAU, ELFArchXMEGA2>;
+def : Device<"atxmega16c4", FamilyXMEGAU, ELFArchXMEGA2>;
+def : Device<"atxmega16d4", FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega32a4", FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega32a4u", FamilyXMEGAU, ELFArchXMEGA2>;
+def : Device<"atxmega32c4", FamilyXMEGAU, ELFArchXMEGA2>;
+def : Device<"atxmega32d4", FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega32e5", FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega16e5", FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega8e5", FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega32x1", FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega64a3", FamilyXMEGA, ELFArchXMEGA4>;
+def : Device<"atxmega64a3u", FamilyXMEGAU, ELFArchXMEGA4>;
+def : Device<"atxmega64a4u", FamilyXMEGAU, ELFArchXMEGA4>;
+def : Device<"atxmega64b1", FamilyXMEGAU, ELFArchXMEGA4>;
+def : Device<"atxmega64b3", FamilyXMEGAU, ELFArchXMEGA4>;
+def : Device<"atxmega64c3", FamilyXMEGAU, ELFArchXMEGA4>;
+def : Device<"atxmega64d3", FamilyXMEGA, ELFArchXMEGA4>;
+def : Device<"atxmega64d4", FamilyXMEGA, ELFArchXMEGA4>;
+def : Device<"atxmega64a1", FamilyXMEGA, ELFArchXMEGA5>;
+def : Device<"atxmega64a1u", FamilyXMEGAU, ELFArchXMEGA5>;
+def : Device<"atxmega128a3", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega128a3u", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega128b1", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega128b3", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega128c3", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega128d3", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega128d4", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega192a3", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega192a3u", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega192c3", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega192d3", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega256a3", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega256a3u", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega256a3b", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega256a3bu", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega256c3", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega256d3", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega384c3", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega384d3", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega128a1", FamilyXMEGA, ELFArchXMEGA7>;
+def : Device<"atxmega128a1u", FamilyXMEGAU, ELFArchXMEGA7>;
+def : Device<"atxmega128a4u", FamilyXMEGAU, ELFArchXMEGA7>;
+def : Device<"attiny4", FamilyAVRTiny, ELFArchAVRTiny>;
+def : Device<"attiny5", FamilyAVRTiny, ELFArchAVRTiny>;
+def : Device<"attiny9", FamilyAVRTiny, ELFArchAVRTiny>;
+def : Device<"attiny10", FamilyAVRTiny, ELFArchAVRTiny>;
+def : Device<"attiny20", FamilyAVRTiny, ELFArchAVRTiny>;
+def : Device<"attiny40", FamilyAVRTiny, ELFArchAVRTiny>;
+
+//===---------------------------------------------------------------------===//
+// Register File Description
+//===---------------------------------------------------------------------===//
+
+include "AVRRegisterInfo.td"
+
+//===---------------------------------------------------------------------===//
+// Instruction Descriptions
+//===---------------------------------------------------------------------===//
+
+//include "AVRInstrInfo.td"
+
+//def AVRInstrInfo : InstrInfo;
+
+//===---------------------------------------------------------------------===//
+// Calling Conventions
+//===---------------------------------------------------------------------===//
+
+include "AVRCallingConv.td"
+
+//===---------------------------------------------------------------------===//
+// Assembly Printers
+//===---------------------------------------------------------------------===//
+
+// def AVRAsmWriter : AsmWriter {
+// string AsmWriterClassName = "InstPrinter";
+// bit isMCAsmWriter = 1;
+// }
+
+//===---------------------------------------------------------------------===//
+// Assembly Parsers
+//===---------------------------------------------------------------------===//
+
+// def AVRAsmParser : AsmParser {
+// let ShouldEmitMatchRegisterName = 1;
+// let ShouldEmitMatchRegisterAltName = 1;
+// }
+
+// def AVRAsmParserVariant : AsmParserVariant {
+// int Variant = 0;
+//
+// // Recognize hard coded registers.
+// string RegisterPrefix = "$";
+// }
+
+//===---------------------------------------------------------------------===//
+// Target Declaration
+//===---------------------------------------------------------------------===//
+
+def AVR : Target {
+// let InstructionSet = AVRInstrInfo;
+// let AssemblyWriters = [AVRAsmWriter];
+//
+// let AssemblyParsers = [AVRAsmParser];
+// let AssemblyParserVariants = [AVRAsmParserVariant];
+}
+
diff --git a/lib/Target/AVR/AVRCallingConv.td b/lib/Target/AVR/AVRCallingConv.td
new file mode 100644
index 000000000000..d8cb3fe84022
--- /dev/null
+++ b/lib/Target/AVR/AVRCallingConv.td
@@ -0,0 +1,65 @@
+//===-- AVRCallingConv.td - Calling Conventions for AVR ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for the AVR architecture.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// AVR Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+
+def RetCC_AVR : CallingConv
+<[
+ // i8 is returned in R24.
+ CCIfType<[i8], CCAssignToReg<[R24]>>,
+
+ // i16 values are returned in R25:R24, R23:R22, R21:R20 and R19:R18.
+ CCIfType<[i16], CCAssignToReg<[R25R24, R23R22, R21R20, R19R18]>>
+]>;
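As a concrete reading of RetCC_AVR, the sketch below walks the register-pair list in the order the convention hands pairs out when a return value has been legalized into several i16 parts. The pair list mirrors the definition above; the actual assignment is performed by the TableGen-generated calling-convention code, so this is illustrative only.

#include <cstdio>

int main() {
  // Register pairs in the order RetCC_AVR assigns them to i16 parts.
  const char *Pairs[] = {"R25:R24", "R23:R22", "R21:R20", "R19:R18"};
  unsigned NumI16Parts = 2; // e.g. a 32-bit value split into two 16-bit halves
  for (unsigned I = 0; I != NumI16Parts; ++I)
    std::printf("part %u -> %s\n", I, Pairs[I]); // R25:R24, then R23:R22
  return 0;
}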
+
+// Special return value calling convention for runtime functions.
+def RetCC_AVR_RT : CallingConv
+<[
+ CCIfType<[i8], CCAssignToReg<[R24,R25]>>,
+ CCIfType<[i16], CCAssignToReg<[R23R22, R25R24]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// AVR Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// The calling conventions are implemented in custom C++ code
+
+// Calling convention for variadic functions.
+def ArgCC_AVR_Vararg : CallingConv
+<[
+ // i16 values are always passed on the stack with an alignment of 1.
+ CCAssignToStack<2, 1>
+]>;
+
+// Special argument calling convention for
+// multiplication runtime functions.
+def ArgCC_AVR_RT_MUL : CallingConv
+<[
+ CCIfType<[i16], CCAssignToReg<[R27R26,R19R18]>>
+]>;
+
+// Special argument calling convention for
+// division runtime functions.
+def ArgCC_AVR_RT_DIV : CallingConv
+<[
+ CCIfType<[i8], CCAssignToReg<[R24,R22]>>,
+ CCIfType<[i16], CCAssignToReg<[R25R24, R23R22]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Callee-saved register lists.
+//===----------------------------------------------------------------------===//
+
+def CSR_Normal : CalleeSavedRegs<(add R29, R28, (sequence "R%u", 17, 2))>;
+def CSR_Interrupts : CalleeSavedRegs<(add (sequence "R%u", 31, 0))>;
diff --git a/lib/Target/AVR/AVRConfig.h b/lib/Target/AVR/AVRConfig.h
new file mode 100644
index 000000000000..65588bc50840
--- /dev/null
+++ b/lib/Target/AVR/AVRConfig.h
@@ -0,0 +1,15 @@
+//===-- AVRConfig.h - AVR Backend Configuration Header ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_CONFIG_H
+#define LLVM_AVR_CONFIG_H
+
+#define LLVM_AVR_GCC_COMPAT
+
+#endif // LLVM_AVR_CONFIG_H
diff --git a/lib/Target/AVR/AVRMachineFunctionInfo.h b/lib/Target/AVR/AVRMachineFunctionInfo.h
new file mode 100644
index 000000000000..6571d5d3e603
--- /dev/null
+++ b/lib/Target/AVR/AVRMachineFunctionInfo.h
@@ -0,0 +1,73 @@
+//===-- AVRMachineFunctionInfo.h - AVR machine function info ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares AVR-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_MACHINE_FUNCTION_INFO_H
+#define LLVM_AVR_MACHINE_FUNCTION_INFO_H
+
+#include "AVRConfig.h"
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/**
+ * Contains AVR-specific information for each MachineFunction.
+ */
+class AVRMachineFunctionInfo : public MachineFunctionInfo {
+ /// Indicates if a register has been spilled by the register
+ /// allocator.
+ bool HasSpills;
+
+ /// Indicates if there are any fixed size allocas present.
+ /// Note that if there are only variable sized allocas this is set to false.
+ bool HasAllocas;
+
+ /// Indicates if arguments passed using the stack are being
+ /// used inside the function.
+ bool HasStackArgs;
+
+ /// Size of the callee-saved register portion of the
+ /// stack frame in bytes.
+ unsigned CalleeSavedFrameSize;
+
+ /// FrameIndex for start of varargs area.
+ int VarArgsFrameIndex;
+
+public:
+ AVRMachineFunctionInfo()
+ : HasSpills(false), HasAllocas(false), HasStackArgs(false),
+ CalleeSavedFrameSize(0), VarArgsFrameIndex(0) {}
+
+ explicit AVRMachineFunctionInfo(MachineFunction &MF)
+ : HasSpills(false), HasAllocas(false), HasStackArgs(false),
+ CalleeSavedFrameSize(0), VarArgsFrameIndex(0) {}
+
+ bool getHasSpills() const { return HasSpills; }
+ void setHasSpills(bool B) { HasSpills = B; }
+
+ bool getHasAllocas() const { return HasAllocas; }
+ void setHasAllocas(bool B) { HasAllocas = B; }
+
+ bool getHasStackArgs() const { return HasStackArgs; }
+ void setHasStackArgs(bool B) { HasStackArgs = B; }
+
+ unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+ void setCalleeSavedFrameSize(unsigned Bytes) { CalleeSavedFrameSize = Bytes; }
+
+ int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+ void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; }
+};
+
+} // end llvm namespace
+
+#endif // LLVM_AVR_MACHINE_FUNCTION_INFO_H
diff --git a/lib/Target/AVR/AVRRegisterInfo.td b/lib/Target/AVR/AVRRegisterInfo.td
new file mode 100644
index 000000000000..32650fc66751
--- /dev/null
+++ b/lib/Target/AVR/AVRRegisterInfo.td
@@ -0,0 +1,216 @@
+//===-- AVRRegisterInfo.td - AVR Register defs -------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Declarations that describe the AVR register file
+//===----------------------------------------------------------------------===//
+
+// 8-bit General purpose register definition.
+class AVRReg<bits<16> num,
+ string name,
+ list<Register> subregs = [],
+ list<string> altNames = []>
+ : RegisterWithSubRegs<name, subregs>
+{
+ field bits<16> Num = num;
+
+ let HWEncoding = num;
+ let Namespace = "AVR";
+ let SubRegs = subregs;
+ let AltNames = altNames;
+}
+
+// Subregister indices.
+let Namespace = "AVR" in
+{
+ def sub_lo : SubRegIndex<8>;
+ def sub_hi : SubRegIndex<8, 8>;
+}
+
+let Namespace = "AVR" in {
+ def ptr : RegAltNameIndex;
+}
+
+
+//===----------------------------------------------------------------------===//
+// 8-bit general purpose registers
+//===----------------------------------------------------------------------===//
+
+def R0 : AVRReg<0, "r0">, DwarfRegNum<[0]>;
+def R1 : AVRReg<1, "r1">, DwarfRegNum<[1]>;
+def R2 : AVRReg<2, "r2">, DwarfRegNum<[2]>;
+def R3 : AVRReg<3, "r3">, DwarfRegNum<[3]>;
+def R4 : AVRReg<4, "r4">, DwarfRegNum<[4]>;
+def R5 : AVRReg<5, "r5">, DwarfRegNum<[5]>;
+def R6 : AVRReg<6, "r6">, DwarfRegNum<[6]>;
+def R7 : AVRReg<7, "r7">, DwarfRegNum<[7]>;
+def R8 : AVRReg<8, "r8">, DwarfRegNum<[8]>;
+def R9 : AVRReg<9, "r9">, DwarfRegNum<[9]>;
+def R10 : AVRReg<10, "r10">, DwarfRegNum<[10]>;
+def R11 : AVRReg<11, "r11">, DwarfRegNum<[11]>;
+def R12 : AVRReg<12, "r12">, DwarfRegNum<[12]>;
+def R13 : AVRReg<13, "r13">, DwarfRegNum<[13]>;
+def R14 : AVRReg<14, "r14">, DwarfRegNum<[14]>;
+def R15 : AVRReg<15, "r15">, DwarfRegNum<[15]>;
+def R16 : AVRReg<16, "r16">, DwarfRegNum<[16]>;
+def R17 : AVRReg<17, "r17">, DwarfRegNum<[17]>;
+def R18 : AVRReg<18, "r18">, DwarfRegNum<[18]>;
+def R19 : AVRReg<19, "r19">, DwarfRegNum<[19]>;
+def R20 : AVRReg<20, "r20">, DwarfRegNum<[20]>;
+def R21 : AVRReg<21, "r21">, DwarfRegNum<[21]>;
+def R22 : AVRReg<22, "r22">, DwarfRegNum<[22]>;
+def R23 : AVRReg<23, "r23">, DwarfRegNum<[23]>;
+def R24 : AVRReg<24, "r24">, DwarfRegNum<[24]>;
+def R25 : AVRReg<25, "r25">, DwarfRegNum<[25]>;
+def R26 : AVRReg<26, "r26">, DwarfRegNum<[26]>;
+def R27 : AVRReg<27, "r27">, DwarfRegNum<[27]>;
+def R28 : AVRReg<28, "r28">, DwarfRegNum<[28]>;
+def R29 : AVRReg<29, "r29">, DwarfRegNum<[29]>;
+def R30 : AVRReg<30, "r30">, DwarfRegNum<[30]>;
+def R31 : AVRReg<31, "r31">, DwarfRegNum<[31]>;
+def SPL : AVRReg<32, "SPL">, DwarfRegNum<[32]>;
+def SPH : AVRReg<33, "SPH">, DwarfRegNum<[33]>;
+
+let SubRegIndices = [sub_lo, sub_hi],
+CoveredBySubRegs = 1 in
+{
+ // 16 bit GPR pairs.
+ def SP : AVRReg<32, "SP", [SPL, SPH]>, DwarfRegNum<[32]>;
+
+ // The pointer registers (X,Y,Z) are a special case because they
+ // are printed as a `high:low` pair when a DREG is expected,
+ // but printed using `X`, `Y`, `Z` when a pointer register is expected.
+ let RegAltNameIndices = [ptr] in {
+ def R31R30 : AVRReg<30, "r31:r30", [R30, R31], ["Z"]>, DwarfRegNum<[30]>;
+ def R29R28 : AVRReg<28, "r29:r28", [R28, R29], ["Y"]>, DwarfRegNum<[28]>;
+ def R27R26 : AVRReg<26, "r27:r26", [R26, R27], ["X"]>, DwarfRegNum<[26]>;
+ }
+ def R25R24 : AVRReg<24, "r25:r24", [R24, R25]>, DwarfRegNum<[24]>;
+ def R23R22 : AVRReg<22, "r23:r22", [R22, R23]>, DwarfRegNum<[22]>;
+ def R21R20 : AVRReg<20, "r21:r20", [R20, R21]>, DwarfRegNum<[20]>;
+ def R19R18 : AVRReg<18, "r19:r18", [R18, R19]>, DwarfRegNum<[18]>;
+ def R17R16 : AVRReg<16, "r17:r16", [R16, R17]>, DwarfRegNum<[16]>;
+ def R15R14 : AVRReg<14, "r15:r14", [R14, R15]>, DwarfRegNum<[14]>;
+ def R13R12 : AVRReg<12, "r13:r12", [R12, R13]>, DwarfRegNum<[12]>;
+ def R11R10 : AVRReg<10, "r11:r10", [R10, R11]>, DwarfRegNum<[10]>;
+ def R9R8 : AVRReg<8, "r9:r8", [R8, R9]>, DwarfRegNum<[8]>;
+ def R7R6 : AVRReg<6, "r7:r6", [R6, R7]>, DwarfRegNum<[6]>;
+ def R5R4 : AVRReg<4, "r5:r4", [R4, R5]>, DwarfRegNum<[4]>;
+ def R3R2 : AVRReg<2, "r3:r2", [R2, R3]>, DwarfRegNum<[2]>;
+ def R1R0 : AVRReg<0, "r1:r0", [R0, R1]>, DwarfRegNum<[0]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Register Classes
+//===----------------------------------------------------------------------===//
+
+//:TODO: use proper set instructions instead of always using "add"
+
+// Main 8-bit register class.
+def GPR8 : RegisterClass<"AVR", [i8], 8,
+ (
+ // Return value and argument registers.
+ add R24, R25, R18, R19, R20, R21, R22, R23,
+ // Scratch registers.
+ R30, R31, R26, R27,
+ // Callee saved registers.
+ R28, R29, R17, R16, R15, R14, R13, R12, R11, R10,
+ R9, R8, R7, R6, R5, R4, R3, R2, R0, R1
+ )>;
+
+// Simple lower registers r0..r15
+def GPR8lo : RegisterClass<"AVR", [i8], 8,
+ (
+ add R15, R14, R13, R12, R11, R10, R9, R8, R7, R6, R5, R4, R3, R2, R0, R1
+ )>;
+
+// 8-bit register class for instructions which take immediates.
+def LD8 : RegisterClass<"AVR", [i8], 8,
+ (
+ // Return value and arguments.
+ add R24, R25, R18, R19, R20, R21, R22, R23,
+ // Scratch registers.
+ R30, R31, R26, R27,
+ // Callee saved registers.
+ R28, R29, R17, R16
+ )>;
+
+// Simple lower registers r16..r23
+def LD8lo : RegisterClass<"AVR", [i8], 8,
+ (
+ add R23, R22, R21, R20, R19, R18, R17, R16
+ )>;
+
+// Main 16-bit pair register class.
+def DREGS : RegisterClass<"AVR", [i16], 8,
+ (
+ // Return value and arguments.
+ add R25R24, R19R18, R21R20, R23R22,
+ // Scratch registers.
+ R31R30, R27R26,
+ // Callee saved registers.
+ R29R28, R17R16, R15R14, R13R12, R11R10,
+ R9R8, R7R6, R5R4, R3R2, R1R0
+ )>;
+
+// 16-bit register class for immediate instructions.
+def DLDREGS : RegisterClass<"AVR", [i16], 8,
+ (
+ // Return value and arguments.
+ add R25R24, R19R18, R21R20, R23R22,
+ // Scratch registers.
+ R31R30, R27R26,
+ // Callee saved registers.
+ R29R28, R17R16
+ )>;
+
+// 16-bit register class for the adiw/sbiw instructions.
+def IWREGS : RegisterClass<"AVR", [i16], 8,
+ (
+ // Return value and arguments.
+ add R25R24,
+ // Scratch registers.
+ R31R30, R27R26,
+ // Callee saved registers.
+ R29R28
+ )>;
+
+// 16-bit register class for the ld and st instructions.
+// AKA X, Y, and Z.
+def PTRREGS : RegisterClass<"AVR", [i16], 8,
+ (
+ add R27R26, // X
+ R29R28, // Y
+ R31R30 // Z
+ ), ptr>;
+
+// 16-bit register class for the ldd and std instructions.
+// AKA Y and Z.
+def PTRDISPREGS : RegisterClass<"AVR", [i16], 8,
+ (
+ add R31R30, R29R28
+ ), ptr>;
+
+// We have a bunch of instructions with an explicit Z register argument. We
+// model this using a register class containing only the Z register.
+// :TODO: Rename to 'ZREG'.
+def ZREGS : RegisterClass<"AVR", [i16], 8, (add R31R30)>;
+
+// Register class used for the stack read pseudo instruction.
+def GPRSP: RegisterClass<"AVR", [i16], 8, (add SP)>;
+
+//:TODO: if we remove this we get an error in tablegen
+//:TODO: this is just a hack, remove it once add16 works!
+// Status register.
+def SREG : AVRReg<14, "FLAGS">, DwarfRegNum<[88]>;
+def CCR : RegisterClass<"AVR", [i8], 8, (add SREG)>
+{
+ let CopyCost = -1; // Don't allow copying of status registers
+}
+
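Note: each 16-bit pair above lists its low byte first in SubRegs, so sub_lo
resolves to the even register and sub_hi to the odd one. A minimal sketch (not
part of the patch) of how backend C++ code could split a pair through the
generated tables; the helper name is hypothetical and the AVR::* enums are the
ones TableGen emits from these definitions:

    // Split a 16-bit AVR register pair into its byte halves.
    static bool splitPair(const TargetRegisterInfo &TRI, unsigned PairReg,
                          unsigned &LoReg, unsigned &HiReg) {
      LoReg = TRI.getSubReg(PairReg, AVR::sub_lo); // e.g. R25R24 -> R24
      HiReg = TRI.getSubReg(PairReg, AVR::sub_hi); // e.g. R25R24 -> R25
      return LoReg != 0 && HiReg != 0;             // 0 means no such subregister
    }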
diff --git a/lib/Target/AVR/AVRTargetMachine.cpp b/lib/Target/AVR/AVRTargetMachine.cpp
new file mode 100644
index 000000000000..a91dce8a63f4
--- /dev/null
+++ b/lib/Target/AVR/AVRTargetMachine.cpp
@@ -0,0 +1,4 @@
+
+extern "C" void LLVMInitializeAVRTarget() {
+
+}
diff --git a/lib/Target/AVR/CMakeLists.txt b/lib/Target/AVR/CMakeLists.txt
new file mode 100644
index 000000000000..22b30ef35855
--- /dev/null
+++ b/lib/Target/AVR/CMakeLists.txt
@@ -0,0 +1,14 @@
+set(LLVM_TARGET_DEFINITIONS AVR.td)
+
+tablegen(LLVM AVRGenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM AVRGenCallingConv.inc -gen-callingconv)
+add_public_tablegen_target(AVRCommonTableGen)
+
+add_llvm_target(AVRCodeGen
+ AVRTargetMachine.cpp
+ )
+
+add_dependencies(LLVMAVRCodeGen intrinsics_gen)
+
+add_subdirectory(TargetInfo)
+
diff --git a/lib/Target/AVR/LLVMBuild.txt b/lib/Target/AVR/LLVMBuild.txt
new file mode 100644
index 000000000000..386c594b20ec
--- /dev/null
+++ b/lib/Target/AVR/LLVMBuild.txt
@@ -0,0 +1,33 @@
+;===- ./lib/Target/AVR/LLVMBuild.txt ---------------------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[common]
+subdirectories = TargetInfo
+
+[component_0]
+type = TargetGroup
+name = AVR
+parent = Target
+has_asmprinter = 0
+has_asmparser = 0
+
+[component_1]
+type = Library
+name = AVRCodeGen
+parent = AVR
+required_libraries = AsmPrinter CodeGen Core MC AVRInfo SelectionDAG Support Target
+add_to_library_groups = AVR
diff --git a/lib/Target/AVR/Makefile b/lib/Target/AVR/Makefile
new file mode 100644
index 000000000000..c91b6f5c0ae9
--- /dev/null
+++ b/lib/Target/AVR/Makefile
@@ -0,0 +1,19 @@
+##===- lib/Target/AVR/Makefile -----------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMAVRCodeGen
+TARGET = AVR
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = AVRGenRegisterInfo.inc
+
+DIRS = TargetInfo
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp b/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp
new file mode 100644
index 000000000000..c0e0d20029c2
--- /dev/null
+++ b/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp
@@ -0,0 +1,25 @@
+//===-- AVRTargetInfo.cpp - AVR Target Implementation ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+
+namespace llvm {
+Target TheAVRTarget;
+}
+
+extern "C" void LLVMInitializeAVRTargetInfo() {
+ llvm::RegisterTarget<llvm::Triple::avr> X(
+ llvm::TheAVRTarget, "avr", "Atmel AVR Microcontroller");
+}
+
+// FIXME: Temporary stub - this function must be defined for linking
+// to succeed. Remove once this function is properly implemented.
+extern "C" void LLVMInitializeAVRTargetMC() {
+}
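Once LLVMInitializeAVRTargetInfo() has run, the "avr" target becomes
discoverable through the registry. A minimal sketch (not part of the patch)
using the standard TargetRegistry API; nothing here is AVR-specific beyond the
triple string:

    #include "llvm/Support/TargetRegistry.h"
    #include "llvm/Support/raw_ostream.h"
    #include <string>

    void findAVR() {
      std::string Err;
      const llvm::Target *T = llvm::TargetRegistry::lookupTarget("avr", Err);
      if (!T)
        llvm::errs() << Err << "\n"; // target not registered or not linked in
    }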
diff --git a/lib/Target/AVR/TargetInfo/CMakeLists.txt b/lib/Target/AVR/TargetInfo/CMakeLists.txt
new file mode 100644
index 000000000000..f27090037702
--- /dev/null
+++ b/lib/Target/AVR/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/..
+ ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMAVRInfo
+ AVRTargetInfo.cpp
+)
+
diff --git a/lib/Target/AVR/TargetInfo/LLVMBuild.txt b/lib/Target/AVR/TargetInfo/LLVMBuild.txt
new file mode 100644
index 000000000000..bc6e0ad2ee19
--- /dev/null
+++ b/lib/Target/AVR/TargetInfo/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./lib/Target/AVR/TargetInfo/LLVMBuild.txt ----------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = AVRInfo
+parent = AVR
+required_libraries = MC Support
+add_to_library_groups = AVR
\ No newline at end of file
diff --git a/lib/Target/AVR/TargetInfo/Makefile b/lib/Target/AVR/TargetInfo/Makefile
new file mode 100644
index 000000000000..92b483dd028b
--- /dev/null
+++ b/lib/Target/AVR/TargetInfo/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Target/AVR/TargetInfo/Makefile ------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMAVRInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/BPF/BPF.td b/lib/Target/BPF/BPF.td
index a4ce90af0439..8493b0fd1e4a 100644
--- a/lib/Target/BPF/BPF.td
+++ b/lib/Target/BPF/BPF.td
@@ -25,7 +25,14 @@ def BPFInstPrinter : AsmWriter {
bit isMCAsmWriter = 1;
}
+def BPFAsmParserVariant : AsmParserVariant {
+ int Variant = 0;
+ string Name = "BPF";
+ string BreakCharacters = ".";
+}
+
def BPF : Target {
let InstructionSet = BPFInstrInfo;
let AssemblyWriters = [BPFInstPrinter];
+ let AssemblyParserVariants = [BPFAsmParserVariant];
}
diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp
index 73418283d9bf..6a5b37e153d8 100644
--- a/lib/Target/BPF/BPFISelLowering.cpp
+++ b/lib/Target/BPF/BPFISelLowering.cpp
@@ -547,8 +547,7 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// to set, the condition code register to branch on, the true/false values to
// select between, and a branch opcode to use.
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator I = BB;
- ++I;
+ MachineFunction::iterator I = ++BB->getIterator();
// ThisMBB:
// ...
diff --git a/lib/Target/BPF/InstPrinter/BPFInstPrinter.h b/lib/Target/BPF/InstPrinter/BPFInstPrinter.h
index adcaff686933..4276d0858c2e 100644
--- a/lib/Target/BPF/InstPrinter/BPFInstPrinter.h
+++ b/lib/Target/BPF/InstPrinter/BPFInstPrinter.h
@@ -17,8 +17,6 @@
#include "llvm/MC/MCInstPrinter.h"
namespace llvm {
-class MCOperand;
-
class BPFInstPrinter : public MCInstPrinter {
public:
BPFInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
diff --git a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
index 36f99262ed70..8c358cab62e8 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -68,16 +68,23 @@ void BPFAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
if (Fixup.getKind() == FK_SecRel_4 || Fixup.getKind() == FK_SecRel_8) {
assert(Value == 0);
- return;
- }
- assert(Fixup.getKind() == FK_PCRel_2);
- Value = (uint16_t)((Value - 8) / 8);
- if (IsLittleEndian) {
- Data[Fixup.getOffset() + 2] = Value & 0xFF;
- Data[Fixup.getOffset() + 3] = Value >> 8;
+ } else if (Fixup.getKind() == FK_Data_4 || Fixup.getKind() == FK_Data_8) {
+ unsigned Size = Fixup.getKind() == FK_Data_4 ? 4 : 8;
+
+ for (unsigned i = 0; i != Size; ++i) {
+      unsigned Idx = IsLittleEndian ? i : Size - i - 1; // MSB first when big-endian
+ Data[Fixup.getOffset() + Idx] = uint8_t(Value >> (i * 8));
+ }
} else {
- Data[Fixup.getOffset() + 2] = Value >> 8;
- Data[Fixup.getOffset() + 3] = Value & 0xFF;
+ assert(Fixup.getKind() == FK_PCRel_2);
+ Value = (uint16_t)((Value - 8) / 8);
+ if (IsLittleEndian) {
+ Data[Fixup.getOffset() + 2] = Value & 0xFF;
+ Data[Fixup.getOffset() + 3] = Value >> 8;
+ } else {
+ Data[Fixup.getOffset() + 2] = Value >> 8;
+ Data[Fixup.getOffset() + 3] = Value & 0xFF;
+ }
}
}
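As a standalone illustration (not part of the patch) of the byte-placement
loop added above, independent of the MC layer: little-endian stores byte i of
the value at offset i, while big-endian would store it at Size - i - 1.

    #include <cstdint>

    static void writeLE(uint8_t *Buf, uint64_t Value, unsigned Size) {
      for (unsigned i = 0; i != Size; ++i)
        Buf[i] = uint8_t(Value >> (i * 8)); // byte i carries bits [8*i, 8*i+7]
    }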
diff --git a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
index 05ba6183e322..87cdd5eb9dad 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
@@ -44,6 +44,10 @@ unsigned BPFELFObjectWriter::GetRelocType(const MCValue &Target,
return ELF::R_X86_64_64;
case FK_SecRel_4:
return ELF::R_X86_64_PC32;
+ case FK_Data_8:
+ return IsPCRel ? ELF::R_X86_64_PC64 : ELF::R_X86_64_64;
+ case FK_Data_4:
+ return IsPCRel ? ELF::R_X86_64_PC32 : ELF::R_X86_64_32;
}
}
diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
index d63bbf49294e..1f440fe87871 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
+++ b/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
@@ -34,6 +34,8 @@ public:
UsesELFSectionDirectiveForBSS = true;
HasSingleParameterDotFile = false;
HasDotTypeDotSizeDirective = false;
+
+ SupportsDebugInformation = true;
}
};
}
diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp
index 272688edb8a1..5ea6551ebc9c 100644
--- a/lib/Target/CppBackend/CPPBackend.cpp
+++ b/lib/Target/CppBackend/CPPBackend.cpp
@@ -551,7 +551,8 @@ void CppWriter::printAttributes(const AttributeSet &PAL,
void CppWriter::printType(Type* Ty) {
// We don't print definitions for primitive types
if (Ty->isFloatingPointTy() || Ty->isX86_MMXTy() || Ty->isIntegerTy() ||
- Ty->isLabelTy() || Ty->isMetadataTy() || Ty->isVoidTy())
+ Ty->isLabelTy() || Ty->isMetadataTy() || Ty->isVoidTy() ||
+ Ty->isTokenTy())
return;
// If we already defined this type, we don't need to define it again.
@@ -1355,23 +1356,18 @@ void CppWriter::printInstruction(const Instruction *I,
}
case Instruction::GetElementPtr: {
const GetElementPtrInst* gep = cast<GetElementPtrInst>(I);
- if (gep->getNumOperands() <= 2) {
- Out << "GetElementPtrInst* " << iName << " = GetElementPtrInst::Create("
- << opNames[0];
- if (gep->getNumOperands() == 2)
- Out << ", " << opNames[1];
- } else {
- Out << "std::vector<Value*> " << iName << "_indices;";
- nl(Out);
- for (unsigned i = 1; i < gep->getNumOperands(); ++i ) {
- Out << iName << "_indices.push_back("
- << opNames[i] << ");";
- nl(Out);
+ Out << "GetElementPtrInst* " << iName << " = GetElementPtrInst::Create("
+ << getCppName(gep->getSourceElementType()) << ", " << opNames[0] << ", {";
+ in();
+ for (unsigned i = 1; i < gep->getNumOperands(); ++i ) {
+ if (i != 1) {
+ Out << ", ";
}
- Out << "Instruction* " << iName << " = GetElementPtrInst::Create("
- << opNames[0] << ", " << iName << "_indices";
+ nl(Out);
+ Out << opNames[i];
}
- Out << ", \"";
+ out();
+ nl(Out) << "}, \"";
printEscapedString(gep->getName());
Out << "\", " << bbname << ");";
break;
@@ -1803,13 +1799,12 @@ void CppWriter::printFunctionBody(const Function *F) {
<< "->arg_begin();";
nl(Out);
}
- for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
- AI != AE; ++AI) {
- Out << "Value* " << getCppName(AI) << " = args++;";
+ for (const Argument &AI : F->args()) {
+ Out << "Value* " << getCppName(&AI) << " = args++;";
nl(Out);
- if (AI->hasName()) {
- Out << getCppName(AI) << "->setName(\"";
- printEscapedString(AI->getName());
+ if (AI.hasName()) {
+ Out << getCppName(&AI) << "->setName(\"";
+ printEscapedString(AI.getName());
Out << "\");";
nl(Out);
}
@@ -1818,29 +1813,25 @@ void CppWriter::printFunctionBody(const Function *F) {
// Create all the basic blocks
nl(Out);
- for (Function::const_iterator BI = F->begin(), BE = F->end();
- BI != BE; ++BI) {
- std::string bbname(getCppName(BI));
+ for (const BasicBlock &BI : *F) {
+ std::string bbname(getCppName(&BI));
Out << "BasicBlock* " << bbname <<
" = BasicBlock::Create(mod->getContext(), \"";
- if (BI->hasName())
- printEscapedString(BI->getName());
- Out << "\"," << getCppName(BI->getParent()) << ",0);";
+ if (BI.hasName())
+ printEscapedString(BI.getName());
+ Out << "\"," << getCppName(BI.getParent()) << ",0);";
nl(Out);
}
// Output all of its basic blocks... for the function
- for (Function::const_iterator BI = F->begin(), BE = F->end();
- BI != BE; ++BI) {
- std::string bbname(getCppName(BI));
- nl(Out) << "// Block " << BI->getName() << " (" << bbname << ")";
+ for (const BasicBlock &BI : *F) {
+ std::string bbname(getCppName(&BI));
+ nl(Out) << "// Block " << BI.getName() << " (" << bbname << ")";
nl(Out);
// Output all of the instructions in the basic block...
- for (BasicBlock::const_iterator I = BI->begin(), E = BI->end();
- I != E; ++I) {
- printInstruction(I,bbname);
- }
+ for (const Instruction &I : BI)
+ printInstruction(&I, bbname);
}
// Loop over the ForwardRefs and resolve them now that all instructions
@@ -1883,7 +1874,7 @@ void CppWriter::printInline(const std::string& fname,
printFunctionUses(F);
printFunctionBody(F);
is_inline = false;
- Out << "return " << getCppName(F->begin()) << ";";
+ Out << "return " << getCppName(&F->front()) << ";";
nl(Out) << "}";
nl(Out);
}
@@ -1896,17 +1887,14 @@ void CppWriter::printModuleBody() {
// Functions can call each other and global variables can reference them so
// define all the functions first before emitting their function bodies.
nl(Out) << "// Function Declarations"; nl(Out);
- for (Module::const_iterator I = TheModule->begin(), E = TheModule->end();
- I != E; ++I)
- printFunctionHead(I);
+ for (const Function &I : *TheModule)
+ printFunctionHead(&I);
// Process the global variables declarations. We can't initialze them until
// after the constants are printed so just print a header for each global
nl(Out) << "// Global Variable Declarations\n"; nl(Out);
- for (Module::const_global_iterator I = TheModule->global_begin(),
- E = TheModule->global_end(); I != E; ++I) {
- printVariableHead(I);
- }
+ for (const GlobalVariable &I : TheModule->globals())
+ printVariableHead(&I);
// Print out all the constants definitions. Constants don't recurse except
// through GlobalValues. All GlobalValues have been declared at this point
@@ -1918,21 +1906,18 @@ void CppWriter::printModuleBody() {
// been emitted. These definitions just couple the gvars with their constant
// initializers.
nl(Out) << "// Global Variable Definitions"; nl(Out);
- for (Module::const_global_iterator I = TheModule->global_begin(),
- E = TheModule->global_end(); I != E; ++I) {
- printVariableBody(I);
- }
+ for (const GlobalVariable &I : TheModule->globals())
+ printVariableBody(&I);
// Finally, we can safely put out all of the function bodies.
nl(Out) << "// Function Definitions"; nl(Out);
- for (Module::const_iterator I = TheModule->begin(), E = TheModule->end();
- I != E; ++I) {
- if (!I->isDeclaration()) {
- nl(Out) << "// Function: " << I->getName() << " (" << getCppName(I)
+ for (const Function &I : *TheModule) {
+ if (!I.isDeclaration()) {
+ nl(Out) << "// Function: " << I.getName() << " (" << getCppName(&I)
<< ")";
nl(Out) << "{";
nl(Out,1);
- printFunctionBody(I);
+ printFunctionBody(&I);
nl(Out,-1) << "}";
nl(Out);
}
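The mechanical pattern applied throughout the CppBackend hunks above: the old
ilist iterators converted implicitly to pointers, whereas range-based for
yields references, so call sites now take an explicit address-of. A generic
sketch of the rewrite, where M is a Module pointer and use() is a hypothetical
callee taking a const Function*:

    // before: the iterator converts implicitly to const Function*
    for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I)
      use(I);
    // after: the reference needs an explicit address-of
    for (const Function &F : *M)
      use(&F);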
diff --git a/lib/Target/Hexagon/AsmParser/CMakeLists.txt b/lib/Target/Hexagon/AsmParser/CMakeLists.txt
new file mode 100644
index 000000000000..bbfa92d59628
--- /dev/null
+++ b/lib/Target/Hexagon/AsmParser/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMHexagonAsmParser
+ HexagonAsmParser.cpp
+ )
+
+add_dependencies( LLVMHexagonAsmParser HexagonCommonTableGen )
diff --git a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
new file mode 100644
index 000000000000..a8622a96527c
--- /dev/null
+++ b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -0,0 +1,2152 @@
+//===-- HexagonAsmParser.cpp - Parse Hexagon asm to MCInst instructions----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mcasmparser"
+
+#include "Hexagon.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonTargetStreamer.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonMCELFStreamer.h"
+#include "MCTargetDesc/HexagonMCChecker.h"
+#include "MCTargetDesc/HexagonMCExpr.h"
+#include "MCTargetDesc/HexagonMCShuffler.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "MCTargetDesc/HexagonMCAsmInfo.h"
+#include "MCTargetDesc/HexagonShuffler.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <sstream>
+
+using namespace llvm;
+
+static cl::opt<bool> EnableFutureRegs("mfuture-regs",
+ cl::desc("Enable future registers"));
+
+static cl::opt<bool> WarnMissingParenthesis("mwarn-missing-parenthesis",
+cl::desc("Warn for missing parentheses around predicate registers"),
+cl::init(true));
+static cl::opt<bool> ErrorMissingParenthesis("merror-missing-parenthesis",
+cl::desc("Error for missing parentheses around predicate registers"),
+cl::init(false));
+static cl::opt<bool> WarnSignedMismatch("mwarn-sign-mismatch",
+cl::desc("Warn for a mismatch between signed and unsigned values"),
+cl::init(true));
+static cl::opt<bool> WarnNoncontigiousRegister("mwarn-noncontigious-register",
+cl::desc("Warn for register names that aren't contiguous"),
+cl::init(true));
+static cl::opt<bool> ErrorNoncontigiousRegister("merror-noncontigious-register",
+cl::desc("Error for register names that aren't contiguous"),
+cl::init(false));
+
+
+namespace {
+struct HexagonOperand;
+
+class HexagonAsmParser : public MCTargetAsmParser {
+
+ HexagonTargetStreamer &getTargetStreamer() {
+ MCTargetStreamer &TS = *Parser.getStreamer().getTargetStreamer();
+ return static_cast<HexagonTargetStreamer &>(TS);
+ }
+
+ MCAsmParser &Parser;
+ MCAssembler *Assembler;
+ MCInstrInfo const &MCII;
+ MCInst MCB;
+ bool InBrackets;
+
+ MCAsmParser &getParser() const { return Parser; }
+ MCAssembler *getAssembler() const { return Assembler; }
+ MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+ bool equalIsAsmAssignment() override { return false; }
+ bool isLabel(AsmToken &Token) override;
+
+ void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
+ bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
+ bool ParseDirectiveFalign(unsigned Size, SMLoc L);
+
+ virtual bool ParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc,
+ SMLoc &EndLoc) override;
+ bool ParseDirectiveSubsection(SMLoc L);
+ bool ParseDirectiveValue(unsigned Size, SMLoc L);
+ bool ParseDirectiveComm(bool IsLocal, SMLoc L);
+ bool RegisterMatchesArch(unsigned MatchNum) const;
+
+ bool matchBundleOptions();
+ bool handleNoncontigiousRegister(bool Contigious, SMLoc &Loc);
+ bool finishBundle(SMLoc IDLoc, MCStreamer &Out);
+ void canonicalizeImmediates(MCInst &MCI);
+ bool matchOneInstruction(MCInst &MCB, SMLoc IDLoc,
+ OperandVector &InstOperands, uint64_t &ErrorInfo,
+ bool MatchingInlineAsm, bool &MustExtend);
+
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ uint64_t &ErrorInfo, bool MatchingInlineAsm) override;
+
+ unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) override;
+ void OutOfRange(SMLoc IDLoc, long long Val, long long Max);
+ int processInstruction(MCInst &Inst, OperandVector const &Operands,
+ SMLoc IDLoc, bool &MustExtend);
+
+ // Check if we have an assembler and, if so, set the ELF e_header flags.
+ void chksetELFHeaderEFlags(unsigned flags) {
+ if (getAssembler())
+ getAssembler()->setELFHeaderEFlags(flags);
+ }
+
+/// @name Auto-generated Match Functions
+/// {
+
+#define GET_ASSEMBLER_HEADER
+#include "HexagonGenAsmMatcher.inc"
+
+ /// }
+
+public:
+ HexagonAsmParser(const MCSubtargetInfo &_STI, MCAsmParser &_Parser,
+ const MCInstrInfo &MII, const MCTargetOptions &Options)
+ : MCTargetAsmParser(Options, _STI), Parser(_Parser),
+ MCII (MII), MCB(HexagonMCInstrInfo::createBundle()), InBrackets(false) {
+ setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+
+ MCAsmParserExtension::Initialize(_Parser);
+
+ Assembler = nullptr;
+    // FIXME: need a better way to detect an AsmStreamer (upstream removed getKind())
+ if (!Parser.getStreamer().hasRawTextSupport()) {
+ MCELFStreamer *MES = static_cast<MCELFStreamer *>(&Parser.getStreamer());
+ Assembler = &MES->getAssembler();
+ }
+ }
+
+ bool mustExtend(OperandVector &Operands);
+ bool splitIdentifier(OperandVector &Operands);
+ bool parseOperand(OperandVector &Operands);
+ bool parseInstruction(OperandVector &Operands);
+ bool implicitExpressionLocation(OperandVector &Operands);
+ bool parseExpressionOrOperand(OperandVector &Operands);
+ bool parseExpression(MCExpr const *& Expr);
+ virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) override
+ {
+ llvm_unreachable("Unimplemented");
+ }
+ virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ AsmToken ID, OperandVector &Operands) override;
+
+ virtual bool ParseDirective(AsmToken DirectiveID) override;
+};
+
+/// HexagonOperand - Instances of this class represent a parsed Hexagon machine
+/// instruction.
+struct HexagonOperand : public MCParsedAsmOperand {
+ enum KindTy { Token, Immediate, Register } Kind;
+
+ SMLoc StartLoc, EndLoc;
+
+ struct TokTy {
+ const char *Data;
+ unsigned Length;
+ };
+
+ struct RegTy {
+ unsigned RegNum;
+ };
+
+ struct ImmTy {
+ const MCExpr *Val;
+ bool MustExtend;
+ };
+
+ struct InstTy {
+ OperandVector *SubInsts;
+ };
+
+ union {
+ struct TokTy Tok;
+ struct RegTy Reg;
+ struct ImmTy Imm;
+ };
+
+ HexagonOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+
+public:
+ HexagonOperand(const HexagonOperand &o) : MCParsedAsmOperand() {
+ Kind = o.Kind;
+ StartLoc = o.StartLoc;
+ EndLoc = o.EndLoc;
+ switch (Kind) {
+ case Register:
+ Reg = o.Reg;
+ break;
+ case Immediate:
+ Imm = o.Imm;
+ break;
+ case Token:
+ Tok = o.Tok;
+ break;
+ }
+ }
+
+ /// getStartLoc - Get the location of the first token of this operand.
+ SMLoc getStartLoc() const { return StartLoc; }
+
+ /// getEndLoc - Get the location of the last token of this operand.
+ SMLoc getEndLoc() const { return EndLoc; }
+
+ unsigned getReg() const {
+ assert(Kind == Register && "Invalid access!");
+ return Reg.RegNum;
+ }
+
+ const MCExpr *getImm() const {
+ assert(Kind == Immediate && "Invalid access!");
+ return Imm.Val;
+ }
+
+ bool isToken() const { return Kind == Token; }
+ bool isImm() const { return Kind == Immediate; }
+ bool isMem() const { llvm_unreachable("No isMem"); }
+ bool isReg() const { return Kind == Register; }
+
+ bool CheckImmRange(int immBits, int zeroBits, bool isSigned,
+ bool isRelocatable, bool Extendable) const {
+ if (Kind == Immediate) {
+ const MCExpr *myMCExpr = getImm();
+ if (Imm.MustExtend && !Extendable)
+ return false;
+ int64_t Res;
+ if (myMCExpr->evaluateAsAbsolute(Res)) {
+ int bits = immBits + zeroBits;
+        // The encoded field is immBits + zeroBits wide and the low zeroBits
+        // bits of the value must be zero.
+ if (Res & ((1 << zeroBits) - 1))
+ return false;
+ if (isSigned) {
+ if (Res < (1LL << (bits - 1)) && Res >= -(1LL << (bits - 1)))
+ return true;
+ } else {
+ if (bits == 64)
+ return true;
+ if (Res >= 0)
+ return ((uint64_t)Res < (uint64_t)(1ULL << bits)) ? true : false;
+ else {
+ const int64_t high_bit_set = 1ULL << 63;
+ const uint64_t mask = (high_bit_set >> (63 - bits));
+ return (((uint64_t)Res & mask) == mask) ? true : false;
+ }
+ }
+ } else if (myMCExpr->getKind() == MCExpr::SymbolRef && isRelocatable)
+ return true;
+ else if (myMCExpr->getKind() == MCExpr::Binary ||
+ myMCExpr->getKind() == MCExpr::Unary)
+ return true;
+ }
+ return false;
+ }
+
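+  // Worked example (illustrative, not in the upstream source): iss4_2Imm()
+  // below calls CheckImmRange(4, 2, /*isSigned=*/true, false, false), so it
+  // accepts the multiples of 4 inside the signed 6-bit range [-32, 31], i.e.
+  // -32, -28, ..., 28. 30 is rejected because its low two bits are set; 32 is
+  // rejected because it needs a seventh bit.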
+ bool isf32Ext() const { return false; }
+ bool iss32Imm() const { return CheckImmRange(32, 0, true, true, false); }
+ bool iss8Imm() const { return CheckImmRange(8, 0, true, false, false); }
+ bool iss8Imm64() const { return CheckImmRange(8, 0, true, true, false); }
+ bool iss7Imm() const { return CheckImmRange(7, 0, true, false, false); }
+ bool iss6Imm() const { return CheckImmRange(6, 0, true, false, false); }
+ bool iss4Imm() const { return CheckImmRange(4, 0, true, false, false); }
+ bool iss4_0Imm() const { return CheckImmRange(4, 0, true, false, false); }
+ bool iss4_1Imm() const { return CheckImmRange(4, 1, true, false, false); }
+ bool iss4_2Imm() const { return CheckImmRange(4, 2, true, false, false); }
+ bool iss4_3Imm() const { return CheckImmRange(4, 3, true, false, false); }
+ bool iss4_6Imm() const { return CheckImmRange(4, 0, true, false, false); }
+ bool iss3_6Imm() const { return CheckImmRange(3, 0, true, false, false); }
+ bool iss3Imm() const { return CheckImmRange(3, 0, true, false, false); }
+
+ bool isu64Imm() const { return CheckImmRange(64, 0, false, true, true); }
+ bool isu32Imm() const { return CheckImmRange(32, 0, false, true, false); }
+ bool isu26_6Imm() const { return CheckImmRange(26, 6, false, true, false); }
+ bool isu16Imm() const { return CheckImmRange(16, 0, false, true, false); }
+ bool isu16_0Imm() const { return CheckImmRange(16, 0, false, true, false); }
+ bool isu16_1Imm() const { return CheckImmRange(16, 1, false, true, false); }
+ bool isu16_2Imm() const { return CheckImmRange(16, 2, false, true, false); }
+ bool isu16_3Imm() const { return CheckImmRange(16, 3, false, true, false); }
+ bool isu11_3Imm() const { return CheckImmRange(11, 3, false, false, false); }
+ bool isu6_0Imm() const { return CheckImmRange(6, 0, false, false, false); }
+ bool isu6_1Imm() const { return CheckImmRange(6, 1, false, false, false); }
+ bool isu6_2Imm() const { return CheckImmRange(6, 2, false, false, false); }
+ bool isu6_3Imm() const { return CheckImmRange(6, 3, false, false, false); }
+ bool isu10Imm() const { return CheckImmRange(10, 0, false, false, false); }
+ bool isu9Imm() const { return CheckImmRange(9, 0, false, false, false); }
+ bool isu8Imm() const { return CheckImmRange(8, 0, false, false, false); }
+ bool isu7Imm() const { return CheckImmRange(7, 0, false, false, false); }
+ bool isu6Imm() const { return CheckImmRange(6, 0, false, false, false); }
+ bool isu5Imm() const { return CheckImmRange(5, 0, false, false, false); }
+ bool isu4Imm() const { return CheckImmRange(4, 0, false, false, false); }
+ bool isu3Imm() const { return CheckImmRange(3, 0, false, false, false); }
+ bool isu2Imm() const { return CheckImmRange(2, 0, false, false, false); }
+ bool isu1Imm() const { return CheckImmRange(1, 0, false, false, false); }
+
+ bool ism6Imm() const { return CheckImmRange(6, 0, false, false, false); }
+ bool isn8Imm() const { return CheckImmRange(8, 0, false, false, false); }
+
+ bool iss16Ext() const { return CheckImmRange(16 + 26, 0, true, true, true); }
+ bool iss12Ext() const { return CheckImmRange(12 + 26, 0, true, true, true); }
+ bool iss10Ext() const { return CheckImmRange(10 + 26, 0, true, true, true); }
+ bool iss9Ext() const { return CheckImmRange(9 + 26, 0, true, true, true); }
+ bool iss8Ext() const { return CheckImmRange(8 + 26, 0, true, true, true); }
+ bool iss7Ext() const { return CheckImmRange(7 + 26, 0, true, true, true); }
+ bool iss6Ext() const { return CheckImmRange(6 + 26, 0, true, true, true); }
+ bool iss11_0Ext() const {
+ return CheckImmRange(11 + 26, 0, true, true, true);
+ }
+ bool iss11_1Ext() const {
+ return CheckImmRange(11 + 26, 1, true, true, true);
+ }
+ bool iss11_2Ext() const {
+ return CheckImmRange(11 + 26, 2, true, true, true);
+ }
+ bool iss11_3Ext() const {
+ return CheckImmRange(11 + 26, 3, true, true, true);
+ }
+
+ bool isu6Ext() const { return CheckImmRange(6 + 26, 0, false, true, true); }
+ bool isu7Ext() const { return CheckImmRange(7 + 26, 0, false, true, true); }
+ bool isu8Ext() const { return CheckImmRange(8 + 26, 0, false, true, true); }
+ bool isu9Ext() const { return CheckImmRange(9 + 26, 0, false, true, true); }
+ bool isu10Ext() const { return CheckImmRange(10 + 26, 0, false, true, true); }
+ bool isu6_0Ext() const { return CheckImmRange(6 + 26, 0, false, true, true); }
+ bool isu6_1Ext() const { return CheckImmRange(6 + 26, 1, false, true, true); }
+ bool isu6_2Ext() const { return CheckImmRange(6 + 26, 2, false, true, true); }
+ bool isu6_3Ext() const { return CheckImmRange(6 + 26, 3, false, true, true); }
+ bool isu32MustExt() const { return isImm() && Imm.MustExtend; }
+
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getReg()));
+ }
+
+ void addImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createExpr(getImm()));
+ }
+
+ void addSignedImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ MCExpr const *Expr = getImm();
+ int64_t Value;
+ if (!Expr->evaluateAsAbsolute(Value)) {
+ Inst.addOperand(MCOperand::createExpr(Expr));
+ return;
+ }
+ int64_t Extended = SignExtend64 (Value, 32);
+ if ((Extended < 0) == (Value < 0)) {
+ Inst.addOperand(MCOperand::createExpr(Expr));
+ return;
+ }
+    // Flip bit 33 to signal a signed/unsigned mismatch
+ Extended ^= 0x100000000;
+ Inst.addOperand(MCOperand::createImm(Extended));
+ }
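+  // Illustration (not in the upstream source): operands are expected to fit
+  // in 32 bits with a consistent sign. For Value = 0xFFFFFFFF (4294967295),
+  // SignExtend64(Value, 32) yields -1, so the sign flips; toggling the
+  // 0x100000000 bit in the sign-extended result marks the operand so that
+  // later stages can warn about the signed/unsigned mismatch.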
+
+ void addf32ExtOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+
+ void adds32ImmOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds8ImmOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds8Imm64Operands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds6ImmOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds4ImmOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds4_0ImmOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds4_1ImmOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds4_2ImmOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds4_3ImmOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds3ImmOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+
+ void addu64ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu32ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu26_6ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu16ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu16_0ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu16_1ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu16_2ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu16_3ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu11_3ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu10ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu9ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu8ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu7ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu6ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu6_0ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu6_1ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu6_2ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu6_3ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu5ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu4ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu3ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu2ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu1ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+
+ void addm6ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addn8ImmOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+
+ void adds16ExtOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds12ExtOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds10ExtOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds9ExtOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds8ExtOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds6ExtOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds11_0ExtOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds11_1ExtOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds11_2ExtOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+ void adds11_3ExtOperands(MCInst &Inst, unsigned N) const {
+ addSignedImmOperands(Inst, N);
+ }
+
+ void addu6ExtOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu7ExtOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu8ExtOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu9ExtOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu10ExtOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu6_0ExtOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu6_1ExtOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu6_2ExtOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu6_3ExtOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+ void addu32MustExtOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+
+ void adds4_6ImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+ Inst.addOperand(MCOperand::createImm(CE->getValue() * 64));
+ }
+
+ void adds3_6ImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+ Inst.addOperand(MCOperand::createImm(CE->getValue() * 64));
+ }
+
+ StringRef getToken() const {
+ assert(Kind == Token && "Invalid access!");
+ return StringRef(Tok.Data, Tok.Length);
+ }
+
+ virtual void print(raw_ostream &OS) const;
+
+ static std::unique_ptr<HexagonOperand> CreateToken(StringRef Str, SMLoc S) {
+ HexagonOperand *Op = new HexagonOperand(Token);
+ Op->Tok.Data = Str.data();
+ Op->Tok.Length = Str.size();
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return std::unique_ptr<HexagonOperand>(Op);
+ }
+
+ static std::unique_ptr<HexagonOperand> CreateReg(unsigned RegNum, SMLoc S,
+ SMLoc E) {
+ HexagonOperand *Op = new HexagonOperand(Register);
+ Op->Reg.RegNum = RegNum;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return std::unique_ptr<HexagonOperand>(Op);
+ }
+
+ static std::unique_ptr<HexagonOperand> CreateImm(const MCExpr *Val, SMLoc S,
+ SMLoc E) {
+ HexagonOperand *Op = new HexagonOperand(Immediate);
+ Op->Imm.Val = Val;
+ Op->Imm.MustExtend = false;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return std::unique_ptr<HexagonOperand>(Op);
+ }
+};
+
+} // end anonymous namespace.
+
+void HexagonOperand::print(raw_ostream &OS) const {
+ switch (Kind) {
+ case Immediate:
+ getImm()->print(OS, nullptr);
+ break;
+ case Register:
+ OS << "<register R";
+ OS << getReg() << ">";
+ break;
+ case Token:
+ OS << "'" << getToken() << "'";
+ break;
+ }
+}
+
+/// @name Auto-generated Match Functions
+static unsigned MatchRegisterName(StringRef Name);
+
+bool HexagonAsmParser::finishBundle(SMLoc IDLoc, MCStreamer &Out) {
+ DEBUG(dbgs() << "Bundle:");
+ DEBUG(MCB.dump_pretty(dbgs()));
+ DEBUG(dbgs() << "--\n");
+
+ // Check the bundle for errors.
+ const MCRegisterInfo *RI = getContext().getRegisterInfo();
+ HexagonMCChecker Check(MCII, getSTI(), MCB, MCB, *RI);
+
+ bool CheckOk = HexagonMCInstrInfo::canonicalizePacket(MCII, getSTI(),
+ getContext(), MCB,
+ &Check);
+
+ while (Check.getNextErrInfo() == true) {
+ unsigned Reg = Check.getErrRegister();
+ Twine R(RI->getName(Reg));
+
+ uint64_t Err = Check.getError();
+ if (Err != HexagonMCErrInfo::CHECK_SUCCESS) {
+ if (HexagonMCErrInfo::CHECK_ERROR_BRANCHES & Err)
+ Error(IDLoc,
+ "unconditional branch cannot precede another branch in packet");
+
+ if (HexagonMCErrInfo::CHECK_ERROR_NEWP & Err ||
+ HexagonMCErrInfo::CHECK_ERROR_NEWV & Err)
+ Error(IDLoc, "register `" + R +
+ "' used with `.new' "
+ "but not validly modified in the same packet");
+
+ if (HexagonMCErrInfo::CHECK_ERROR_REGISTERS & Err)
+ Error(IDLoc, "register `" + R + "' modified more than once");
+
+ if (HexagonMCErrInfo::CHECK_ERROR_READONLY & Err)
+ Error(IDLoc, "cannot write to read-only register `" + R + "'");
+
+ if (HexagonMCErrInfo::CHECK_ERROR_LOOP & Err)
+ Error(IDLoc, "loop-setup and some branch instructions "
+ "cannot be in the same packet");
+
+ if (HexagonMCErrInfo::CHECK_ERROR_ENDLOOP & Err) {
+ Twine N(HexagonMCInstrInfo::isInnerLoop(MCB) ? '0' : '1');
+ Error(IDLoc, "packet marked with `:endloop" + N + "' " +
+ "cannot contain instructions that modify register " +
+ "`" + R + "'");
+ }
+
+ if (HexagonMCErrInfo::CHECK_ERROR_SOLO & Err)
+ Error(IDLoc,
+ "instruction cannot appear in packet with other instructions");
+
+ if (HexagonMCErrInfo::CHECK_ERROR_NOSLOTS & Err)
+ Error(IDLoc, "too many slots used in packet");
+
+ if (Err & HexagonMCErrInfo::CHECK_ERROR_SHUFFLE) {
+ uint64_t Erm = Check.getShuffleError();
+
+ if (HexagonShuffler::SHUFFLE_ERROR_INVALID == Erm)
+ Error(IDLoc, "invalid instruction packet");
+ else if (HexagonShuffler::SHUFFLE_ERROR_STORES == Erm)
+ Error(IDLoc, "invalid instruction packet: too many stores");
+ else if (HexagonShuffler::SHUFFLE_ERROR_LOADS == Erm)
+ Error(IDLoc, "invalid instruction packet: too many loads");
+ else if (HexagonShuffler::SHUFFLE_ERROR_BRANCHES == Erm)
+ Error(IDLoc, "too many branches in packet");
+ else if (HexagonShuffler::SHUFFLE_ERROR_NOSLOTS == Erm)
+ Error(IDLoc, "invalid instruction packet: out of slots");
+ else if (HexagonShuffler::SHUFFLE_ERROR_SLOTS == Erm)
+ Error(IDLoc, "invalid instruction packet: slot error");
+ else if (HexagonShuffler::SHUFFLE_ERROR_ERRATA2 == Erm)
+ Error(IDLoc, "v60 packet violation");
+ else if (HexagonShuffler::SHUFFLE_ERROR_STORE_LOAD_CONFLICT == Erm)
+ Error(IDLoc, "slot 0 instruction does not allow slot 1 store");
+ else
+ Error(IDLoc, "unknown error in instruction packet");
+ }
+ }
+
+ unsigned Warn = Check.getWarning();
+ if (Warn != HexagonMCErrInfo::CHECK_SUCCESS) {
+ if (HexagonMCErrInfo::CHECK_WARN_CURRENT & Warn)
+ Warning(IDLoc, "register `" + R + "' used with `.cur' "
+ "but not used in the same packet");
+ else if (HexagonMCErrInfo::CHECK_WARN_TEMPORARY & Warn)
+ Warning(IDLoc, "register `" + R + "' used with `.tmp' "
+ "but not used in the same packet");
+ }
+ }
+
+ if (CheckOk) {
+ MCB.setLoc(IDLoc);
+ if (HexagonMCInstrInfo::bundleSize(MCB) == 0) {
+ assert(!HexagonMCInstrInfo::isInnerLoop(MCB));
+ assert(!HexagonMCInstrInfo::isOuterLoop(MCB));
+ // Empty packets are valid yet aren't emitted
+ return false;
+ }
+ Out.EmitInstruction(MCB, getSTI());
+ } else {
+    // If compounding and duplexing didn't reduce the packet to four or
+    // fewer instructions, it is too big.
+ if (HexagonMCInstrInfo::bundleSize(MCB) > HEXAGON_PACKET_SIZE) {
+ Error(IDLoc, "invalid instruction packet: out of slots");
+ return true; // Error
+ }
+ }
+
+ return false; // No error
+}
+
+bool HexagonAsmParser::matchBundleOptions() {
+ MCAsmParser &Parser = getParser();
+ MCAsmLexer &Lexer = getLexer();
+ while (true) {
+ if (!Parser.getTok().is(AsmToken::Colon))
+ return false;
+ Lexer.Lex();
+ StringRef Option = Parser.getTok().getString();
+ if (Option.compare_lower("endloop0") == 0)
+ HexagonMCInstrInfo::setInnerLoop(MCB);
+ else if (Option.compare_lower("endloop1") == 0)
+ HexagonMCInstrInfo::setOuterLoop(MCB);
+ else if (Option.compare_lower("mem_noshuf") == 0)
+ HexagonMCInstrInfo::setMemReorderDisabled(MCB);
+ else if (Option.compare_lower("mem_shuf") == 0)
+ HexagonMCInstrInfo::setMemStoreReorderEnabled(MCB);
+ else
+ return true;
+ Lexer.Lex();
+ }
+}
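+// Illustrative assembly (not from the upstream sources): bundle options
+// follow the closing brace of a packet and are folded into the bundle built
+// above, for example:
+//   { r1 = add(r1, #1); memw(r3++#4) = r1 }:endloop0
+//   { memw(r4) = r5; r6 = memw(r7) }:mem_noshuf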
+
+// For instruction aliases, immediates are generated as plain integers rather
+// than as MCConstantExpr; convert them so every operand is uniformly an
+// MCExpr. Also check for signed/unsigned mismatches and warn about them.
+void HexagonAsmParser::canonicalizeImmediates(MCInst &MCI) {
+ MCInst NewInst;
+ NewInst.setOpcode(MCI.getOpcode());
+ for (MCOperand &I : MCI)
+ if (I.isImm()) {
+ int64_t Value (I.getImm());
+ if ((Value & 0x100000000) != (Value & 0x80000000)) {
+ // Detect flipped bit 33 wrt bit 32 and signal warning
+ Value ^= 0x100000000;
+ if (WarnSignedMismatch)
+ Warning (MCI.getLoc(), "Signed/Unsigned mismatch");
+ }
+ NewInst.addOperand(MCOperand::createExpr(
+ MCConstantExpr::create(Value, getContext())));
+ }
+ else
+ NewInst.addOperand(I);
+ MCI = NewInst;
+}
+
+bool HexagonAsmParser::matchOneInstruction(MCInst &MCI, SMLoc IDLoc,
+ OperandVector &InstOperands,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm,
+ bool &MustExtend) {
+ // Perform matching with tablegen asmmatcher generated function
+ int result =
+ MatchInstructionImpl(InstOperands, MCI, ErrorInfo, MatchingInlineAsm);
+ if (result == Match_Success) {
+ MCI.setLoc(IDLoc);
+ MustExtend = mustExtend(InstOperands);
+ canonicalizeImmediates(MCI);
+ result = processInstruction(MCI, InstOperands, IDLoc, MustExtend);
+
+ DEBUG(dbgs() << "Insn:");
+ DEBUG(MCI.dump_pretty(dbgs()));
+ DEBUG(dbgs() << "\n\n");
+
+ MCI.setLoc(IDLoc);
+ }
+
+  // Create an instruction operand for the bundle instruction.
+  // Break this into a separate function; the code here is less readable.
+  // Think about how to get an instruction error to report correctly;
+  // SMLoc will return the "{".
+ switch (result) {
+ default:
+ break;
+ case Match_Success:
+ return false;
+ case Match_MissingFeature:
+ return Error(IDLoc, "invalid instruction");
+ case Match_MnemonicFail:
+ return Error(IDLoc, "unrecognized instruction");
+ case Match_InvalidOperand:
+ SMLoc ErrorLoc = IDLoc;
+ if (ErrorInfo != ~0U) {
+ if (ErrorInfo >= InstOperands.size())
+ return Error(IDLoc, "too few operands for instruction");
+
+ ErrorLoc = (static_cast<HexagonOperand *>(InstOperands[ErrorInfo].get()))
+ ->getStartLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IDLoc;
+ }
+ return Error(ErrorLoc, "invalid operand for instruction");
+ }
+ llvm_unreachable("Implement any new match types added!");
+}
+
+bool HexagonAsmParser::mustExtend(OperandVector &Operands) {
+ unsigned Count = 0;
+ for (std::unique_ptr<MCParsedAsmOperand> &i : Operands)
+ if (i->isImm())
+ if (static_cast<HexagonOperand *>(i.get())->Imm.MustExtend)
+ ++Count;
+  // Multiple extenders should have been filtered by iss9Ext et al.
+ assert(Count < 2 && "Multiple extenders");
+ return Count == 1;
+}
+
+bool HexagonAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) {
+ if (!InBrackets) {
+ MCB.clear();
+ MCB.addOperand(MCOperand::createImm(0));
+ }
+ HexagonOperand &FirstOperand = static_cast<HexagonOperand &>(*Operands[0]);
+ if (FirstOperand.isToken() && FirstOperand.getToken() == "{") {
+ assert(Operands.size() == 1 && "Brackets should be by themselves");
+ if (InBrackets) {
+ getParser().Error(IDLoc, "Already in a packet");
+ return true;
+ }
+ InBrackets = true;
+ return false;
+ }
+ if (FirstOperand.isToken() && FirstOperand.getToken() == "}") {
+ assert(Operands.size() == 1 && "Brackets should be by themselves");
+ if (!InBrackets) {
+ getParser().Error(IDLoc, "Not in a packet");
+ return true;
+ }
+ InBrackets = false;
+ if (matchBundleOptions())
+ return true;
+ return finishBundle(IDLoc, Out);
+ }
+ MCInst *SubInst = new (getParser().getContext()) MCInst;
+ bool MustExtend = false;
+ if (matchOneInstruction(*SubInst, IDLoc, Operands, ErrorInfo,
+ MatchingInlineAsm, MustExtend))
+ return true;
+ HexagonMCInstrInfo::extendIfNeeded(
+ getParser().getContext(), MCII, MCB, *SubInst,
+ HexagonMCInstrInfo::isExtended(MCII, *SubInst) || MustExtend);
+ MCB.addOperand(MCOperand::createInst(SubInst));
+ if (!InBrackets)
+ return finishBundle(IDLoc, Out);
+ return false;
+}
+
+/// ParseDirective parses the Hexagon-specific directives
+bool HexagonAsmParser::ParseDirective(AsmToken DirectiveID) {
+ StringRef IDVal = DirectiveID.getIdentifier();
+ if ((IDVal.lower() == ".word") || (IDVal.lower() == ".4byte"))
+ return ParseDirectiveValue(4, DirectiveID.getLoc());
+ if (IDVal.lower() == ".short" || IDVal.lower() == ".hword" ||
+ IDVal.lower() == ".half")
+ return ParseDirectiveValue(2, DirectiveID.getLoc());
+ if (IDVal.lower() == ".falign")
+ return ParseDirectiveFalign(256, DirectiveID.getLoc());
+ if ((IDVal.lower() == ".lcomm") || (IDVal.lower() == ".lcommon"))
+ return ParseDirectiveComm(true, DirectiveID.getLoc());
+ if ((IDVal.lower() == ".comm") || (IDVal.lower() == ".common"))
+ return ParseDirectiveComm(false, DirectiveID.getLoc());
+ if (IDVal.lower() == ".subsection")
+ return ParseDirectiveSubsection(DirectiveID.getLoc());
+
+ return true;
+}
+bool HexagonAsmParser::ParseDirectiveSubsection(SMLoc L) {
+ const MCExpr *Subsection = 0;
+ int64_t Res;
+
+ assert((getLexer().isNot(AsmToken::EndOfStatement)) &&
+ "Invalid subsection directive");
+ getParser().parseExpression(Subsection);
+
+ if (!Subsection->evaluateAsAbsolute(Res))
+ return Error(L, "Cannot evaluate subsection number");
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in directive");
+
+  // 0-8192 is the hard-coded range in MCObjectStreamer.cpp; this keeps the
+ // negative subsections together and in the same order but at the opposite
+ // end of the section. Only legacy hexagon-gcc created assembly code
+ // used negative subsections.
+ if ((Res < 0) && (Res > -8193))
+ Subsection = MCConstantExpr::create(8192 + Res, this->getContext());
+
+ getStreamer().SubSection(Subsection);
+ return false;
+}
+
+/// ::= .falign [expression]
+bool HexagonAsmParser::ParseDirectiveFalign(unsigned Size, SMLoc L) {
+
+ int64_t MaxBytesToFill = 15;
+
+  // if there is an argument
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ const MCExpr *Value;
+ SMLoc ExprLoc = L;
+
+    // parseExpression() returns false on success.
+    if (getParser().parseExpression(Value) == false) {
+      // Make sure this is a constant that is in range.
+      const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value);
+      if (!MCE)
+        return Error(ExprLoc, "not a valid expression for falign directive");
+      uint64_t IntValue = MCE->getValue();
+ if (!isUIntN(Size, IntValue) && !isIntN(Size, IntValue))
+ return Error(ExprLoc, "literal value out of range (256) for falign");
+ MaxBytesToFill = IntValue;
+ Lex();
+ } else {
+ return Error(ExprLoc, "not a valid expression for falign directive");
+ }
+ }
+
+ getTargetStreamer().emitFAlign(16, MaxBytesToFill);
+ Lex();
+
+ return false;
+}
+
+/// ::= .word [ expression (, expression)* ]
+bool HexagonAsmParser::ParseDirectiveValue(unsigned Size, SMLoc L) {
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+
+ for (;;) {
+ const MCExpr *Value;
+ SMLoc ExprLoc = L;
+ if (getParser().parseExpression(Value))
+ return true;
+
+ // Special case constant expressions to match code generator.
+ if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) {
+ assert(Size <= 8 && "Invalid size");
+ uint64_t IntValue = MCE->getValue();
+ if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue))
+ return Error(ExprLoc, "literal value out of range for directive");
+ getStreamer().EmitIntValue(IntValue, Size);
+ } else
+ getStreamer().EmitValue(Value, Size);
+
+ if (getLexer().is(AsmToken::EndOfStatement))
+ break;
+
+ // FIXME: Improve diagnostic.
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("unexpected token in directive");
+ Lex();
+ }
+ }
+
+ Lex();
+ return false;
+}
+
+// This is largely a copy of AsmParser's ParseDirectiveComm extended to
+// accept a 3rd argument, AccessAlignment, which indicates the smallest
+// memory access made to the symbol, expressed in bytes. If no
+// AccessAlignment is specified it defaults to the Alignment Value.
+// Hexagon's .lcomm:
+// .lcomm Symbol, Length, Alignment, AccessAlignment
+bool HexagonAsmParser::ParseDirectiveComm(bool IsLocal, SMLoc Loc) {
+  // FIXME: need a better way to detect an AsmStreamer (upstream removed
+  // getKind())
+ if (getStreamer().hasRawTextSupport())
+ return true; // Only object file output requires special treatment.
+
+ StringRef Name;
+ if (getParser().parseIdentifier(Name))
+ return TokError("expected identifier in directive");
+ // Handle the identifier as the key symbol.
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
+
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("unexpected token in directive");
+ Lex();
+
+ int64_t Size;
+ SMLoc SizeLoc = getLexer().getLoc();
+ if (getParser().parseAbsoluteExpression(Size))
+ return true;
+
+ int64_t ByteAlignment = 1;
+ SMLoc ByteAlignmentLoc;
+ if (getLexer().is(AsmToken::Comma)) {
+ Lex();
+ ByteAlignmentLoc = getLexer().getLoc();
+ if (getParser().parseAbsoluteExpression(ByteAlignment))
+ return true;
+ if (!isPowerOf2_64(ByteAlignment))
+ return Error(ByteAlignmentLoc, "alignment must be a power of 2");
+ }
+
+ int64_t AccessAlignment = 0;
+ if (getLexer().is(AsmToken::Comma)) {
+ // The optional access argument specifies the size of the smallest memory
+ // access to be made to the symbol, expressed in bytes.
+ SMLoc AccessAlignmentLoc;
+ Lex();
+ AccessAlignmentLoc = getLexer().getLoc();
+ if (getParser().parseAbsoluteExpression(AccessAlignment))
+ return true;
+
+ if (!isPowerOf2_64(AccessAlignment))
+ return Error(AccessAlignmentLoc, "access alignment must be a power of 2");
+ }
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in '.comm' or '.lcomm' directive");
+
+ Lex();
+
+  // NOTE: a size of zero for a .comm should create an undefined symbol,
+  // but a size of zero for a .lcomm creates a bss symbol of size zero.
+ if (Size < 0)
+ return Error(SizeLoc, "invalid '.comm' or '.lcomm' directive size, can't "
+ "be less than zero");
+
+ // NOTE: The alignment in the directive is a power of 2 value, the assembler
+ // may internally end up wanting an alignment in bytes.
+ // FIXME: Diagnose overflow.
+ if (ByteAlignment < 0)
+ return Error(ByteAlignmentLoc, "invalid '.comm' or '.lcomm' directive "
+ "alignment, can't be less than zero");
+
+ if (!Sym->isUndefined())
+ return Error(Loc, "invalid symbol redefinition");
+
+ HexagonMCELFStreamer &HexagonELFStreamer =
+ static_cast<HexagonMCELFStreamer &>(getStreamer());
+ if (IsLocal) {
+ HexagonELFStreamer.HexagonMCEmitLocalCommonSymbol(Sym, Size, ByteAlignment,
+ AccessAlignment);
+ return false;
+ }
+
+ HexagonELFStreamer.HexagonMCEmitCommonSymbol(Sym, Size, ByteAlignment,
+ AccessAlignment);
+ return false;
+}
+
+// Validate a register against the architecture; currently all registers are accepted.
+bool HexagonAsmParser::RegisterMatchesArch(unsigned MatchNum) const {
+ return true;
+}
+
+// extern "C" void LLVMInitializeHexagonAsmLexer();
+
+/// Force static initialization.
+extern "C" void LLVMInitializeHexagonAsmParser() {
+ RegisterMCAsmParser<HexagonAsmParser> X(TheHexagonTarget);
+}
+
+#define GET_MATCHER_IMPLEMENTATION
+#define GET_REGISTER_MATCHER
+#include "HexagonGenAsmMatcher.inc"
+
+namespace {
+bool previousEqual(OperandVector &Operands, size_t Index, StringRef String) {
+ if (Index >= Operands.size())
+ return false;
+ MCParsedAsmOperand &Operand = *Operands[Operands.size() - Index - 1];
+ if (!Operand.isToken())
+ return false;
+ return static_cast<HexagonOperand &>(Operand).getToken().equals_lower(String);
+}
+bool previousIsLoop(OperandVector &Operands, size_t Index) {
+ return previousEqual(Operands, Index, "loop0") ||
+ previousEqual(Operands, Index, "loop1") ||
+ previousEqual(Operands, Index, "sp1loop0") ||
+ previousEqual(Operands, Index, "sp2loop0") ||
+ previousEqual(Operands, Index, "sp3loop0");
+}
+}
+
+bool HexagonAsmParser::splitIdentifier(OperandVector &Operands) {
+ AsmToken const &Token = getParser().getTok();
+ StringRef String = Token.getString();
+ SMLoc Loc = Token.getLoc();
+ getLexer().Lex();
+ do {
+ std::pair<StringRef, StringRef> HeadTail = String.split('.');
+ if (!HeadTail.first.empty())
+ Operands.push_back(HexagonOperand::CreateToken(HeadTail.first, Loc));
+ if (!HeadTail.second.empty())
+ Operands.push_back(HexagonOperand::CreateToken(
+ String.substr(HeadTail.first.size(), 1), Loc));
+ String = HeadTail.second;
+ } while (!String.empty());
+ return false;
+}
+
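+// Descriptive note (added): parse a single operand. When a bare predicate
+// register follows "if" or "if !", e.g. "if p0 jump target", the parser
+// inserts the missing parentheses so the operand list reads as
+// "if (p0) jump target", keeping any trailing ".new" inside the parentheses
+// and optionally emitting a warning.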
+bool HexagonAsmParser::parseOperand(OperandVector &Operands) {
+ unsigned Register;
+ SMLoc Begin;
+ SMLoc End;
+ MCAsmLexer &Lexer = getLexer();
+ if (!ParseRegister(Register, Begin, End)) {
+ if (!ErrorMissingParenthesis)
+ switch (Register) {
+ default:
+ break;
+ case Hexagon::P0:
+ case Hexagon::P1:
+ case Hexagon::P2:
+ case Hexagon::P3:
+ if (previousEqual(Operands, 0, "if")) {
+ if (WarnMissingParenthesis)
+ Warning (Begin, "Missing parenthesis around predicate register");
+ static char const *LParen = "(";
+ static char const *RParen = ")";
+ Operands.push_back(HexagonOperand::CreateToken(LParen, Begin));
+ Operands.push_back(HexagonOperand::CreateReg(Register, Begin, End));
+ AsmToken MaybeDotNew = Lexer.getTok();
+ if (MaybeDotNew.is(AsmToken::TokenKind::Identifier) &&
+ MaybeDotNew.getString().equals_lower(".new"))
+ splitIdentifier(Operands);
+ Operands.push_back(HexagonOperand::CreateToken(RParen, Begin));
+ return false;
+ }
+ if (previousEqual(Operands, 0, "!") &&
+ previousEqual(Operands, 1, "if")) {
+ if (WarnMissingParenthesis)
+ Warning (Begin, "Missing parenthesis around predicate register");
+ static char const *LParen = "(";
+ static char const *RParen = ")";
+ Operands.insert(Operands.end () - 1,
+ HexagonOperand::CreateToken(LParen, Begin));
+ Operands.push_back(HexagonOperand::CreateReg(Register, Begin, End));
+ AsmToken MaybeDotNew = Lexer.getTok();
+ if (MaybeDotNew.is(AsmToken::TokenKind::Identifier) &&
+ MaybeDotNew.getString().equals_lower(".new"))
+ splitIdentifier(Operands);
+ Operands.push_back(HexagonOperand::CreateToken(RParen, Begin));
+ return false;
+ }
+ break;
+ }
+ Operands.push_back(HexagonOperand::CreateReg(
+ Register, Begin, End));
+ return false;
+ }
+ return splitIdentifier(Operands);
+}
+
+bool HexagonAsmParser::isLabel(AsmToken &Token) {
+ MCAsmLexer &Lexer = getLexer();
+ AsmToken const &Second = Lexer.getTok();
+ AsmToken Third = Lexer.peekTok();
+ StringRef String = Token.getString();
+ if (Token.is(AsmToken::TokenKind::LCurly) ||
+ Token.is(AsmToken::TokenKind::RCurly))
+ return false;
+ if (!Token.is(AsmToken::TokenKind::Identifier))
+ return true;
+ if (!MatchRegisterName(String.lower()))
+ return true;
+ (void)Second;
+ assert(Second.is(AsmToken::Colon));
+ StringRef Raw (String.data(), Third.getString().data() - String.data() +
+ Third.getString().size());
+ std::string Collapsed = Raw;
+ Collapsed.erase(std::remove_if(Collapsed.begin(), Collapsed.end(), isspace),
+ Collapsed.end());
+ StringRef Whole = Collapsed;
+ std::pair<StringRef, StringRef> DotSplit = Whole.split('.');
+ if (!MatchRegisterName(DotSplit.first.lower()))
+ return true;
+ return false;
+}
+
+bool HexagonAsmParser::handleNoncontigiousRegister(bool Contigious, SMLoc &Loc) {
+ if (!Contigious && ErrorNoncontigiousRegister) {
+ Error(Loc, "Register name is not contigious");
+ return true;
+ }
+ if (!Contigious && WarnNoncontigiousRegister)
+ Warning(Loc, "Register name is not contiguous");
+ return false;
+}
+
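+// Descriptive note (added): the lexer may split register spellings such as
+// "p0.new" or the pair "r1:0" into several tokens, so this routine glues
+// contiguous lookahead tokens back together, matches the collapsed string
+// against the register table (dotted form first, then the colon-pair form),
+// and un-lexes any unused tail, for example a ".new" suffix, for later
+// parsing.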
+bool HexagonAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) {
+ MCAsmLexer &Lexer = getLexer();
+ StartLoc = getLexer().getLoc();
+ SmallVector<AsmToken, 5> Lookahead;
+ StringRef RawString(Lexer.getTok().getString().data(), 0);
+ bool Again = Lexer.is(AsmToken::Identifier);
+ bool NeededWorkaround = false;
+ while (Again) {
+ AsmToken const &Token = Lexer.getTok();
+ RawString = StringRef(RawString.data(),
+ Token.getString().data() - RawString.data () +
+ Token.getString().size());
+ Lookahead.push_back(Token);
+ Lexer.Lex();
+ bool Contigious = Lexer.getTok().getString().data() ==
+ Lookahead.back().getString().data() +
+ Lookahead.back().getString().size();
+ bool Type = Lexer.is(AsmToken::Identifier) || Lexer.is(AsmToken::Dot) ||
+ Lexer.is(AsmToken::Integer) || Lexer.is(AsmToken::Real) ||
+ Lexer.is(AsmToken::Colon);
+ bool Workaround = Lexer.is(AsmToken::Colon) ||
+ Lookahead.back().is(AsmToken::Colon);
+ Again = (Contigious && Type) || (Workaround && Type);
+ NeededWorkaround = NeededWorkaround || (Again && !(Contigious && Type));
+ }
+ std::string Collapsed = RawString;
+ Collapsed.erase(std::remove_if(Collapsed.begin(), Collapsed.end(), isspace),
+ Collapsed.end());
+ StringRef FullString = Collapsed;
+ std::pair<StringRef, StringRef> DotSplit = FullString.split('.');
+ unsigned DotReg = MatchRegisterName(DotSplit.first.lower());
+ if (DotReg != Hexagon::NoRegister && RegisterMatchesArch(DotReg)) {
+ if (DotSplit.second.empty()) {
+ RegNo = DotReg;
+ EndLoc = Lexer.getLoc();
+ if (handleNoncontigiousRegister(!NeededWorkaround, StartLoc))
+ return true;
+ return false;
+ } else {
+ RegNo = DotReg;
+ size_t First = RawString.find('.');
+ StringRef DotString (RawString.data() + First, RawString.size() - First);
+ Lexer.UnLex(AsmToken(AsmToken::Identifier, DotString));
+ EndLoc = Lexer.getLoc();
+ if (handleNoncontigiousRegister(!NeededWorkaround, StartLoc))
+ return true;
+ return false;
+ }
+ }
+ std::pair<StringRef, StringRef> ColonSplit = StringRef(FullString).split(':');
+ unsigned ColonReg = MatchRegisterName(ColonSplit.first.lower());
+ if (ColonReg != Hexagon::NoRegister && RegisterMatchesArch(ColonReg)) {
+ Lexer.UnLex(Lookahead.back());
+ Lookahead.pop_back();
+ Lexer.UnLex(Lookahead.back());
+ Lookahead.pop_back();
+ RegNo = ColonReg;
+ EndLoc = Lexer.getLoc();
+ if (handleNoncontigiousRegister(!NeededWorkaround, StartLoc))
+ return true;
+ return false;
+ }
+ while (!Lookahead.empty()) {
+ Lexer.UnLex(Lookahead.back());
+ Lookahead.pop_back();
+ }
+ return true;
+}
+
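+// Descriptive note (added): returns true when the next operand position is
+// known to hold an expression rather than a register, e.g. right after
+// "call", after "jump" when no ":" follows, inside "loop0(...)"-style loop
+// headers, or after a "jump ... :t"/":nt" hint.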
+bool HexagonAsmParser::implicitExpressionLocation(OperandVector &Operands) {
+ if (previousEqual(Operands, 0, "call"))
+ return true;
+ if (previousEqual(Operands, 0, "jump"))
+ if (!getLexer().getTok().is(AsmToken::Colon))
+ return true;
+ if (previousEqual(Operands, 0, "(") && previousIsLoop(Operands, 1))
+ return true;
+ if (previousEqual(Operands, 1, ":") && previousEqual(Operands, 2, "jump") &&
+ (previousEqual(Operands, 0, "nt") || previousEqual(Operands, 0, "t")))
+ return true;
+ return false;
+}
+
+bool HexagonAsmParser::parseExpression(MCExpr const *& Expr) {
+ llvm::SmallVector<AsmToken, 4> Tokens;
+ MCAsmLexer &Lexer = getLexer();
+ bool Done = false;
+ static char const * Comma = ",";
+ do {
+ Tokens.emplace_back (Lexer.getTok());
+ Lexer.Lex();
+ switch (Tokens.back().getKind())
+ {
+ case AsmToken::TokenKind::Hash:
+ if (Tokens.size () > 1)
+ if ((Tokens.end () - 2)->getKind() == AsmToken::TokenKind::Plus) {
+ Tokens.insert(Tokens.end() - 2,
+ AsmToken(AsmToken::TokenKind::Comma, Comma));
+ Done = true;
+ }
+ break;
+ case AsmToken::TokenKind::RCurly:
+ case AsmToken::TokenKind::EndOfStatement:
+ case AsmToken::TokenKind::Eof:
+ Done = true;
+ break;
+ default:
+ break;
+ }
+ } while (!Done);
+ while (!Tokens.empty()) {
+ Lexer.UnLex(Tokens.back());
+ Tokens.pop_back();
+ }
+ return getParser().parseExpression(Expr);
+}
+
+bool HexagonAsmParser::parseExpressionOrOperand(OperandVector &Operands) {
+ if (implicitExpressionLocation(Operands)) {
+ MCAsmParser &Parser = getParser();
+ SMLoc Loc = Parser.getLexer().getLoc();
+ std::unique_ptr<HexagonOperand> Expr =
+ HexagonOperand::CreateImm(nullptr, Loc, Loc);
+ MCExpr const *& Val = Expr->Imm.Val;
+ Operands.push_back(std::move(Expr));
+ return parseExpression(Val);
+ }
+ return parseOperand(Operands);
+}
+
+/// Parse an instruction.
+bool HexagonAsmParser::parseInstruction(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ MCAsmLexer &Lexer = getLexer();
+ while (true) {
+ AsmToken const &Token = Parser.getTok();
+ switch (Token.getKind()) {
+ case AsmToken::EndOfStatement: {
+ Lexer.Lex();
+ return false;
+ }
+ case AsmToken::LCurly: {
+ if (!Operands.empty())
+ return true;
+ Operands.push_back(
+ HexagonOperand::CreateToken(Token.getString(), Token.getLoc()));
+ Lexer.Lex();
+ return false;
+ }
+ case AsmToken::RCurly: {
+ if (Operands.empty()) {
+ Operands.push_back(
+ HexagonOperand::CreateToken(Token.getString(), Token.getLoc()));
+ Lexer.Lex();
+ }
+ return false;
+ }
+ case AsmToken::Comma: {
+ Lexer.Lex();
+ continue;
+ }
+ case AsmToken::EqualEqual:
+ case AsmToken::ExclaimEqual:
+ case AsmToken::GreaterEqual:
+ case AsmToken::GreaterGreater:
+ case AsmToken::LessEqual:
+ case AsmToken::LessLess: {
+ Operands.push_back(HexagonOperand::CreateToken(
+ Token.getString().substr(0, 1), Token.getLoc()));
+ Operands.push_back(HexagonOperand::CreateToken(
+ Token.getString().substr(1, 1), Token.getLoc()));
+ Lexer.Lex();
+ continue;
+ }
+ case AsmToken::Hash: {
+ bool MustNotExtend = false;
+ bool ImplicitExpression = implicitExpressionLocation(Operands);
+ std::unique_ptr<HexagonOperand> Expr = HexagonOperand::CreateImm(
+ nullptr, Lexer.getLoc(), Lexer.getLoc());
+ if (!ImplicitExpression)
+ Operands.push_back(
+ HexagonOperand::CreateToken(Token.getString(), Token.getLoc()));
+ Lexer.Lex();
+ bool MustExtend = false;
+ bool HiOnly = false;
+ bool LoOnly = false;
+ if (Lexer.is(AsmToken::Hash)) {
+ Lexer.Lex();
+ MustExtend = true;
+ } else if (ImplicitExpression)
+ MustNotExtend = true;
+ AsmToken const &Token = Parser.getTok();
+ if (Token.is(AsmToken::Identifier)) {
+ StringRef String = Token.getString();
+ AsmToken IDToken = Token;
+ if (String.lower() == "hi") {
+ HiOnly = true;
+ } else if (String.lower() == "lo") {
+ LoOnly = true;
+ }
+ if (HiOnly || LoOnly) {
+ AsmToken LParen = Lexer.peekTok();
+ if (!LParen.is(AsmToken::LParen)) {
+ HiOnly = false;
+ LoOnly = false;
+ } else {
+ Lexer.Lex();
+ }
+ }
+ }
+ if (parseExpression(Expr->Imm.Val))
+ return true;
+ int64_t Value;
+ MCContext &Context = Parser.getContext();
+ assert(Expr->Imm.Val != nullptr);
+ if (Expr->Imm.Val->evaluateAsAbsolute(Value)) {
+ if (HiOnly)
+ Expr->Imm.Val = MCBinaryExpr::createLShr(
+ Expr->Imm.Val, MCConstantExpr::create(16, Context), Context);
+ if (HiOnly || LoOnly)
+ Expr->Imm.Val = MCBinaryExpr::createAnd(
+ Expr->Imm.Val, MCConstantExpr::create(0xffff, Context), Context);
+ }
+ if (MustNotExtend)
+ Expr->Imm.Val = HexagonNoExtendOperand::Create(Expr->Imm.Val, Context);
+ Expr->Imm.MustExtend = MustExtend;
+ Operands.push_back(std::move(Expr));
+ continue;
+ }
+ default:
+ break;
+ }
+ if (parseExpressionOrOperand(Operands))
+ return true;
+ }
+}
+
+bool HexagonAsmParser::ParseInstruction(ParseInstructionInfo &Info,
+ StringRef Name,
+ AsmToken ID,
+ OperandVector &Operands) {
+ getLexer().UnLex(ID);
+ return parseInstruction(Operands);
+}
+
+namespace {
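+// Descriptive note (added): small helper that builds an
+// "Rdd = combine(MO1, MO2)"-style MCInst for the given opcode.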
+MCInst makeCombineInst(int opCode, MCOperand &Rdd,
+ MCOperand &MO1, MCOperand &MO2) {
+ MCInst TmpInst;
+ TmpInst.setOpcode(opCode);
+ TmpInst.addOperand(Rdd);
+ TmpInst.addOperand(MO1);
+ TmpInst.addOperand(MO2);
+
+ return TmpInst;
+}
+}
+
+// Define this matcher function after the auto-generated include so we
+// have the match class enum definitions.
+unsigned HexagonAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
+ unsigned Kind) {
+ HexagonOperand *Op = static_cast<HexagonOperand *>(&AsmOp);
+
+ switch (Kind) {
+ case MCK_0: {
+ int64_t Value;
+ return Op->isImm() && Op->Imm.Val->evaluateAsAbsolute(Value) && Value == 0
+ ? Match_Success
+ : Match_InvalidOperand;
+ }
+ case MCK_1: {
+ int64_t Value;
+ return Op->isImm() && Op->Imm.Val->evaluateAsAbsolute(Value) && Value == 1
+ ? Match_Success
+ : Match_InvalidOperand;
+ }
+ case MCK__MINUS_1: {
+ int64_t Value;
+ return Op->isImm() && Op->Imm.Val->evaluateAsAbsolute(Value) && Value == -1
+ ? Match_Success
+ : Match_InvalidOperand;
+ }
+ }
+ if (Op->Kind == HexagonOperand::Token && Kind != InvalidMatchClass) {
+ StringRef myStringRef = StringRef(Op->Tok.Data, Op->Tok.Length);
+ if (matchTokenString(myStringRef.lower()) == (MatchClassKind)Kind)
+ return Match_Success;
+ if (matchTokenString(myStringRef.upper()) == (MatchClassKind)Kind)
+ return Match_Success;
+ }
+
+ DEBUG(dbgs() << "Unmatched Operand:");
+ DEBUG(Op->dump());
+ DEBUG(dbgs() << "\n");
+
+ return Match_InvalidOperand;
+}
+
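+// Descriptive note (added): emit a range diagnostic for an immediate. For a
+// non-negative Max the message reads, with illustrative values, like
+// "value 300(0x12c) out of range: 0-255"; for a negative Max the signed
+// range "Max" through "-Max-1" is printed instead.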
+void HexagonAsmParser::OutOfRange(SMLoc IDLoc, long long Val, long long Max) {
+ std::string errStr;
+ raw_string_ostream ES(errStr);
+ ES << "value " << Val << "(" << format_hex(Val, 0) << ") out of range: ";
+ if (Max >= 0)
+ ES << "0-" << Max;
+ else
+ ES << Max << "-" << (-Max - 1);
+ Error(IDLoc, ES.str().c_str());
+}
+
+int HexagonAsmParser::processInstruction(MCInst &Inst,
+ OperandVector const &Operands,
+ SMLoc IDLoc, bool &MustExtend) {
+ MCContext &Context = getParser().getContext();
+ const MCRegisterInfo *RI = getContext().getRegisterInfo();
+ std::string r = "r";
+ std::string v = "v";
+ std::string Colon = ":";
+
+ bool is32bit = false; // used to distinguish between CONST32 and CONST64
+ switch (Inst.getOpcode()) {
+ default:
+ break;
+
+ case Hexagon::M4_mpyrr_addr:
+ case Hexagon::S4_addi_asl_ri:
+ case Hexagon::S4_addi_lsr_ri:
+ case Hexagon::S4_andi_asl_ri:
+ case Hexagon::S4_andi_lsr_ri:
+ case Hexagon::S4_ori_asl_ri:
+ case Hexagon::S4_ori_lsr_ri:
+ case Hexagon::S4_or_andix:
+ case Hexagon::S4_subi_asl_ri:
+ case Hexagon::S4_subi_lsr_ri: {
+ MCOperand &Ry = Inst.getOperand(0);
+ MCOperand &src = Inst.getOperand(2);
+ if (RI->getEncodingValue(Ry.getReg()) != RI->getEncodingValue(src.getReg()))
+ return Match_InvalidOperand;
+ break;
+ }
+
+ case Hexagon::C2_cmpgei: {
+ MCOperand &MO = Inst.getOperand(2);
+ MO.setExpr(MCBinaryExpr::createSub(
+ MO.getExpr(), MCConstantExpr::create(1, Context), Context));
+ Inst.setOpcode(Hexagon::C2_cmpgti);
+ break;
+ }
+
+ case Hexagon::C2_cmpgeui: {
+ MCOperand &MO = Inst.getOperand(2);
+ int64_t Value;
+ bool Success = MO.getExpr()->evaluateAsAbsolute(Value);
+ (void)Success;
+ assert(Success && "Assured by matcher");
+ if (Value == 0) {
+ MCInst TmpInst;
+ MCOperand &Pd = Inst.getOperand(0);
+ MCOperand &Rt = Inst.getOperand(1);
+ TmpInst.setOpcode(Hexagon::C2_cmpeq);
+ TmpInst.addOperand(Pd);
+ TmpInst.addOperand(Rt);
+ TmpInst.addOperand(Rt);
+ Inst = TmpInst;
+ } else {
+ MO.setExpr(MCBinaryExpr::createSub(
+ MO.getExpr(), MCConstantExpr::create(1, Context), Context));
+ Inst.setOpcode(Hexagon::C2_cmpgtui);
+ }
+ break;
+ }
+ case Hexagon::J2_loop1r:
+ case Hexagon::J2_loop1i:
+ case Hexagon::J2_loop0r:
+ case Hexagon::J2_loop0i: {
+ MCOperand &MO = Inst.getOperand(0);
+ // The loop instruction has different opcodes for the extended and
+ // non-extended forms, but we should not use the other opcode, as it is a
+ // legacy artifact of the TD files.
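+ // Illustrative example (added): "loop0(start, #20)" can remain unextended,
+ // while writing "loop0(start, ##20)" or using a count outside the encodable
+ // range forces an immediate extender.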
+ int64_t Value;
+ if (MO.getExpr()->evaluateAsAbsolute(Value)) {
+ // if the operand can fit within a 7:2 field
+ if (Value < (1 << 8) && Value >= -(1 << 8)) {
+ SMLoc myLoc = Operands[2]->getStartLoc();
+ // A '#' is left at the start location in the case of '##'.
+ // If '##' is found, force extension.
+ if (*myLoc.getPointer() == '#') {
+ MustExtend = true;
+ break;
+ }
+ } else {
+ // If immediate and out of 7:2 range.
+ MustExtend = true;
+ }
+ }
+ break;
+ }
+
+ // Translate a "$Rdd = $Rss" to "$Rdd = combine($Rs, $Rt)"
+ case Hexagon::A2_tfrp: {
+ MCOperand &MO = Inst.getOperand(1);
+ unsigned int RegPairNum = RI->getEncodingValue(MO.getReg());
+ std::string R1 = r + llvm::utostr_32(RegPairNum + 1);
+ StringRef Reg1(R1);
+ MO.setReg(MatchRegisterName(Reg1));
+ // Add a new operand for the second register in the pair.
+ std::string R2 = r + llvm::utostr_32(RegPairNum);
+ StringRef Reg2(R2);
+ Inst.addOperand(MCOperand::createReg(MatchRegisterName(Reg2)));
+ Inst.setOpcode(Hexagon::A2_combinew);
+ break;
+ }
+
+ case Hexagon::A2_tfrpt:
+ case Hexagon::A2_tfrpf: {
+ MCOperand &MO = Inst.getOperand(2);
+ unsigned int RegPairNum = RI->getEncodingValue(MO.getReg());
+ std::string R1 = r + llvm::utostr_32(RegPairNum + 1);
+ StringRef Reg1(R1);
+ MO.setReg(MatchRegisterName(Reg1));
+ // Add a new operand for the second register in the pair.
+ std::string R2 = r + llvm::utostr_32(RegPairNum);
+ StringRef Reg2(R2);
+ Inst.addOperand(MCOperand::createReg(MatchRegisterName(Reg2)));
+ Inst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrpt)
+ ? Hexagon::C2_ccombinewt
+ : Hexagon::C2_ccombinewf);
+ break;
+ }
+ case Hexagon::A2_tfrptnew:
+ case Hexagon::A2_tfrpfnew: {
+ MCOperand &MO = Inst.getOperand(2);
+ unsigned int RegPairNum = RI->getEncodingValue(MO.getReg());
+ std::string R1 = r + llvm::utostr_32(RegPairNum + 1);
+ StringRef Reg1(R1);
+ MO.setReg(MatchRegisterName(Reg1));
+ // Add a new operand for the second register in the pair.
+ std::string R2 = r + llvm::utostr_32(RegPairNum);
+ StringRef Reg2(R2);
+ Inst.addOperand(MCOperand::createReg(MatchRegisterName(Reg2)));
+ Inst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrptnew)
+ ? Hexagon::C2_ccombinewnewt
+ : Hexagon::C2_ccombinewnewf);
+ break;
+ }
+
+ // Translate a "$Rx = CONST32(#imm)" to "$Rx = memw(gp+#LABEL) "
+ case Hexagon::CONST32:
+ case Hexagon::CONST32_Float_Real:
+ case Hexagon::CONST32_Int_Real:
+ case Hexagon::FCONST32_nsdata:
+ is32bit = true;
+ // Translate a "$Rx:y = CONST64(#imm)" to "$Rx:y = memd(gp+#LABEL) "
+ case Hexagon::CONST64_Float_Real:
+ case Hexagon::CONST64_Int_Real:
+
+ // FIXME: need a better way to detect an AsmStreamer (upstream removed getKind()).
+ if (!Parser.getStreamer().hasRawTextSupport()) {
+ MCELFStreamer *MES = static_cast<MCELFStreamer *>(&Parser.getStreamer());
+ MCOperand &MO_1 = Inst.getOperand(1);
+ MCOperand &MO_0 = Inst.getOperand(0);
+
+ // push section onto section stack
+ MES->PushSection();
+
+ std::string myCharStr;
+ MCSectionELF *mySection;
+
+ // Check if this is an immediate or a symbol.
+ int64_t Value;
+ bool Absolute = MO_1.getExpr()->evaluateAsAbsolute(Value);
+ if (Absolute) {
+ // Create a new section - one for each constant
+ // Some or all of the zeros are replaced with the given immediate.
+ if (is32bit) {
+ std::string myImmStr = utohexstr(static_cast<uint32_t>(Value));
+ myCharStr = StringRef(".gnu.linkonce.l4.CONST_00000000")
+ .drop_back(myImmStr.size())
+ .str() +
+ myImmStr;
+ } else {
+ std::string myImmStr = utohexstr(Value);
+ myCharStr = StringRef(".gnu.linkonce.l8.CONST_0000000000000000")
+ .drop_back(myImmStr.size())
+ .str() +
+ myImmStr;
+ }
+
+ mySection = getContext().getELFSection(myCharStr, ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE);
+ } else if (MO_1.isExpr()) {
+ // .lita - for expressions
+ myCharStr = ".lita";
+ mySection = getContext().getELFSection(myCharStr, ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE);
+ } else
+ llvm_unreachable("unexpected type of machine operand!");
+
+ MES->SwitchSection(mySection);
+ unsigned byteSize = is32bit ? 4 : 8;
+ getStreamer().EmitCodeAlignment(byteSize, byteSize);
+
+ MCSymbol *Sym;
+
+ // for symbols, get rid of prepended ".gnu.linkonce.lx."
+
+ // emit symbol if needed
+ if (Absolute) {
+ Sym = getContext().getOrCreateSymbol(StringRef(myCharStr.c_str() + 16));
+ if (Sym->isUndefined()) {
+ getStreamer().EmitLabel(Sym);
+ getStreamer().EmitSymbolAttribute(Sym, MCSA_Global);
+ getStreamer().EmitIntValue(Value, byteSize);
+ }
+ } else if (MO_1.isExpr()) {
+ const char *StringStart = nullptr;
+ const char *StringEnd = nullptr;
+ if (*Operands[4]->getStartLoc().getPointer() == '#') {
+ StringStart = Operands[5]->getStartLoc().getPointer();
+ StringEnd = Operands[6]->getStartLoc().getPointer();
+ } else { // no pound
+ StringStart = Operands[4]->getStartLoc().getPointer();
+ StringEnd = Operands[5]->getStartLoc().getPointer();
+ }
+
+ unsigned size = StringEnd - StringStart;
+ std::string DotConst = ".CONST_";
+ Sym = getContext().getOrCreateSymbol(DotConst +
+ StringRef(StringStart, size));
+
+ if (Sym->isUndefined()) {
+ // case where symbol is not yet defined: emit symbol
+ getStreamer().EmitLabel(Sym);
+ getStreamer().EmitSymbolAttribute(Sym, MCSA_Local);
+ getStreamer().EmitValue(MO_1.getExpr(), 4);
+ }
+ } else
+ llvm_unreachable("unexpected type of machine operand!");
+
+ MES->PopSection();
+
+ if (Sym) {
+ MCInst TmpInst;
+ if (is32bit) // 32 bit
+ TmpInst.setOpcode(Hexagon::L2_loadrigp);
+ else // 64 bit
+ TmpInst.setOpcode(Hexagon::L2_loadrdgp);
+
+ TmpInst.addOperand(MO_0);
+ TmpInst.addOperand(
+ MCOperand::createExpr(MCSymbolRefExpr::create(Sym, getContext())));
+ Inst = TmpInst;
+ }
+ }
+ break;
+
+ // Translate a "$Rdd = #-imm" to "$Rdd = combine(#[-1,0], #-imm)"
+ case Hexagon::A2_tfrpi: {
+ MCOperand &Rdd = Inst.getOperand(0);
+ MCOperand &MO = Inst.getOperand(1);
+ int64_t Value;
+ int sVal = (MO.getExpr()->evaluateAsAbsolute(Value) && Value < 0) ? -1 : 0;
+ MCOperand imm(MCOperand::createExpr(MCConstantExpr::create(sVal, Context)));
+ Inst = makeCombineInst(Hexagon::A2_combineii, Rdd, imm, MO);
+ break;
+ }
+
+ // Translate a "$Rdd = [#]#imm" to "$Rdd = combine(#, [#]#imm)"
+ case Hexagon::TFRI64_V4: {
+ MCOperand &Rdd = Inst.getOperand(0);
+ MCOperand &MO = Inst.getOperand(1);
+ int64_t Value;
+ if (MO.getExpr()->evaluateAsAbsolute(Value)) {
+ unsigned long long u64 = Value;
+ signed int s8 = (u64 >> 32) & 0xFFFFFFFF;
+ if (s8 < -128 || s8 > 127)
+ OutOfRange(IDLoc, s8, -128);
+ MCOperand imm(MCOperand::createExpr(
+ MCConstantExpr::create(s8, Context))); // upper 32
+ MCOperand imm2(MCOperand::createExpr(
+ MCConstantExpr::create(u64 & 0xFFFFFFFF, Context))); // lower 32
+ Inst = makeCombineInst(Hexagon::A4_combineii, Rdd, imm, imm2);
+ } else {
+ MCOperand imm(MCOperand::createExpr(
+ MCConstantExpr::create(0, Context))); // upper 32
+ Inst = makeCombineInst(Hexagon::A4_combineii, Rdd, imm, MO);
+ }
+ break;
+ }
+
+ // Handle "$Rdd = combine(##imm, #imm)"
+ case Hexagon::TFRI64_V2_ext: {
+ MCOperand &Rdd = Inst.getOperand(0);
+ MCOperand &MO1 = Inst.getOperand(1);
+ MCOperand &MO2 = Inst.getOperand(2);
+ int64_t Value;
+ if (MO2.getExpr()->evaluateAsAbsolute(Value)) {
+ int s8 = Value;
+ if (s8 < -128 || s8 > 127)
+ OutOfRange(IDLoc, s8, -128);
+ }
+ Inst = makeCombineInst(Hexagon::A2_combineii, Rdd, MO1, MO2);
+ break;
+ }
+
+ // Handle "$Rdd = combine(#imm, ##imm)"
+ case Hexagon::A4_combineii: {
+ MCOperand &Rdd = Inst.getOperand(0);
+ MCOperand &MO1 = Inst.getOperand(1);
+ int64_t Value;
+ if (MO1.getExpr()->evaluateAsAbsolute(Value)) {
+ int s8 = Value;
+ if (s8 < -128 || s8 > 127)
+ OutOfRange(IDLoc, s8, -128);
+ }
+ MCOperand &MO2 = Inst.getOperand(2);
+ Inst = makeCombineInst(Hexagon::A4_combineii, Rdd, MO1, MO2);
+ break;
+ }
+
+ case Hexagon::S2_tableidxb_goodsyntax: {
+ Inst.setOpcode(Hexagon::S2_tableidxb);
+ break;
+ }
+
+ case Hexagon::S2_tableidxh_goodsyntax: {
+ MCInst TmpInst;
+ MCOperand &Rx = Inst.getOperand(0);
+ MCOperand &_dst_ = Inst.getOperand(1);
+ MCOperand &Rs = Inst.getOperand(2);
+ MCOperand &Imm4 = Inst.getOperand(3);
+ MCOperand &Imm6 = Inst.getOperand(4);
+ Imm6.setExpr(MCBinaryExpr::createSub(
+ Imm6.getExpr(), MCConstantExpr::create(1, Context), Context));
+ TmpInst.setOpcode(Hexagon::S2_tableidxh);
+ TmpInst.addOperand(Rx);
+ TmpInst.addOperand(_dst_);
+ TmpInst.addOperand(Rs);
+ TmpInst.addOperand(Imm4);
+ TmpInst.addOperand(Imm6);
+ Inst = TmpInst;
+ break;
+ }
+
+ case Hexagon::S2_tableidxw_goodsyntax: {
+ MCInst TmpInst;
+ MCOperand &Rx = Inst.getOperand(0);
+ MCOperand &_dst_ = Inst.getOperand(1);
+ MCOperand &Rs = Inst.getOperand(2);
+ MCOperand &Imm4 = Inst.getOperand(3);
+ MCOperand &Imm6 = Inst.getOperand(4);
+ Imm6.setExpr(MCBinaryExpr::createSub(
+ Imm6.getExpr(), MCConstantExpr::create(2, Context), Context));
+ TmpInst.setOpcode(Hexagon::S2_tableidxw);
+ TmpInst.addOperand(Rx);
+ TmpInst.addOperand(_dst_);
+ TmpInst.addOperand(Rs);
+ TmpInst.addOperand(Imm4);
+ TmpInst.addOperand(Imm6);
+ Inst = TmpInst;
+ break;
+ }
+
+ case Hexagon::S2_tableidxd_goodsyntax: {
+ MCInst TmpInst;
+ MCOperand &Rx = Inst.getOperand(0);
+ MCOperand &_dst_ = Inst.getOperand(1);
+ MCOperand &Rs = Inst.getOperand(2);
+ MCOperand &Imm4 = Inst.getOperand(3);
+ MCOperand &Imm6 = Inst.getOperand(4);
+ Imm6.setExpr(MCBinaryExpr::createSub(
+ Imm6.getExpr(), MCConstantExpr::create(3, Context), Context));
+ TmpInst.setOpcode(Hexagon::S2_tableidxd);
+ TmpInst.addOperand(Rx);
+ TmpInst.addOperand(_dst_);
+ TmpInst.addOperand(Rs);
+ TmpInst.addOperand(Imm4);
+ TmpInst.addOperand(Imm6);
+ Inst = TmpInst;
+ break;
+ }
+
+ case Hexagon::M2_mpyui: {
+ Inst.setOpcode(Hexagon::M2_mpyi);
+ break;
+ }
+ case Hexagon::M2_mpysmi: {
+ MCInst TmpInst;
+ MCOperand &Rd = Inst.getOperand(0);
+ MCOperand &Rs = Inst.getOperand(1);
+ MCOperand &Imm = Inst.getOperand(2);
+ int64_t Value;
+ bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value);
+ assert(Absolute);
+ (void)Absolute;
+ if (!MustExtend) {
+ if (Value < 0 && Value > -256) {
+ Imm.setExpr(MCConstantExpr::create(Value * -1, Context));
+ TmpInst.setOpcode(Hexagon::M2_mpysin);
+ } else if (Value < 256 && Value >= 0)
+ TmpInst.setOpcode(Hexagon::M2_mpysip);
+ else
+ return Match_InvalidOperand;
+ } else {
+ if (Value >= 0)
+ TmpInst.setOpcode(Hexagon::M2_mpysip);
+ else
+ return Match_InvalidOperand;
+ }
+ TmpInst.addOperand(Rd);
+ TmpInst.addOperand(Rs);
+ TmpInst.addOperand(Imm);
+ Inst = TmpInst;
+ break;
+ }
+
+ case Hexagon::S2_asr_i_r_rnd_goodsyntax: {
+ MCOperand &Imm = Inst.getOperand(2);
+ MCInst TmpInst;
+ int64_t Value;
+ bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value);
+ assert(Absolute);
+ (void)Absolute;
+ if (Value == 0) { // convert to $Rd = $Rs
+ TmpInst.setOpcode(Hexagon::A2_tfr);
+ MCOperand &Rd = Inst.getOperand(0);
+ MCOperand &Rs = Inst.getOperand(1);
+ TmpInst.addOperand(Rd);
+ TmpInst.addOperand(Rs);
+ } else {
+ Imm.setExpr(MCBinaryExpr::createSub(
+ Imm.getExpr(), MCConstantExpr::create(1, Context), Context));
+ TmpInst.setOpcode(Hexagon::S2_asr_i_r_rnd);
+ MCOperand &Rd = Inst.getOperand(0);
+ MCOperand &Rs = Inst.getOperand(1);
+ TmpInst.addOperand(Rd);
+ TmpInst.addOperand(Rs);
+ TmpInst.addOperand(Imm);
+ }
+ Inst = TmpInst;
+ break;
+ }
+
+ case Hexagon::S2_asr_i_p_rnd_goodsyntax: {
+ MCOperand &Rdd = Inst.getOperand(0);
+ MCOperand &Rss = Inst.getOperand(1);
+ MCOperand &Imm = Inst.getOperand(2);
+ int64_t Value;
+ bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value);
+ assert(Absolute);
+ (void)Absolute;
+ if (Value == 0) { // convert to $Rdd = combine ($Rs[0], $Rs[1])
+ MCInst TmpInst;
+ unsigned int RegPairNum = RI->getEncodingValue(Rss.getReg());
+ std::string R1 = r + llvm::utostr_32(RegPairNum + 1);
+ StringRef Reg1(R1);
+ Rss.setReg(MatchRegisterName(Reg1));
+ // Add a new operand for the second register in the pair.
+ std::string R2 = r + llvm::utostr_32(RegPairNum);
+ StringRef Reg2(R2);
+ TmpInst.setOpcode(Hexagon::A2_combinew);
+ TmpInst.addOperand(Rdd);
+ TmpInst.addOperand(Rss);
+ TmpInst.addOperand(MCOperand::createReg(MatchRegisterName(Reg2)));
+ Inst = TmpInst;
+ } else {
+ Imm.setExpr(MCBinaryExpr::createSub(
+ Imm.getExpr(), MCConstantExpr::create(1, Context), Context));
+ Inst.setOpcode(Hexagon::S2_asr_i_p_rnd);
+ }
+ break;
+ }
+
+ case Hexagon::A4_boundscheck: {
+ MCOperand &Rs = Inst.getOperand(1);
+ unsigned int RegNum = RI->getEncodingValue(Rs.getReg());
+ if (RegNum & 1) { // Odd mapped to raw:hi, regpair is rodd:odd-1, like r3:2
+ Inst.setOpcode(Hexagon::A4_boundscheck_hi);
+ std::string Name =
+ r + llvm::utostr_32(RegNum) + Colon + llvm::utostr_32(RegNum - 1);
+ StringRef RegPair = Name;
+ Rs.setReg(MatchRegisterName(RegPair));
+ } else { // raw:lo
+ Inst.setOpcode(Hexagon::A4_boundscheck_lo);
+ std::string Name =
+ r + llvm::utostr_32(RegNum + 1) + Colon + llvm::utostr_32(RegNum);
+ StringRef RegPair = Name;
+ Rs.setReg(MatchRegisterName(RegPair));
+ }
+ break;
+ }
+
+ case Hexagon::A2_addsp: {
+ MCOperand &Rs = Inst.getOperand(1);
+ unsigned int RegNum = RI->getEncodingValue(Rs.getReg());
+ if (RegNum & 1) { // Odd mapped to raw:hi
+ Inst.setOpcode(Hexagon::A2_addsph);
+ std::string Name =
+ r + llvm::utostr_32(RegNum) + Colon + llvm::utostr_32(RegNum - 1);
+ StringRef RegPair = Name;
+ Rs.setReg(MatchRegisterName(RegPair));
+ } else { // Even mapped raw:lo
+ Inst.setOpcode(Hexagon::A2_addspl);
+ std::string Name =
+ r + llvm::utostr_32(RegNum + 1) + Colon + llvm::utostr_32(RegNum);
+ StringRef RegPair = Name;
+ Rs.setReg(MatchRegisterName(RegPair));
+ }
+ break;
+ }
+
+ case Hexagon::M2_vrcmpys_s1: {
+ MCOperand &Rt = Inst.getOperand(2);
+ unsigned int RegNum = RI->getEncodingValue(Rt.getReg());
+ if (RegNum & 1) { // Odd mapped to sat:raw:hi
+ Inst.setOpcode(Hexagon::M2_vrcmpys_s1_h);
+ std::string Name =
+ r + llvm::utostr_32(RegNum) + Colon + llvm::utostr_32(RegNum - 1);
+ StringRef RegPair = Name;
+ Rt.setReg(MatchRegisterName(RegPair));
+ } else { // Even mapped sat:raw:lo
+ Inst.setOpcode(Hexagon::M2_vrcmpys_s1_l);
+ std::string Name =
+ r + llvm::utostr_32(RegNum + 1) + Colon + llvm::utostr_32(RegNum);
+ StringRef RegPair = Name;
+ Rt.setReg(MatchRegisterName(RegPair));
+ }
+ break;
+ }
+
+ case Hexagon::M2_vrcmpys_acc_s1: {
+ MCInst TmpInst;
+ MCOperand &Rxx = Inst.getOperand(0);
+ MCOperand &Rss = Inst.getOperand(2);
+ MCOperand &Rt = Inst.getOperand(3);
+ unsigned int RegNum = RI->getEncodingValue(Rt.getReg());
+ if (RegNum & 1) { // Odd mapped to sat:raw:hi
+ TmpInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_h);
+ std::string Name =
+ r + llvm::utostr_32(RegNum) + Colon + llvm::utostr_32(RegNum - 1);
+ StringRef RegPair = Name;
+ Rt.setReg(MatchRegisterName(RegPair));
+ } else { // Even mapped sat:raw:lo
+ TmpInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_l);
+ std::string Name =
+ r + llvm::utostr_32(RegNum + 1) + Colon + llvm::utostr_32(RegNum);
+ StringRef RegPair = Name;
+ Rt.setReg(MatchRegisterName(RegPair));
+ }
+ // Registers are in different positions
+ TmpInst.addOperand(Rxx);
+ TmpInst.addOperand(Rxx);
+ TmpInst.addOperand(Rss);
+ TmpInst.addOperand(Rt);
+ Inst = TmpInst;
+ break;
+ }
+
+ case Hexagon::M2_vrcmpys_s1rp: {
+ MCOperand &Rt = Inst.getOperand(2);
+ unsigned int RegNum = RI->getEncodingValue(Rt.getReg());
+ if (RegNum & 1) { // Odd mapped to rnd:sat:raw:hi
+ Inst.setOpcode(Hexagon::M2_vrcmpys_s1rp_h);
+ std::string Name =
+ r + llvm::utostr_32(RegNum) + Colon + llvm::utostr_32(RegNum - 1);
+ StringRef RegPair = Name;
+ Rt.setReg(MatchRegisterName(RegPair));
+ } else { // Even mapped rnd:sat:raw:lo
+ Inst.setOpcode(Hexagon::M2_vrcmpys_s1rp_l);
+ std::string Name =
+ r + llvm::utostr_32(RegNum + 1) + Colon + llvm::utostr_32(RegNum);
+ StringRef RegPair = Name;
+ Rt.setReg(MatchRegisterName(RegPair));
+ }
+ break;
+ }
+
+ case Hexagon::S5_asrhub_rnd_sat_goodsyntax: {
+ MCOperand &Imm = Inst.getOperand(2);
+ int64_t Value;
+ bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value);
+ assert(Absolute);
+ (void)Absolute;
+ if (Value == 0)
+ Inst.setOpcode(Hexagon::S2_vsathub);
+ else {
+ Imm.setExpr(MCBinaryExpr::createSub(
+ Imm.getExpr(), MCConstantExpr::create(1, Context), Context));
+ Inst.setOpcode(Hexagon::S5_asrhub_rnd_sat);
+ }
+ break;
+ }
+
+ case Hexagon::S5_vasrhrnd_goodsyntax: {
+ MCOperand &Rdd = Inst.getOperand(0);
+ MCOperand &Rss = Inst.getOperand(1);
+ MCOperand &Imm = Inst.getOperand(2);
+ int64_t Value;
+ bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value);
+ assert(Absolute);
+ (void)Absolute;
+ if (Value == 0) {
+ MCInst TmpInst;
+ unsigned int RegPairNum = RI->getEncodingValue(Rss.getReg());
+ std::string R1 = r + llvm::utostr_32(RegPairNum + 1);
+ StringRef Reg1(R1);
+ Rss.setReg(MatchRegisterName(Reg1));
+ // Add a new operand for the second register in the pair.
+ std::string R2 = r + llvm::utostr_32(RegPairNum);
+ StringRef Reg2(R2);
+ TmpInst.setOpcode(Hexagon::A2_combinew);
+ TmpInst.addOperand(Rdd);
+ TmpInst.addOperand(Rss);
+ TmpInst.addOperand(MCOperand::createReg(MatchRegisterName(Reg2)));
+ Inst = TmpInst;
+ } else {
+ Imm.setExpr(MCBinaryExpr::createSub(
+ Imm.getExpr(), MCConstantExpr::create(1, Context), Context));
+ Inst.setOpcode(Hexagon::S5_vasrhrnd);
+ }
+ break;
+ }
+
+ case Hexagon::A2_not: {
+ MCInst TmpInst;
+ MCOperand &Rd = Inst.getOperand(0);
+ MCOperand &Rs = Inst.getOperand(1);
+ TmpInst.setOpcode(Hexagon::A2_subri);
+ TmpInst.addOperand(Rd);
+ TmpInst.addOperand(
+ MCOperand::createExpr(MCConstantExpr::create(-1, Context)));
+ TmpInst.addOperand(Rs);
+ Inst = TmpInst;
+ break;
+ }
+ } // switch
+
+ return Match_Success;
+}
diff --git a/lib/Target/Hexagon/AsmParser/LLVMBuild.txt b/lib/Target/Hexagon/AsmParser/LLVMBuild.txt
new file mode 100644
index 000000000000..fdd875b61906
--- /dev/null
+++ b/lib/Target/Hexagon/AsmParser/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./lib/Target/Hexagon/AsmParser/LLVMBuild.txt --------------*- Conf -*-===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = HexagonAsmParser
+parent = Hexagon
+required_libraries = MC MCParser Support HexagonDesc HexagonInfo
+add_to_library_groups = Hexagon
diff --git a/lib/Target/Hexagon/AsmParser/Makefile b/lib/Target/Hexagon/AsmParser/Makefile
new file mode 100644
index 000000000000..0aa0b4140c3e
--- /dev/null
+++ b/lib/Target/Hexagon/AsmParser/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/Hexagon/AsmParser/Makefile ------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMHexagonAsmParser
+
+# Hack: we need to include 'main' Hexagon target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Hexagon/BitTracker.cpp b/lib/Target/Hexagon/BitTracker.cpp
index cb7e633fb82f..ea96eb0ee10a 100644
--- a/lib/Target/Hexagon/BitTracker.cpp
+++ b/lib/Target/Hexagon/BitTracker.cpp
@@ -868,7 +868,7 @@ void BT::visitNonBranch(const MachineInstr *MI) {
continue;
bool Changed = false;
- if (!Eval || !ResMap.has(RD.Reg)) {
+ if (!Eval || ResMap.count(RD.Reg) == 0) {
// Set to "ref" (aka "bottom").
uint16_t DefBW = ME.getRegBitWidth(RD);
RegisterCell RefC = RegisterCell::self(RD.Reg, DefBW);
@@ -951,11 +951,11 @@ void BT::visitBranchesFrom(const MachineInstr *BI) {
// be processed.
for (succ_iterator I = B.succ_begin(), E = B.succ_end(); I != E; ++I) {
const MachineBasicBlock *SB = *I;
- if (SB->isLandingPad())
+ if (SB->isEHPad())
Targets.insert(SB);
}
if (FallsThrough) {
- MachineFunction::const_iterator BIt = &B;
+ MachineFunction::const_iterator BIt = B.getIterator();
MachineFunction::const_iterator Next = std::next(BIt);
if (Next != MF.end())
Targets.insert(&*Next);
@@ -1005,7 +1005,7 @@ void BT::put(RegisterRef RR, const RegisterCell &RC) {
// Replace all references to bits from OldRR with the corresponding bits
// in NewRR.
void BT::subst(RegisterRef OldRR, RegisterRef NewRR) {
- assert(Map.has(OldRR.Reg) && "OldRR not present in map");
+ assert(Map.count(OldRR.Reg) > 0 && "OldRR not present in map");
BitMask OM = ME.mask(OldRR.Reg, OldRR.Sub);
BitMask NM = ME.mask(NewRR.Reg, NewRR.Sub);
uint16_t OMB = OM.first(), OME = OM.last();
@@ -1104,9 +1104,9 @@ void BT::run() {
}
// If block end has been reached, add the fall-through edge to the queue.
if (It == End) {
- MachineFunction::const_iterator BIt = &B;
+ MachineFunction::const_iterator BIt = B.getIterator();
MachineFunction::const_iterator Next = std::next(BIt);
- if (Next != MF.end()) {
+ if (Next != MF.end() && B.isSuccessor(&*Next)) {
int ThisN = B.getNumber();
int NextN = Next->getNumber();
FlowQ.push(CFGEdge(ThisN, NextN));
diff --git a/lib/Target/Hexagon/BitTracker.h b/lib/Target/Hexagon/BitTracker.h
index ed002a794d66..959c8318fd60 100644
--- a/lib/Target/Hexagon/BitTracker.h
+++ b/lib/Target/Hexagon/BitTracker.h
@@ -36,9 +36,7 @@ struct BitTracker {
typedef SetVector<const MachineBasicBlock *> BranchTargetList;
- struct CellMapType : public std::map<unsigned,RegisterCell> {
- bool has(unsigned Reg) const;
- };
+ typedef std::map<unsigned, RegisterCell> CellMapType;
BitTracker(const MachineEvaluator &E, MachineFunction &F);
~BitTracker();
@@ -79,7 +77,6 @@ private:
// Abstraction of a reference to bit at position Pos from a register Reg.
struct BitTracker::BitRef {
BitRef(unsigned R = 0, uint16_t P = 0) : Reg(R), Pos(P) {}
- BitRef(const BitRef &BR) : Reg(BR.Reg), Pos(BR.Pos) {}
bool operator== (const BitRef &BR) const {
// If Reg is 0, disregard Pos.
return Reg == BR.Reg && (Reg == 0 || Pos == BR.Pos);
@@ -146,7 +143,6 @@ struct BitTracker::BitValue {
BitValue(ValueType T = Top) : Type(T) {}
BitValue(bool B) : Type(B ? One : Zero) {}
- BitValue(const BitValue &V) : Type(V.Type), RefI(V.RefI) {}
BitValue(unsigned Reg, uint16_t Pos) : Type(Ref), RefI(Reg, Pos) {}
bool operator== (const BitValue &V) const {
@@ -279,11 +275,6 @@ struct BitTracker::RegisterCell {
return !operator==(RC);
}
- const RegisterCell &operator=(const RegisterCell &RC) {
- Bits = RC.Bits;
- return *this;
- }
-
// Generate a "ref" cell for the corresponding register. In the resulting
// cell each bit will be described as being the same as the corresponding
// bit in register Reg (i.e. the cell is "defined" by register Reg).
@@ -344,11 +335,6 @@ BitTracker::RegisterCell::ref(const RegisterCell &C) {
return RC;
}
-
-inline bool BitTracker::CellMapType::has(unsigned Reg) const {
- return find(Reg) != end();
-}
-
// A class to evaluate target's instructions and update the cell maps.
// This is used internally by the bit tracker. A target that wants to
// utilize this should implement the evaluation functions (noted below)
diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt
index 7ab2f0ba01df..181e4e3aa85d 100644
--- a/lib/Target/Hexagon/CMakeLists.txt
+++ b/lib/Target/Hexagon/CMakeLists.txt
@@ -1,5 +1,6 @@
set(LLVM_TARGET_DEFINITIONS Hexagon.td)
+tablegen(LLVM HexagonGenAsmMatcher.inc -gen-asm-matcher)
tablegen(LLVM HexagonGenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM HexagonGenCallingConv.inc -gen-callingconv)
tablegen(LLVM HexagonGenDAGISel.inc -gen-dag-isel)
@@ -14,16 +15,19 @@ add_public_tablegen_target(HexagonCommonTableGen)
add_llvm_target(HexagonCodeGen
BitTracker.cpp
HexagonAsmPrinter.cpp
+ HexagonBitSimplify.cpp
HexagonBitTracker.cpp
HexagonCFGOptimizer.cpp
HexagonCommonGEP.cpp
HexagonCopyToCombine.cpp
+ HexagonEarlyIfConv.cpp
HexagonExpandCondsets.cpp
HexagonExpandPredSpillCode.cpp
HexagonFixupHwLoops.cpp
HexagonFrameLowering.cpp
HexagonGenExtract.cpp
HexagonGenInsert.cpp
+ HexagonGenMux.cpp
HexagonGenPredicate.cpp
HexagonHardwareLoops.cpp
HexagonInstrInfo.cpp
@@ -33,17 +37,21 @@ add_llvm_target(HexagonCodeGen
HexagonMachineScheduler.cpp
HexagonMCInstLower.cpp
HexagonNewValueJump.cpp
+ HexagonOptimizeSZextends.cpp
HexagonPeephole.cpp
HexagonRegisterInfo.cpp
- HexagonRemoveSZExtArgs.cpp
HexagonSelectionDAGInfo.cpp
HexagonSplitConst32AndConst64.cpp
+ HexagonSplitDouble.cpp
+ HexagonStoreWidening.cpp
HexagonSubtarget.cpp
HexagonTargetMachine.cpp
HexagonTargetObjectFile.cpp
+ HexagonTargetTransformInfo.cpp
HexagonVLIWPacketizer.cpp
)
+add_subdirectory(AsmParser)
add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
add_subdirectory(Disassembler)
diff --git a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index 9cc1e944d359..4a9c3413cb29 100644
--- a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -7,42 +7,45 @@
//
//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "hexagon-disassembler"
+
#include "Hexagon.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
-#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "MCTargetDesc/HexagonMCChecker.h"
#include "MCTargetDesc/HexagonMCTargetDesc.h"
-
-#include "llvm/MC/MCContext.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "MCTargetDesc/HexagonInstPrinter.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/Endian.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/MemoryObject.h"
#include "llvm/Support/raw_ostream.h"
-#include <array>
+#include "llvm/Support/TargetRegistry.h"
#include <vector>
using namespace llvm;
using namespace Hexagon;
-#define DEBUG_TYPE "hexagon-disassembler"
-
-// Pull DecodeStatus and its enum values into the global namespace.
-typedef llvm::MCDisassembler::DecodeStatus DecodeStatus;
+typedef MCDisassembler::DecodeStatus DecodeStatus;
namespace {
/// \brief Hexagon disassembler for all Hexagon platforms.
class HexagonDisassembler : public MCDisassembler {
public:
+ std::unique_ptr<MCInstrInfo const> const MCII;
std::unique_ptr<MCInst *> CurrentBundle;
- HexagonDisassembler(MCSubtargetInfo const &STI, MCContext &Ctx)
- : MCDisassembler(STI, Ctx), CurrentBundle(new MCInst *) {}
+ HexagonDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
+ MCInstrInfo const *MCII)
+ : MCDisassembler(STI, Ctx), MCII(MCII), CurrentBundle(new MCInst *) {}
DecodeStatus getSingleInstruction(MCInst &Instr, MCInst &MCB,
ArrayRef<uint8_t> Bytes, uint64_t Address,
@@ -52,23 +55,57 @@ public:
ArrayRef<uint8_t> Bytes, uint64_t Address,
raw_ostream &VStream,
raw_ostream &CStream) const override;
+
+ void adjustExtendedInstructions(MCInst &MCI, MCInst const &MCB) const;
+ void addSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) const;
};
}
-static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+// Forward declare these because the auto-generated code will reference them.
+// Definitions are further down.
+
+static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeIntRegsLow8RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeVecPredRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
- void const *Decoder);
+ const void *Decoder);
+
+static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn);
+static DecodeStatus decodeImmext(MCInst &MI, uint32_t insn,
+ void const *Decoder);
static unsigned GetSubinstOpcode(unsigned IClass, unsigned inst, unsigned &op,
raw_ostream &os);
-static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst);
+static unsigned getRegFromSubinstEncoding(unsigned encoded_reg);
+
+static DecodeStatus unsignedImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t Address, const void *Decoder);
static DecodeStatus s16ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
const void *Decoder);
static DecodeStatus s12ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
@@ -95,129 +132,19 @@ static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
const void *Decoder);
static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
const void *Decoder);
-
-static const uint16_t IntRegDecoderTable[] = {
- Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4,
- Hexagon::R5, Hexagon::R6, Hexagon::R7, Hexagon::R8, Hexagon::R9,
- Hexagon::R10, Hexagon::R11, Hexagon::R12, Hexagon::R13, Hexagon::R14,
- Hexagon::R15, Hexagon::R16, Hexagon::R17, Hexagon::R18, Hexagon::R19,
- Hexagon::R20, Hexagon::R21, Hexagon::R22, Hexagon::R23, Hexagon::R24,
- Hexagon::R25, Hexagon::R26, Hexagon::R27, Hexagon::R28, Hexagon::R29,
- Hexagon::R30, Hexagon::R31};
-
-static const uint16_t PredRegDecoderTable[] = {Hexagon::P0, Hexagon::P1,
- Hexagon::P2, Hexagon::P3};
-
-static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo,
- const uint16_t Table[], size_t Size) {
- if (RegNo < Size) {
- Inst.addOperand(MCOperand::createReg(Table[RegNo]));
- return MCDisassembler::Success;
- } else
- return MCDisassembler::Fail;
-}
-
-static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t /*Address*/,
- void const *Decoder) {
- if (RegNo > 31)
- return MCDisassembler::Fail;
-
- unsigned Register = IntRegDecoderTable[RegNo];
- Inst.addOperand(MCOperand::createReg(Register));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t /*Address*/,
- const void *Decoder) {
- static const uint16_t CtrlRegDecoderTable[] = {
- Hexagon::SA0, Hexagon::LC0, Hexagon::SA1, Hexagon::LC1,
- Hexagon::P3_0, Hexagon::NoRegister, Hexagon::C6, Hexagon::C7,
- Hexagon::USR, Hexagon::PC, Hexagon::UGP, Hexagon::GP,
- Hexagon::CS0, Hexagon::CS1, Hexagon::UPCL, Hexagon::UPCH};
-
- if (RegNo >= sizeof(CtrlRegDecoderTable) / sizeof(CtrlRegDecoderTable[0]))
- return MCDisassembler::Fail;
-
- if (CtrlRegDecoderTable[RegNo] == Hexagon::NoRegister)
- return MCDisassembler::Fail;
-
- unsigned Register = CtrlRegDecoderTable[RegNo];
- Inst.addOperand(MCOperand::createReg(Register));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t /*Address*/,
- void const *Decoder) {
- static const uint16_t CtrlReg64DecoderTable[] = {
- Hexagon::C1_0, Hexagon::NoRegister, Hexagon::C3_2,
- Hexagon::NoRegister, Hexagon::NoRegister, Hexagon::NoRegister,
- Hexagon::C7_6, Hexagon::NoRegister, Hexagon::C9_8,
- Hexagon::NoRegister, Hexagon::C11_10, Hexagon::NoRegister,
- Hexagon::CS, Hexagon::NoRegister, Hexagon::UPC,
- Hexagon::NoRegister};
-
- if (RegNo >= sizeof(CtrlReg64DecoderTable) / sizeof(CtrlReg64DecoderTable[0]))
- return MCDisassembler::Fail;
-
- if (CtrlReg64DecoderTable[RegNo] == Hexagon::NoRegister)
- return MCDisassembler::Fail;
-
- unsigned Register = CtrlReg64DecoderTable[RegNo];
- Inst.addOperand(MCOperand::createReg(Register));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t /*Address*/,
- const void *Decoder) {
- unsigned Register = 0;
- switch (RegNo) {
- case 0:
- Register = Hexagon::M0;
- break;
- case 1:
- Register = Hexagon::M1;
- break;
- default:
- return MCDisassembler::Fail;
- }
- Inst.addOperand(MCOperand::createReg(Register));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t /*Address*/,
- const void *Decoder) {
- static const uint16_t DoubleRegDecoderTable[] = {
- Hexagon::D0, Hexagon::D1, Hexagon::D2, Hexagon::D3,
- Hexagon::D4, Hexagon::D5, Hexagon::D6, Hexagon::D7,
- Hexagon::D8, Hexagon::D9, Hexagon::D10, Hexagon::D11,
- Hexagon::D12, Hexagon::D13, Hexagon::D14, Hexagon::D15};
-
- return (DecodeRegisterClass(Inst, RegNo >> 1, DoubleRegDecoderTable,
- sizeof(DoubleRegDecoderTable)));
-}
-
-static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t /*Address*/,
- void const *Decoder) {
- if (RegNo > 3)
- return MCDisassembler::Fail;
-
- unsigned Register = PredRegDecoderTable[RegNo];
- Inst.addOperand(MCOperand::createReg(Register));
- return MCDisassembler::Success;
-}
+static DecodeStatus s4_6ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus s3_6ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+ const void *Decoder);
#include "HexagonGenDisassemblerTables.inc"
-static MCDisassembler *createHexagonDisassembler(Target const &T,
- MCSubtargetInfo const &STI,
+static MCDisassembler *createHexagonDisassembler(const Target &T,
+ const MCSubtargetInfo &STI,
MCContext &Ctx) {
- return new HexagonDisassembler(STI, Ctx);
+ return new HexagonDisassembler(STI, Ctx, T.createMCInstrInfo());
}
extern "C" void LLVMInitializeHexagonDisassembler() {
@@ -235,8 +162,7 @@ DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Size = 0;
*CurrentBundle = &MI;
- MI.setOpcode(Hexagon::BUNDLE);
- MI.addOperand(MCOperand::createImm(0));
+ MI = HexagonMCInstrInfo::createBundle();
while (Result == Success && Complete == false) {
if (Bytes.size() < HEXAGON_INSTR_SIZE)
return MCDisassembler::Fail;
@@ -246,7 +172,21 @@ DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Size += HEXAGON_INSTR_SIZE;
Bytes = Bytes.slice(HEXAGON_INSTR_SIZE);
}
- return Result;
+ if(Result == MCDisassembler::Fail)
+ return Result;
+ HexagonMCChecker Checker (*MCII, STI, MI, MI, *getContext().getRegisterInfo());
+ if(!Checker.check())
+ return MCDisassembler::Fail;
+ return MCDisassembler::Success;
+}
+
+namespace {
+HexagonDisassembler const &disassembler(void const *Decoder) {
+ return *static_cast<HexagonDisassembler const *>(Decoder);
+}
+MCContext &contextFromDecoder(void const *Decoder) {
+ return disassembler(Decoder).getContext();
+}
}
DecodeStatus HexagonDisassembler::getSingleInstruction(
@@ -255,8 +195,7 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(
assert(Bytes.size() >= HEXAGON_INSTR_SIZE);
uint32_t Instruction =
- llvm::support::endian::read<uint32_t, llvm::support::little,
- llvm::support::unaligned>(Bytes.data());
+ (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0);
auto BundleSize = HexagonMCInstrInfo::bundleSize(MCB);
if ((Instruction & HexagonII::INST_PARSE_MASK) ==
@@ -360,8 +299,8 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(
MILow->setOpcode(opLow);
MCInst *MIHigh = new (getContext()) MCInst;
MIHigh->setOpcode(opHigh);
- AddSubinstOperands(MILow, opLow, instLow);
- AddSubinstOperands(MIHigh, opHigh, instHigh);
+ addSubinstOperands(MILow, opLow, instLow);
+ addSubinstOperands(MIHigh, opHigh, instHigh);
// see ConvertToSubInst() in
// lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
@@ -378,102 +317,774 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(
// Calling the auto-generated decoder function.
Result =
decodeInstruction(DecoderTable32, MI, Instruction, Address, this, STI);
+
+ // If a "standard" insn isn't found, check special cases.
+ if (MCDisassembler::Success != Result ||
+ MI.getOpcode() == Hexagon::A4_ext) {
+ Result = decodeImmext(MI, Instruction, this);
+ if (MCDisassembler::Success != Result) {
+ Result = decodeSpecial(MI, Instruction);
+ }
+ } else {
+ // If the instruction is a compound instruction, register values will
+ // follow the duplex model, so the register values in the MCInst are
+ // incorrect; loop through the operands and remap the registers
+ // appropriately.
+ if (llvm::HexagonMCInstrInfo::getType(*MCII, MI) ==
+ HexagonII::TypeCOMPOUND) {
+ for (MCInst::iterator i = MI.begin(), last = MI.end(); i < last; ++i) {
+ if (i->isReg()) {
+ unsigned reg = i->getReg() - Hexagon::R0;
+ i->setReg(getRegFromSubinstEncoding(reg));
+ }
+ }
+ }
+ }
+ }
+
+ if (HexagonMCInstrInfo::isNewValue(*MCII, MI)) {
+ unsigned OpIndex = HexagonMCInstrInfo::getNewValueOp(*MCII, MI);
+ MCOperand &MCO = MI.getOperand(OpIndex);
+ assert(MCO.isReg() && "New value consumers must be registers");
+ unsigned Register =
+ getContext().getRegisterInfo()->getEncodingValue(MCO.getReg());
+ if ((Register & 0x6) == 0)
+ // HexagonPRM 10.11 Bit 1-2 == 0 is reserved
+ return MCDisassembler::Fail;
+ unsigned Lookback = (Register & 0x6) >> 1;
+ unsigned Offset = 1;
+ bool Vector = HexagonMCInstrInfo::isVector(*MCII, MI);
+ auto Instructions = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle);
+ auto i = Instructions.end() - 1;
+ for (auto n = Instructions.begin() - 1;; --i, ++Offset) {
+ if (i == n)
+ // Couldn't find producer
+ return MCDisassembler::Fail;
+ if (Vector && !HexagonMCInstrInfo::isVector(*MCII, *i->getInst()))
+ // Skip scalars when calculating distances for vectors
+ ++Lookback;
+ if (HexagonMCInstrInfo::isImmext(*i->getInst()))
+ ++Lookback;
+ if (Offset == Lookback)
+ break;
+ }
+ auto const &Inst = *i->getInst();
+ bool SubregBit = (Register & 0x1) != 0;
+ if (SubregBit && HexagonMCInstrInfo::hasNewValue2(*MCII, Inst)) {
+ // If subreg bit is set we're selecting the second produced newvalue
+ unsigned Producer =
+ HexagonMCInstrInfo::getNewValueOperand2(*MCII, Inst).getReg();
+ assert(Producer != Hexagon::NoRegister);
+ MCO.setReg(Producer);
+ } else if (HexagonMCInstrInfo::hasNewValue(*MCII, Inst)) {
+ unsigned Producer =
+ HexagonMCInstrInfo::getNewValueOperand(*MCII, Inst).getReg();
+ if (Producer >= Hexagon::W0 && Producer <= Hexagon::W15)
+ Producer = ((Producer - Hexagon::W0) << 1) + SubregBit + Hexagon::V0;
+ else if (SubregBit)
+ // Subreg bit should not be set for non-doublevector newvalue producers
+ return MCDisassembler::Fail;
+ assert(Producer != Hexagon::NoRegister);
+ MCO.setReg(Producer);
+ } else
+ return MCDisassembler::Fail;
}
+ adjustExtendedInstructions(MI, MCB);
+ MCInst const *Extender =
+ HexagonMCInstrInfo::extenderForIndex(MCB,
+ HexagonMCInstrInfo::bundleSize(MCB));
+ if(Extender != nullptr) {
+ MCInst const & Inst = HexagonMCInstrInfo::isDuplex(*MCII, MI) ?
+ *MI.getOperand(1).getInst() : MI;
+ if (!HexagonMCInstrInfo::isExtendable(*MCII, Inst) &&
+ !HexagonMCInstrInfo::isExtended(*MCII, Inst))
+ return MCDisassembler::Fail;
+ }
return Result;
}
+void HexagonDisassembler::adjustExtendedInstructions(MCInst &MCI,
+ MCInst const &MCB) const {
+ if (!HexagonMCInstrInfo::hasExtenderForIndex(
+ MCB, HexagonMCInstrInfo::bundleSize(MCB))) {
+ unsigned opcode;
+ // This code is used by the disassembler to disambiguate between
+ // GP-relative and absolute addressing instructions, since they both have
+ // the same encoding bits. However, an absolute addressing instruction must
+ // follow an immediate extender. The disassembler always selects absolute
+ // addressing instructions first and uses this code to change them into
+ // GP-relative instructions in the absence of the corresponding immediate
+ // extender.
+ switch (MCI.getOpcode()) {
+ case Hexagon::S2_storerbabs:
+ opcode = Hexagon::S2_storerbgp;
+ break;
+ case Hexagon::S2_storerhabs:
+ opcode = Hexagon::S2_storerhgp;
+ break;
+ case Hexagon::S2_storerfabs:
+ opcode = Hexagon::S2_storerfgp;
+ break;
+ case Hexagon::S2_storeriabs:
+ opcode = Hexagon::S2_storerigp;
+ break;
+ case Hexagon::S2_storerbnewabs:
+ opcode = Hexagon::S2_storerbnewgp;
+ break;
+ case Hexagon::S2_storerhnewabs:
+ opcode = Hexagon::S2_storerhnewgp;
+ break;
+ case Hexagon::S2_storerinewabs:
+ opcode = Hexagon::S2_storerinewgp;
+ break;
+ case Hexagon::S2_storerdabs:
+ opcode = Hexagon::S2_storerdgp;
+ break;
+ case Hexagon::L4_loadrb_abs:
+ opcode = Hexagon::L2_loadrbgp;
+ break;
+ case Hexagon::L4_loadrub_abs:
+ opcode = Hexagon::L2_loadrubgp;
+ break;
+ case Hexagon::L4_loadrh_abs:
+ opcode = Hexagon::L2_loadrhgp;
+ break;
+ case Hexagon::L4_loadruh_abs:
+ opcode = Hexagon::L2_loadruhgp;
+ break;
+ case Hexagon::L4_loadri_abs:
+ opcode = Hexagon::L2_loadrigp;
+ break;
+ case Hexagon::L4_loadrd_abs:
+ opcode = Hexagon::L2_loadrdgp;
+ break;
+ default:
+ opcode = MCI.getOpcode();
+ }
+ MCI.setOpcode(opcode);
+ }
+}
+
+namespace llvm {
+extern const MCInstrDesc HexagonInsts[];
+}
+
+static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo,
+ ArrayRef<MCPhysReg> Table) {
+ if (RegNo < Table.size()) {
+ Inst.addOperand(MCOperand::createReg(Table[RegNo]));
+ return MCDisassembler::Success;
+ }
+
+ return MCDisassembler::Fail;
+}
+
+static DecodeStatus DecodeIntRegsLow8RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return DecodeIntRegsRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ static const MCPhysReg IntRegDecoderTable[] = {
+ Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4,
+ Hexagon::R5, Hexagon::R6, Hexagon::R7, Hexagon::R8, Hexagon::R9,
+ Hexagon::R10, Hexagon::R11, Hexagon::R12, Hexagon::R13, Hexagon::R14,
+ Hexagon::R15, Hexagon::R16, Hexagon::R17, Hexagon::R18, Hexagon::R19,
+ Hexagon::R20, Hexagon::R21, Hexagon::R22, Hexagon::R23, Hexagon::R24,
+ Hexagon::R25, Hexagon::R26, Hexagon::R27, Hexagon::R28, Hexagon::R29,
+ Hexagon::R30, Hexagon::R31};
+
+ return DecodeRegisterClass(Inst, RegNo, IntRegDecoderTable);
+}
+
+static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t /*Address*/,
+ const void *Decoder) {
+ static const MCPhysReg VecRegDecoderTable[] = {
+ Hexagon::V0, Hexagon::V1, Hexagon::V2, Hexagon::V3, Hexagon::V4,
+ Hexagon::V5, Hexagon::V6, Hexagon::V7, Hexagon::V8, Hexagon::V9,
+ Hexagon::V10, Hexagon::V11, Hexagon::V12, Hexagon::V13, Hexagon::V14,
+ Hexagon::V15, Hexagon::V16, Hexagon::V17, Hexagon::V18, Hexagon::V19,
+ Hexagon::V20, Hexagon::V21, Hexagon::V22, Hexagon::V23, Hexagon::V24,
+ Hexagon::V25, Hexagon::V26, Hexagon::V27, Hexagon::V28, Hexagon::V29,
+ Hexagon::V30, Hexagon::V31};
+
+ return DecodeRegisterClass(Inst, RegNo, VecRegDecoderTable);
+}
+
+static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t /*Address*/,
+ const void *Decoder) {
+ static const MCPhysReg DoubleRegDecoderTable[] = {
+ Hexagon::D0, Hexagon::D1, Hexagon::D2, Hexagon::D3,
+ Hexagon::D4, Hexagon::D5, Hexagon::D6, Hexagon::D7,
+ Hexagon::D8, Hexagon::D9, Hexagon::D10, Hexagon::D11,
+ Hexagon::D12, Hexagon::D13, Hexagon::D14, Hexagon::D15};
+
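+ // The encoded value is twice the D-register index, so halve it before
+ // indexing the table.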
+ return DecodeRegisterClass(Inst, RegNo >> 1, DoubleRegDecoderTable);
+}
+
+static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t /*Address*/,
+ const void *Decoder) {
+ static const MCPhysReg VecDblRegDecoderTable[] = {
+ Hexagon::W0, Hexagon::W1, Hexagon::W2, Hexagon::W3,
+ Hexagon::W4, Hexagon::W5, Hexagon::W6, Hexagon::W7,
+ Hexagon::W8, Hexagon::W9, Hexagon::W10, Hexagon::W11,
+ Hexagon::W12, Hexagon::W13, Hexagon::W14, Hexagon::W15};
+
+ return (DecodeRegisterClass(Inst, RegNo >> 1, VecDblRegDecoderTable));
+}
+
+static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t /*Address*/,
+ const void *Decoder) {
+ static const MCPhysReg PredRegDecoderTable[] = {Hexagon::P0, Hexagon::P1,
+ Hexagon::P2, Hexagon::P3};
+
+ return DecodeRegisterClass(Inst, RegNo, PredRegDecoderTable);
+}
+
+static DecodeStatus DecodeVecPredRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t /*Address*/,
+ const void *Decoder) {
+ static const MCPhysReg VecPredRegDecoderTable[] = {Hexagon::Q0, Hexagon::Q1,
+ Hexagon::Q2, Hexagon::Q3};
+
+ return DecodeRegisterClass(Inst, RegNo, VecPredRegDecoderTable);
+}
+
+static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t /*Address*/,
+ const void *Decoder) {
+ static const MCPhysReg CtrlRegDecoderTable[] = {
+ Hexagon::SA0, Hexagon::LC0, Hexagon::SA1, Hexagon::LC1,
+ Hexagon::P3_0, Hexagon::C5, Hexagon::C6, Hexagon::C7,
+ Hexagon::USR, Hexagon::PC, Hexagon::UGP, Hexagon::GP,
+ Hexagon::CS0, Hexagon::CS1, Hexagon::UPCL, Hexagon::UPC
+ };
+
+ if (RegNo >= array_lengthof(CtrlRegDecoderTable))
+ return MCDisassembler::Fail;
+
+ if (CtrlRegDecoderTable[RegNo] == Hexagon::NoRegister)
+ return MCDisassembler::Fail;
+
+ unsigned Register = CtrlRegDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t /*Address*/,
+ const void *Decoder) {
+ static const MCPhysReg CtrlReg64DecoderTable[] = {
+ Hexagon::C1_0, Hexagon::NoRegister,
+ Hexagon::C3_2, Hexagon::NoRegister,
+ Hexagon::C7_6, Hexagon::NoRegister,
+ Hexagon::C9_8, Hexagon::NoRegister,
+ Hexagon::C11_10, Hexagon::NoRegister,
+ Hexagon::CS, Hexagon::NoRegister,
+ Hexagon::UPC, Hexagon::NoRegister
+ };
+
+ if (RegNo >= array_lengthof(CtrlReg64DecoderTable))
+ return MCDisassembler::Fail;
+
+ if (CtrlReg64DecoderTable[RegNo] == Hexagon::NoRegister)
+ return MCDisassembler::Fail;
+
+ unsigned Register = CtrlReg64DecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t /*Address*/,
+ const void *Decoder) {
+ unsigned Register = 0;
+ switch (RegNo) {
+ case 0:
+ Register = Hexagon::M0;
+ break;
+ case 1:
+ Register = Hexagon::M1;
+ break;
+ default:
+ return MCDisassembler::Fail;
+ }
+ Inst.addOperand(MCOperand::createReg(Register));
+ return MCDisassembler::Success;
+}
+
+namespace {
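+// Reassemble the full operand value when the packet carries a constant
+// extender for this instruction: the extender supplies the upper 26 bits
+// and the extended instruction contributes only its low 6 bits. For example
+// (illustrative values), an extender operand of 0x12345680 combined with a
+// field whose low 6 bits are 0x25 yields 0x123456a5.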
+uint32_t fullValue(MCInstrInfo const &MCII,
+ MCInst &MCB,
+ MCInst &MI,
+ int64_t Value) {
+ MCInst const *Extender = HexagonMCInstrInfo::extenderForIndex(
+ MCB, HexagonMCInstrInfo::bundleSize(MCB));
+ if(!Extender || MI.size() != HexagonMCInstrInfo::getExtendableOp(MCII, MI))
+ return Value;
+ unsigned Alignment = HexagonMCInstrInfo::getExtentAlignment(MCII, MI);
+ uint32_t Lower6 = static_cast<uint32_t>(Value >> Alignment) & 0x3f;
+ int64_t Bits;
+ bool Success = Extender->getOperand(0).getExpr()->evaluateAsAbsolute(Bits);
+ assert(Success);(void)Success;
+ uint32_t Upper26 = static_cast<uint32_t>(Bits);
+ uint32_t Operand = Upper26 | Lower6;
+ return Operand;
+}
+template <size_t T>
+void signedDecoder(MCInst &MI, unsigned tmp, const void *Decoder) {
+ HexagonDisassembler const &Disassembler = disassembler(Decoder);
+ int64_t FullValue = fullValue(*Disassembler.MCII,
+ **Disassembler.CurrentBundle,
+ MI, SignExtend64<T>(tmp));
+ int64_t Extended = SignExtend64<32>(FullValue);
+ HexagonMCInstrInfo::addConstant(MI, Extended,
+ Disassembler.getContext());
+}
+}
+
+static DecodeStatus unsignedImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t /*Address*/,
+ const void *Decoder) {
+ HexagonDisassembler const &Disassembler = disassembler(Decoder);
+ int64_t FullValue = fullValue(*Disassembler.MCII,
+ **Disassembler.CurrentBundle,
+ MI, tmp);
+ assert(FullValue >= 0 && "Negative in unsigned decoder");
+ HexagonMCInstrInfo::addConstant(MI, FullValue, Disassembler.getContext());
+ return MCDisassembler::Success;
+}
+
static DecodeStatus s16ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t /*Address*/, const void *Decoder) {
- uint64_t imm = SignExtend64<16>(tmp);
- MI.addOperand(MCOperand::createImm(imm));
+ signedDecoder<16>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
static DecodeStatus s12ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t /*Address*/, const void *Decoder) {
- uint64_t imm = SignExtend64<12>(tmp);
- MI.addOperand(MCOperand::createImm(imm));
+ signedDecoder<12>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
static DecodeStatus s11_0ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t /*Address*/, const void *Decoder) {
- uint64_t imm = SignExtend64<11>(tmp);
- MI.addOperand(MCOperand::createImm(imm));
+ signedDecoder<11>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
static DecodeStatus s11_1ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t /*Address*/, const void *Decoder) {
- uint64_t imm = SignExtend64<12>(tmp);
- MI.addOperand(MCOperand::createImm(imm));
+ HexagonMCInstrInfo::addConstant(MI, SignExtend64<12>(tmp), contextFromDecoder(Decoder));
return MCDisassembler::Success;
}
static DecodeStatus s11_2ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t /*Address*/, const void *Decoder) {
- uint64_t imm = SignExtend64<13>(tmp);
- MI.addOperand(MCOperand::createImm(imm));
+ signedDecoder<13>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
static DecodeStatus s11_3ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t /*Address*/, const void *Decoder) {
- uint64_t imm = SignExtend64<14>(tmp);
- MI.addOperand(MCOperand::createImm(imm));
+ signedDecoder<14>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
static DecodeStatus s10ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t /*Address*/, const void *Decoder) {
- uint64_t imm = SignExtend64<10>(tmp);
- MI.addOperand(MCOperand::createImm(imm));
+ signedDecoder<10>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
static DecodeStatus s8ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/,
const void *Decoder) {
- uint64_t imm = SignExtend64<8>(tmp);
- MI.addOperand(MCOperand::createImm(imm));
+ signedDecoder<8>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t /*Address*/, const void *Decoder) {
- uint64_t imm = SignExtend64<6>(tmp);
- MI.addOperand(MCOperand::createImm(imm));
+ signedDecoder<6>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t /*Address*/, const void *Decoder) {
- uint64_t imm = SignExtend64<4>(tmp);
- MI.addOperand(MCOperand::createImm(imm));
+ signedDecoder<4>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t /*Address*/, const void *Decoder) {
- uint64_t imm = SignExtend64<5>(tmp);
- MI.addOperand(MCOperand::createImm(imm));
+ signedDecoder<5>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t /*Address*/, const void *Decoder) {
- uint64_t imm = SignExtend64<6>(tmp);
- MI.addOperand(MCOperand::createImm(imm));
+ signedDecoder<6>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t /*Address*/, const void *Decoder) {
- uint64_t imm = SignExtend64<7>(tmp);
- MI.addOperand(MCOperand::createImm(imm));
+ signedDecoder<7>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
+static DecodeStatus s4_6ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t /*Address*/, const void *Decoder) {
+ signedDecoder<10>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus s3_6ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t /*Address*/, const void *Decoder) {
+ signedDecoder<19>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
+
+// custom decoder for various jump/call immediates
+static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+ const void *Decoder) {
+ HexagonDisassembler const &Disassembler = disassembler(Decoder);
+ unsigned Bits = HexagonMCInstrInfo::getExtentBits(*Disassembler.MCII, MI);
+ // r13_2 is not extendable, so if there are no extent bits, it's r13_2
+ if (Bits == 0)
+ Bits = 15;
+ uint32_t FullValue = fullValue(*Disassembler.MCII,
+ **Disassembler.CurrentBundle,
+ MI, SignExtend64(tmp, Bits));
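+ // Branch targets are PC-relative; add the decode address to form an
+ // absolute target for symbolization.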
+ int64_t Extended = SignExtend64<32>(FullValue) + Address;
+ if (!Disassembler.tryAddingSymbolicOperand(MI, Extended, Address, true,
+ 0, 4))
+ HexagonMCInstrInfo::addConstant(MI, Extended, Disassembler.getContext());
+ return MCDisassembler::Success;
+}
+
+// Addressing mode dependent load store opcode map.
+// - If an insn is preceded by an extender the address is absolute.
+// - memw(##symbol) = r0
+// - If an insn is not preceded by an extender the address is GP relative.
+// - memw(gp + #symbol) = r0
+// Note that the table entries must be listed in descending order of their
+// encoding values (more specific patterns first).
+// HexagonII::INST_ICLASS_ST
+static const unsigned int StoreConditionalOpcodeData[][2] = {
+ {S4_pstorerdfnew_abs, 0xafc02084},
+ {S4_pstorerdtnew_abs, 0xafc02080},
+ {S4_pstorerdf_abs, 0xafc00084},
+ {S4_pstorerdt_abs, 0xafc00080},
+ {S4_pstorerinewfnew_abs, 0xafa03084},
+ {S4_pstorerinewtnew_abs, 0xafa03080},
+ {S4_pstorerhnewfnew_abs, 0xafa02884},
+ {S4_pstorerhnewtnew_abs, 0xafa02880},
+ {S4_pstorerbnewfnew_abs, 0xafa02084},
+ {S4_pstorerbnewtnew_abs, 0xafa02080},
+ {S4_pstorerinewf_abs, 0xafa01084},
+ {S4_pstorerinewt_abs, 0xafa01080},
+ {S4_pstorerhnewf_abs, 0xafa00884},
+ {S4_pstorerhnewt_abs, 0xafa00880},
+ {S4_pstorerbnewf_abs, 0xafa00084},
+ {S4_pstorerbnewt_abs, 0xafa00080},
+ {S4_pstorerifnew_abs, 0xaf802084},
+ {S4_pstoreritnew_abs, 0xaf802080},
+ {S4_pstorerif_abs, 0xaf800084},
+ {S4_pstorerit_abs, 0xaf800080},
+ {S4_pstorerhfnew_abs, 0xaf402084},
+ {S4_pstorerhtnew_abs, 0xaf402080},
+ {S4_pstorerhf_abs, 0xaf400084},
+ {S4_pstorerht_abs, 0xaf400080},
+ {S4_pstorerbfnew_abs, 0xaf002084},
+ {S4_pstorerbtnew_abs, 0xaf002080},
+ {S4_pstorerbf_abs, 0xaf000084},
+ {S4_pstorerbt_abs, 0xaf000080}};
+// HexagonII::INST_ICLASS_LD
+
+// HexagonII::INST_ICLASS_LD_ST_2
+static unsigned int LoadStoreOpcodeData[][2] = {{L4_loadrd_abs, 0x49c00000},
+ {L4_loadri_abs, 0x49800000},
+ {L4_loadruh_abs, 0x49600000},
+ {L4_loadrh_abs, 0x49400000},
+ {L4_loadrub_abs, 0x49200000},
+ {L4_loadrb_abs, 0x49000000},
+ {S2_storerdabs, 0x48c00000},
+ {S2_storerinewabs, 0x48a01000},
+ {S2_storerhnewabs, 0x48a00800},
+ {S2_storerbnewabs, 0x48a00000},
+ {S2_storeriabs, 0x48800000},
+ {S2_storerfabs, 0x48600000},
+ {S2_storerhabs, 0x48400000},
+ {S2_storerbabs, 0x48000000}};
+static const size_t NumCondS = array_lengthof(StoreConditionalOpcodeData);
+static const size_t NumLS = array_lengthof(LoadStoreOpcodeData);
+
+static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn) {
+
+ unsigned MachineOpcode = 0;
+ unsigned LLVMOpcode = 0;
+
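+ // A table entry matches when all of its encoding bits are set in insn;
+ // ordering the tables by descending encoding value ensures that a more
+ // specific pattern is always tried before any subset of it.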
+ if ((insn & HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_ST) {
+ for (size_t i = 0; i < NumCondS; ++i) {
+ if ((insn & StoreConditionalOpcodeData[i][1]) ==
+ StoreConditionalOpcodeData[i][1]) {
+ MachineOpcode = StoreConditionalOpcodeData[i][1];
+ LLVMOpcode = StoreConditionalOpcodeData[i][0];
+ break;
+ }
+ }
+ }
+ if ((insn & HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_LD_ST_2) {
+ for (size_t i = 0; i < NumLS; ++i) {
+ if ((insn & LoadStoreOpcodeData[i][1]) == LoadStoreOpcodeData[i][1]) {
+ MachineOpcode = LoadStoreOpcodeData[i][1];
+ LLVMOpcode = LoadStoreOpcodeData[i][0];
+ break;
+ }
+ }
+ }
+
+ if (MachineOpcode) {
+ unsigned Value = 0;
+ unsigned shift = 0;
+ MI.setOpcode(LLVMOpcode);
+ // Remove the parse bits from the insn.
+ insn &= ~HexagonII::INST_PARSE_MASK;
+
+ switch (LLVMOpcode) {
+ default:
+ return MCDisassembler::Fail;
+ break;
+
+ case Hexagon::S4_pstorerdf_abs:
+ case Hexagon::S4_pstorerdt_abs:
+ case Hexagon::S4_pstorerdfnew_abs:
+ case Hexagon::S4_pstorerdtnew_abs: {
+ // op: Pv
+ Value = insn & UINT64_C(3);
+ DecodePredRegsRegisterClass(MI, Value, 0, 0);
+ // op: u6
+ Value = (insn >> 12) & UINT64_C(48);
+ Value |= (insn >> 3) & UINT64_C(15);
+ MI.addOperand(MCOperand::createImm(Value));
+ // op: Rtt
+ Value = (insn >> 8) & UINT64_C(31);
+ DecodeDoubleRegsRegisterClass(MI, Value, 0, 0);
+ break;
+ }
+
+ case Hexagon::S4_pstorerbnewf_abs:
+ case Hexagon::S4_pstorerbnewt_abs:
+ case Hexagon::S4_pstorerbnewfnew_abs:
+ case Hexagon::S4_pstorerbnewtnew_abs:
+ case Hexagon::S4_pstorerhnewf_abs:
+ case Hexagon::S4_pstorerhnewt_abs:
+ case Hexagon::S4_pstorerhnewfnew_abs:
+ case Hexagon::S4_pstorerhnewtnew_abs:
+ case Hexagon::S4_pstorerinewf_abs:
+ case Hexagon::S4_pstorerinewt_abs:
+ case Hexagon::S4_pstorerinewfnew_abs:
+ case Hexagon::S4_pstorerinewtnew_abs: {
+ // op: Pv
+ Value = insn & UINT64_C(3);
+ DecodePredRegsRegisterClass(MI, Value, 0, 0);
+ // op: u6
+ Value = (insn >> 12) & UINT64_C(48);
+ Value |= (insn >> 3) & UINT64_C(15);
+ MI.addOperand(MCOperand::createImm(Value));
+ // op: Nt
+ Value = (insn >> 8) & UINT64_C(7);
+ DecodeIntRegsRegisterClass(MI, Value, 0, 0);
+ break;
+ }
+
+ case Hexagon::S4_pstorerbf_abs:
+ case Hexagon::S4_pstorerbt_abs:
+ case Hexagon::S4_pstorerbfnew_abs:
+ case Hexagon::S4_pstorerbtnew_abs:
+ case Hexagon::S4_pstorerhf_abs:
+ case Hexagon::S4_pstorerht_abs:
+ case Hexagon::S4_pstorerhfnew_abs:
+ case Hexagon::S4_pstorerhtnew_abs:
+ case Hexagon::S4_pstorerif_abs:
+ case Hexagon::S4_pstorerit_abs:
+ case Hexagon::S4_pstorerifnew_abs:
+ case Hexagon::S4_pstoreritnew_abs: {
+ // op: Pv
+ Value = insn & UINT64_C(3);
+ DecodePredRegsRegisterClass(MI, Value, 0, 0);
+ // op: u6
+ Value = (insn >> 12) & UINT64_C(48);
+ Value |= (insn >> 3) & UINT64_C(15);
+ MI.addOperand(MCOperand::createImm(Value));
+ // op: Rt
+ Value = (insn >> 8) & UINT64_C(31);
+ DecodeIntRegsRegisterClass(MI, Value, 0, 0);
+ break;
+ }
+
+ case Hexagon::L4_ploadrdf_abs:
+ case Hexagon::L4_ploadrdt_abs:
+ case Hexagon::L4_ploadrdfnew_abs:
+ case Hexagon::L4_ploadrdtnew_abs: {
+ // op: Rdd
+ Value = insn & UINT64_C(31);
+ DecodeDoubleRegsRegisterClass(MI, Value, 0, 0);
+ // op: Pt
+ Value = ((insn >> 9) & UINT64_C(3));
+ DecodePredRegsRegisterClass(MI, Value, 0, 0);
+ // op: u6
+ Value = ((insn >> 15) & UINT64_C(62));
+ Value |= ((insn >> 8) & UINT64_C(1));
+ MI.addOperand(MCOperand::createImm(Value));
+ break;
+ }
+
+ case Hexagon::L4_ploadrbf_abs:
+ case Hexagon::L4_ploadrbt_abs:
+ case Hexagon::L4_ploadrbfnew_abs:
+ case Hexagon::L4_ploadrbtnew_abs:
+ case Hexagon::L4_ploadrhf_abs:
+ case Hexagon::L4_ploadrht_abs:
+ case Hexagon::L4_ploadrhfnew_abs:
+ case Hexagon::L4_ploadrhtnew_abs:
+ case Hexagon::L4_ploadrubf_abs:
+ case Hexagon::L4_ploadrubt_abs:
+ case Hexagon::L4_ploadrubfnew_abs:
+ case Hexagon::L4_ploadrubtnew_abs:
+ case Hexagon::L4_ploadruhf_abs:
+ case Hexagon::L4_ploadruht_abs:
+ case Hexagon::L4_ploadruhfnew_abs:
+ case Hexagon::L4_ploadruhtnew_abs:
+ case Hexagon::L4_ploadrif_abs:
+ case Hexagon::L4_ploadrit_abs:
+ case Hexagon::L4_ploadrifnew_abs:
+ case Hexagon::L4_ploadritnew_abs:
+ // op: Rd
+ Value = insn & UINT64_C(31);
+ DecodeIntRegsRegisterClass(MI, Value, 0, 0);
+ // op: Pt
+ Value = (insn >> 9) & UINT64_C(3);
+ DecodePredRegsRegisterClass(MI, Value, 0, 0);
+ // op: u6
+ Value = (insn >> 15) & UINT64_C(62);
+ Value |= (insn >> 8) & UINT64_C(1);
+ MI.addOperand(MCOperand::createImm(Value));
+ break;
+
+ // op: g16_2
+ case (Hexagon::L4_loadri_abs):
+ ++shift;
+ // op: g16_1
+ case Hexagon::L4_loadrh_abs:
+ case Hexagon::L4_loadruh_abs:
+ ++shift;
+ // op: g16_0
+ case Hexagon::L4_loadrb_abs:
+ case Hexagon::L4_loadrub_abs: {
+ // op: Rd
+ Value |= insn & UINT64_C(31);
+ DecodeIntRegsRegisterClass(MI, Value, 0, 0);
+ Value = (insn >> 11) & UINT64_C(49152);
+ Value |= (insn >> 7) & UINT64_C(15872);
+ Value |= (insn >> 5) & UINT64_C(511);
+ MI.addOperand(MCOperand::createImm(Value << shift));
+ break;
+ }
+
+ case Hexagon::L4_loadrd_abs: {
+ Value = insn & UINT64_C(31);
+ DecodeDoubleRegsRegisterClass(MI, Value, 0, 0);
+ Value = (insn >> 11) & UINT64_C(49152);
+ Value |= (insn >> 7) & UINT64_C(15872);
+ Value |= (insn >> 5) & UINT64_C(511);
+ MI.addOperand(MCOperand::createImm(Value << 3));
+ break;
+ }
+
+ case Hexagon::S2_storerdabs: {
+ // op: g16_3
+ Value = (insn >> 11) & UINT64_C(49152);
+ Value |= (insn >> 7) & UINT64_C(15872);
+ Value |= (insn >> 5) & UINT64_C(256);
+ Value |= insn & UINT64_C(255);
+ MI.addOperand(MCOperand::createImm(Value << 3));
+ // op: Rtt
+ Value = (insn >> 8) & UINT64_C(31);
+ DecodeDoubleRegsRegisterClass(MI, Value, 0, 0);
+ break;
+ }
+
+ // op: g16_2
+ case Hexagon::S2_storerinewabs:
+ ++shift;
+ // op: g16_1
+ case Hexagon::S2_storerhnewabs:
+ ++shift;
+ // op: g16_0
+ case Hexagon::S2_storerbnewabs: {
+ Value = (insn >> 11) & UINT64_C(49152);
+ Value |= (insn >> 7) & UINT64_C(15872);
+ Value |= (insn >> 5) & UINT64_C(256);
+ Value |= insn & UINT64_C(255);
+ MI.addOperand(MCOperand::createImm(Value << shift));
+ // op: Nt
+ Value = (insn >> 8) & UINT64_C(7);
+ DecodeIntRegsRegisterClass(MI, Value, 0, 0);
+ break;
+ }
+
+ // op: g16_2
+ case Hexagon::S2_storeriabs:
+ ++shift;
+ // op: g16_1
+ case Hexagon::S2_storerhabs:
+ case Hexagon::S2_storerfabs:
+ ++shift;
+ // op: g16_0
+ case Hexagon::S2_storerbabs: {
+ Value = (insn >> 11) & UINT64_C(49152);
+ Value |= (insn >> 7) & UINT64_C(15872);
+ Value |= (insn >> 5) & UINT64_C(256);
+ Value |= insn & UINT64_C(255);
+ MI.addOperand(MCOperand::createImm(Value << shift));
+ // op: Rt
+ Value = (insn >> 8) & UINT64_C(31);
+ DecodeIntRegsRegisterClass(MI, Value, 0, 0);
+ break;
+ }
+ }
+ return MCDisassembler::Success;
+ }
+ return MCDisassembler::Fail;
+}
+
+static DecodeStatus decodeImmext(MCInst &MI, uint32_t insn,
+ void const *Decoder) {
+
+ // Instruction class for a constant extender: bits 31:28 == 0x0
+ if ((~insn & 0xf0000000) == 0xf0000000) {
+ unsigned Value;
+ // 27:16 High 12 bits of 26-bit extender.
+ Value = (insn & 0x0fff0000) << 4;
+ // 13:0 Low 14 bits of 26-bit extender.
+ Value |= ((insn & 0x3fff) << 6);
+ MI.setOpcode(Hexagon::A4_ext);
+ HexagonMCInstrInfo::addConstant(MI, Value, contextFromDecoder(Decoder));
+ return MCDisassembler::Success;
+ }
+ return MCDisassembler::Fail;
+}
+
// These values are from HexagonGenMCCodeEmitter.inc and HexagonIsetDx.td
enum subInstBinaryValues {
V4_SA1_addi_BITS = 0x0000,
@@ -731,6 +1342,8 @@ static unsigned getRegFromSubinstEncoding(unsigned encoded_reg) {
return Hexagon::R0 + encoded_reg;
else if (encoded_reg < 16)
return Hexagon::R0 + encoded_reg + 8;
+
+ // patently false value
return Hexagon::NoRegister;
}
@@ -739,10 +1352,13 @@ static unsigned getDRegFromSubinstEncoding(unsigned encoded_dreg) {
return Hexagon::D0 + encoded_dreg;
else if (encoded_dreg < 8)
return Hexagon::D0 + encoded_dreg + 4;
+
+ // patently false value
return Hexagon::NoRegister;
}
-static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) {
+void HexagonDisassembler::addSubinstOperands(MCInst *MI, unsigned opcode,
+ unsigned inst) const {
int64_t operand;
MCOperand Op;
switch (opcode) {
@@ -762,8 +1378,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) {
case Hexagon::V4_SS2_allocframe:
// u 8-4{5_3}
operand = ((inst & 0x1f0) >> 4) << 3;
- Op = MCOperand::createImm(operand);
- MI->addOperand(Op);
+ HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
break;
case Hexagon::V4_SL1_loadri_io:
// Rd 3-0, Rs 7-4, u 11-8{4_2}
@@ -774,8 +1389,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) {
Op = MCOperand::createReg(operand);
MI->addOperand(Op);
operand = (inst & 0xf00) >> 6;
- Op = MCOperand::createImm(operand);
- MI->addOperand(Op);
+ HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
break;
case Hexagon::V4_SL1_loadrub_io:
// Rd 3-0, Rs 7-4, u 11-8
@@ -786,8 +1400,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) {
Op = MCOperand::createReg(operand);
MI->addOperand(Op);
operand = (inst & 0xf00) >> 8;
- Op = MCOperand::createImm(operand);
- MI->addOperand(Op);
+ HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
break;
case Hexagon::V4_SL2_loadrb_io:
// Rd 3-0, Rs 7-4, u 10-8
@@ -798,8 +1411,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) {
Op = MCOperand::createReg(operand);
MI->addOperand(Op);
operand = (inst & 0x700) >> 8;
- Op = MCOperand::createImm(operand);
- MI->addOperand(Op);
+ HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
break;
case Hexagon::V4_SL2_loadrh_io:
case Hexagon::V4_SL2_loadruh_io:
@@ -811,8 +1423,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) {
Op = MCOperand::createReg(operand);
MI->addOperand(Op);
operand = ((inst & 0x700) >> 8) << 1;
- Op = MCOperand::createImm(operand);
- MI->addOperand(Op);
+ HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
break;
case Hexagon::V4_SL2_loadrd_sp:
// Rdd 2-0, u 7-3{5_3}
@@ -820,8 +1431,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) {
Op = MCOperand::createReg(operand);
MI->addOperand(Op);
operand = ((inst & 0x0f8) >> 3) << 3;
- Op = MCOperand::createImm(operand);
- MI->addOperand(Op);
+ HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
break;
case Hexagon::V4_SL2_loadri_sp:
// Rd 3-0, u 8-4{5_2}
@@ -829,8 +1439,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) {
Op = MCOperand::createReg(operand);
MI->addOperand(Op);
operand = ((inst & 0x1f0) >> 4) << 2;
- Op = MCOperand::createImm(operand);
- MI->addOperand(Op);
+ HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
break;
case Hexagon::V4_SA1_addi:
// Rx 3-0 (x2), s7 10-4
@@ -839,8 +1448,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) {
MI->addOperand(Op);
MI->addOperand(Op);
operand = SignExtend64<7>((inst & 0x7f0) >> 4);
- Op = MCOperand::createImm(operand);
- MI->addOperand(Op);
+ HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
break;
case Hexagon::V4_SA1_addrx:
// Rx 3-0 (x2), Rs 7-4
@@ -873,8 +1481,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) {
Op = MCOperand::createReg(operand);
MI->addOperand(Op);
operand = ((inst & 0x3f0) >> 4) << 2;
- Op = MCOperand::createImm(operand);
- MI->addOperand(Op);
+ HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
break;
case Hexagon::V4_SA1_seti:
// Rd 3-0, u 9-4
@@ -882,8 +1489,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) {
Op = MCOperand::createReg(operand);
MI->addOperand(Op);
operand = (inst & 0x3f0) >> 4;
- Op = MCOperand::createImm(operand);
- MI->addOperand(Op);
+ HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
break;
case Hexagon::V4_SA1_clrf:
case Hexagon::V4_SA1_clrfnew:
@@ -901,8 +1507,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) {
Op = MCOperand::createReg(operand);
MI->addOperand(Op);
operand = inst & 0x3;
- Op = MCOperand::createImm(operand);
- MI->addOperand(Op);
+ HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
break;
case Hexagon::V4_SA1_combine0i:
case Hexagon::V4_SA1_combine1i:
@@ -913,8 +1518,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) {
Op = MCOperand::createReg(operand);
MI->addOperand(Op);
operand = (inst & 0x060) >> 5;
- Op = MCOperand::createImm(operand);
- MI->addOperand(Op);
+ HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
break;
case Hexagon::V4_SA1_combinerz:
case Hexagon::V4_SA1_combinezr:
@@ -932,8 +1536,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) {
Op = MCOperand::createReg(operand);
MI->addOperand(Op);
operand = (inst & 0xf00) >> 8;
- Op = MCOperand::createImm(operand);
- MI->addOperand(Op);
+ HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
operand = getRegFromSubinstEncoding(inst & 0xf);
Op = MCOperand::createReg(operand);
MI->addOperand(Op);
@@ -944,8 +1547,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) {
Op = MCOperand::createReg(operand);
MI->addOperand(Op);
operand = ((inst & 0xf00) >> 8) << 2;
- Op = MCOperand::createImm(operand);
- MI->addOperand(Op);
+ HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
operand = getRegFromSubinstEncoding(inst & 0xf);
Op = MCOperand::createReg(operand);
MI->addOperand(Op);
@@ -957,8 +1559,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) {
Op = MCOperand::createReg(operand);
MI->addOperand(Op);
operand = inst & 0xf;
- Op = MCOperand::createImm(operand);
- MI->addOperand(Op);
+ HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
break;
case Hexagon::V4_SS2_storewi0:
case Hexagon::V4_SS2_storewi1:
@@ -967,25 +1568,23 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) {
Op = MCOperand::createReg(operand);
MI->addOperand(Op);
operand = (inst & 0xf) << 2;
- Op = MCOperand::createImm(operand);
- MI->addOperand(Op);
+ HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
break;
case Hexagon::V4_SS2_stored_sp:
// s 8-3{6_3}, Rtt 2-0
operand = SignExtend64<9>(((inst & 0x1f8) >> 3) << 3);
- Op = MCOperand::createImm(operand);
- MI->addOperand(Op);
+ HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
operand = getDRegFromSubinstEncoding(inst & 0x7);
Op = MCOperand::createReg(operand);
MI->addOperand(Op);
+ break;
case Hexagon::V4_SS2_storeh_io:
// Rs 7-4, u 10-8{3_1}, Rt 3-0
operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
Op = MCOperand::createReg(operand);
MI->addOperand(Op);
operand = ((inst & 0x700) >> 8) << 1;
- Op = MCOperand::createImm(operand);
- MI->addOperand(Op);
+ HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
operand = getRegFromSubinstEncoding(inst & 0xf);
Op = MCOperand::createReg(operand);
MI->addOperand(Op);
@@ -993,8 +1592,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) {
case Hexagon::V4_SS2_storew_sp:
// u 8-4{5_2}, Rd 3-0
operand = ((inst & 0x1f0) >> 4) << 2;
- Op = MCOperand::createImm(operand);
- MI->addOperand(Op);
+ HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
operand = getRegFromSubinstEncoding(inst & 0xf);
Op = MCOperand::createReg(operand);
MI->addOperand(Op);
diff --git a/lib/Target/Hexagon/Disassembler/LLVMBuild.txt b/lib/Target/Hexagon/Disassembler/LLVMBuild.txt
index 43bace75a852..6c251020f818 100644
--- a/lib/Target/Hexagon/Disassembler/LLVMBuild.txt
+++ b/lib/Target/Hexagon/Disassembler/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = HexagonDisassembler
parent = Hexagon
-required_libraries = HexagonDesc HexagonInfo MCDisassembler Support
+required_libraries = HexagonDesc HexagonInfo MC MCDisassembler Support
add_to_library_groups = Hexagon
diff --git a/lib/Target/Hexagon/Hexagon.h b/lib/Target/Hexagon/Hexagon.h
index d360be2aa5b2..ed7d9578902e 100644
--- a/lib/Target/Hexagon/Hexagon.h
+++ b/lib/Target/Hexagon/Hexagon.h
@@ -47,15 +47,8 @@
#include "llvm/Target/TargetMachine.h"
namespace llvm {
- class MachineInstr;
- class MCInst;
- class MCInstrInfo;
- class HexagonAsmPrinter;
class HexagonTargetMachine;
- void HexagonLowerToMC(const MachineInstr *MI, MCInst &MCI,
- HexagonAsmPrinter &AP);
-
/// \brief Creates a Hexagon-specific Target Transformation Info pass.
ImmutablePass *createHexagonTargetTransformInfoPass(const HexagonTargetMachine *TM);
} // end namespace llvm;
diff --git a/lib/Target/Hexagon/Hexagon.td b/lib/Target/Hexagon/Hexagon.td
index 53a687c337ec..1189cfd488ee 100644
--- a/lib/Target/Hexagon/Hexagon.td
+++ b/lib/Target/Hexagon/Hexagon.td
@@ -24,14 +24,32 @@ include "llvm/Target/Target.td"
// Hexagon Architectures
def ArchV4: SubtargetFeature<"v4", "HexagonArchVersion", "V4", "Hexagon V4">;
def ArchV5: SubtargetFeature<"v5", "HexagonArchVersion", "V5", "Hexagon V5">;
+def ArchV55: SubtargetFeature<"v55", "HexagonArchVersion", "V55", "Hexagon V55">;
+def ArchV60: SubtargetFeature<"v60", "HexagonArchVersion", "V60", "Hexagon V60">;
+
+// Hexagon ISA Extensions
+def ExtensionHVX: SubtargetFeature<"hvx", "UseHVXOps",
+ "true", "Hexagon HVX instructions">;
+def ExtensionHVXDbl: SubtargetFeature<"hvx-double", "UseHVXDblOps",
+ "true", "Hexagon HVX Double instructions">;
//===----------------------------------------------------------------------===//
// Hexagon Instruction Predicate Definitions.
//===----------------------------------------------------------------------===//
-def HasV5T : Predicate<"HST->hasV5TOps()">;
-def NoV5T : Predicate<"!HST->hasV5TOps()">;
-def UseMEMOP : Predicate<"HST->useMemOps()">;
-def IEEERndNearV5T : Predicate<"HST->modeIEEERndNear()">;
+def HasV5T : Predicate<"HST->hasV5TOps()">;
+def NoV5T : Predicate<"!HST->hasV5TOps()">;
+def HasV55T : Predicate<"HST->hasV55TOps()">,
+ AssemblerPredicate<"ArchV55">;
+def HasV60T : Predicate<"HST->hasV60TOps()">,
+ AssemblerPredicate<"ArchV60">;
+def UseMEMOP : Predicate<"HST->useMemOps()">;
+def IEEERndNearV5T : Predicate<"HST->modeIEEERndNear()">;
+def UseHVXDbl : Predicate<"HST->useHVXDblOps()">,
+ AssemblerPredicate<"ExtensionHVXDbl">;
+def UseHVXSgl : Predicate<"HST->useHVXSglOps()">;
+
+def UseHVX : Predicate<"HST->useHVXSglOps() || HST->useHVXDblOps()">,
+ AssemblerPredicate<"ExtensionHVX">;
//===----------------------------------------------------------------------===//
// Classes used for relation maps.
@@ -53,6 +71,7 @@ class NewValueRel: PredNewRel;
// NewValueRel - Filter class used to relate load/store instructions having
// different addressing modes with each other.
class AddrModeRel: NewValueRel;
+class IntrinsicsRel;
//===----------------------------------------------------------------------===//
// Generate mapping table to relate non-predicate instructions with their
@@ -62,7 +81,7 @@ class AddrModeRel: NewValueRel;
def getPredOpcode : InstrMapping {
let FilterClass = "PredRel";
// Instructions with the same BaseOpcode and isNVStore values form a row.
- let RowFields = ["BaseOpcode", "isNVStore", "PNewValue"];
+ let RowFields = ["BaseOpcode", "isNVStore", "PNewValue", "isNT"];
// Instructions with the same predicate sense form a column.
let ColFields = ["PredSense"];
// The key column is the unpredicated instructions.
@@ -77,7 +96,7 @@ def getPredOpcode : InstrMapping {
//
def getFalsePredOpcode : InstrMapping {
let FilterClass = "PredRel";
- let RowFields = ["BaseOpcode", "PNewValue", "isNVStore", "isBrTaken"];
+ let RowFields = ["BaseOpcode", "PNewValue", "isNVStore", "isBrTaken", "isNT"];
let ColFields = ["PredSense"];
let KeyCol = ["true"];
let ValueCols = [["false"]];
@@ -89,7 +108,7 @@ def getFalsePredOpcode : InstrMapping {
//
def getTruePredOpcode : InstrMapping {
let FilterClass = "PredRel";
- let RowFields = ["BaseOpcode", "PNewValue", "isNVStore", "isBrTaken"];
+ let RowFields = ["BaseOpcode", "PNewValue", "isNVStore", "isBrTaken", "isNT"];
let ColFields = ["PredSense"];
let KeyCol = ["false"];
let ValueCols = [["true"]];
@@ -125,7 +144,7 @@ def getPredOldOpcode : InstrMapping {
//
def getNewValueOpcode : InstrMapping {
let FilterClass = "NewValueRel";
- let RowFields = ["BaseOpcode", "PredSense", "PNewValue", "addrMode"];
+ let RowFields = ["BaseOpcode", "PredSense", "PNewValue", "addrMode", "isNT"];
let ColFields = ["NValueST"];
let KeyCol = ["false"];
let ValueCols = [["true"]];
@@ -137,16 +156,16 @@ def getNewValueOpcode : InstrMapping {
//
def getNonNVStore : InstrMapping {
let FilterClass = "NewValueRel";
- let RowFields = ["BaseOpcode", "PredSense", "PNewValue", "addrMode"];
+ let RowFields = ["BaseOpcode", "PredSense", "PNewValue", "addrMode", "isNT"];
let ColFields = ["NValueST"];
let KeyCol = ["true"];
let ValueCols = [["false"]];
}
-def getBasedWithImmOffset : InstrMapping {
+def getBaseWithImmOffset : InstrMapping {
let FilterClass = "AddrModeRel";
let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore",
- "isMEMri", "isFloat"];
+ "isFloat"];
let ColFields = ["addrMode"];
let KeyCol = ["Absolute"];
let ValueCols = [["BaseImmOffset"]];
@@ -168,6 +187,37 @@ def getRegForm : InstrMapping {
let ValueCols = [["reg"]];
}
+def getRegShlForm : InstrMapping {
+ let FilterClass = "ImmRegShl";
+ let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore"];
+ let ColFields = ["InputType"];
+ let KeyCol = ["imm"];
+ let ValueCols = [["reg"]];
+}
+
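+// Generate mapping tables to relate branches carrying a "taken" prediction
+// hint with their "not taken" counterparts (and vice versa below).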
+def notTakenBranchPrediction : InstrMapping {
+ let FilterClass = "PredRel";
+ let RowFields = ["BaseOpcode", "PNewValue", "PredSense", "isBranch", "isPredicated"];
+ let ColFields = ["isBrTaken"];
+ let KeyCol = ["true"];
+ let ValueCols = [["false"]];
+}
+
+def takenBranchPrediction : InstrMapping {
+ let FilterClass = "PredRel";
+ let RowFields = ["BaseOpcode", "PNewValue", "PredSense", "isBranch", "isPredicated"];
+ let ColFields = ["isBrTaken"];
+ let KeyCol = ["false"];
+ let ValueCols = [["true"]];
+}
+
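+// Generate mapping table to relate intrinsic pseudo instructions with the
+// real hardware instructions that implement them.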
+def getRealHWInstr : InstrMapping {
+ let FilterClass = "IntrinsicsRel";
+ let RowFields = ["BaseOpcode"];
+ let ColFields = ["InstrType"];
+ let KeyCol = ["Pseudo"];
+ let ValueCols = [["Pseudo"], ["Real"]];
+}
//===----------------------------------------------------------------------===//
// Register File, Calling Conv, Instruction Descriptions
//===----------------------------------------------------------------------===//
@@ -192,12 +242,22 @@ def : Proc<"hexagonv4", HexagonModelV4,
[ArchV4]>;
def : Proc<"hexagonv5", HexagonModelV4,
[ArchV4, ArchV5]>;
+def : Proc<"hexagonv55", HexagonModelV55,
+ [ArchV4, ArchV5, ArchV55]>;
+def : Proc<"hexagonv60", HexagonModelV60,
+ [ArchV4, ArchV5, ArchV55, ArchV60, ExtensionHVX]>;
//===----------------------------------------------------------------------===//
// Declare the target which we are implementing
//===----------------------------------------------------------------------===//
+def HexagonAsmParserVariant : AsmParserVariant {
+ int Variant = 0;
+ string TokenizingCharacters = "#()=:.<>!+*";
+}
+
def Hexagon : Target {
// Pull in Instruction Info:
let InstructionSet = HexagonInstrInfo;
+ let AssemblyParserVariants = [HexagonAsmParserVariant];
}
diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index 05728d2b627e..e213089687e8 100644
--- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -40,11 +40,13 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
@@ -56,12 +58,27 @@
using namespace llvm;
+namespace llvm {
+ void HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI,
+ MCInst &MCB, HexagonAsmPrinter &AP);
+}
+
#define DEBUG_TYPE "asm-printer"
static cl::opt<bool> AlignCalls(
"hexagon-align-calls", cl::Hidden, cl::init(true),
cl::desc("Insert falign after call instruction for Hexagon target"));
+// Given a scalar register return its pair.
+inline static unsigned getHexagonRegisterPair(unsigned Reg,
+ const MCRegisterInfo *RI) {
+ assert(Hexagon::IntRegsRegClass.contains(Reg));
+ MCSuperRegIterator SR(Reg, RI, false);
+ unsigned Pair = *SR;
+ assert(Hexagon::DoubleRegsRegClass.contains(Pair));
+ return Pair;
+}
+
HexagonAsmPrinter::HexagonAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer)
: AsmPrinter(TM, std::move(Streamer)), Subtarget(nullptr) {}
@@ -102,9 +119,8 @@ void HexagonAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
//
bool HexagonAsmPrinter::
isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
- if (MBB->hasAddressTaken()) {
+ if (MBB->hasAddressTaken())
return false;
- }
return AsmPrinter::isBlockOnlyReachableByFallthrough(MBB);
}
@@ -117,7 +133,8 @@ bool HexagonAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
raw_ostream &OS) {
// Does this asm operand have a single letter operand modifier?
if (ExtraCode && ExtraCode[0]) {
- if (ExtraCode[1] != 0) return true; // Unknown modifier.
+ if (ExtraCode[1] != 0)
+ return true; // Unknown modifier.
switch (ExtraCode[0]) {
default:
@@ -173,45 +190,407 @@ bool HexagonAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
return false;
}
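+// Emit an immediate or symbolic operand into a constant section (creating the
+// symbol and its initializer only once) and return the symbol that labels it.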
+MCSymbol *smallData(AsmPrinter &AP, const MachineInstr &MI,
+ MCStreamer &OutStreamer,
+ const MCOperand &Imm, int AlignSize) {
+ MCSymbol *Sym;
+ int64_t Value;
+ if (Imm.getExpr()->evaluateAsAbsolute(Value)) {
+ StringRef sectionPrefix;
+ std::string ImmString;
+ StringRef Name;
+ if (AlignSize == 8) {
+ Name = ".CONST_0000000000000000";
+ sectionPrefix = ".gnu.linkonce.l8";
+ ImmString = utohexstr(Value);
+ } else {
+ Name = ".CONST_00000000";
+ sectionPrefix = ".gnu.linkonce.l4";
+ ImmString = utohexstr(static_cast<uint32_t>(Value));
+ }
+
+ std::string symbolName = // Yes, leading zeros are kept.
+ Name.drop_back(ImmString.size()).str() + ImmString;
+ std::string sectionName = sectionPrefix.str() + symbolName;
+
+ MCSectionELF *Section = OutStreamer.getContext().getELFSection(
+ sectionName, ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+ OutStreamer.SwitchSection(Section);
+
+ Sym = AP.OutContext.getOrCreateSymbol(Twine(symbolName));
+ if (Sym->isUndefined()) {
+ OutStreamer.EmitLabel(Sym);
+ OutStreamer.EmitSymbolAttribute(Sym, MCSA_Global);
+ OutStreamer.EmitIntValue(Value, AlignSize);
+ OutStreamer.EmitCodeAlignment(AlignSize);
+ }
+ } else {
+ assert(Imm.isExpr() && "Expected expression and found none");
+ const MachineOperand &MO = MI.getOperand(1);
+ assert(MO.isGlobal() || MO.isCPI() || MO.isJTI());
+ MCSymbol *MOSymbol = nullptr;
+ if (MO.isGlobal())
+ MOSymbol = AP.getSymbol(MO.getGlobal());
+ else if (MO.isCPI())
+ MOSymbol = AP.GetCPISymbol(MO.getIndex());
+ else if (MO.isJTI())
+ MOSymbol = AP.GetJTISymbol(MO.getIndex());
+ else
+ llvm_unreachable("Unknown operand type!");
+
+ StringRef SymbolName = MOSymbol->getName();
+ std::string LitaName = ".CONST_" + SymbolName.str();
+
+ MCSectionELF *Section = OutStreamer.getContext().getELFSection(
+ ".lita", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+
+ OutStreamer.SwitchSection(Section);
+ Sym = AP.OutContext.getOrCreateSymbol(Twine(LitaName));
+ if (Sym->isUndefined()) {
+ OutStreamer.EmitLabel(Sym);
+ OutStreamer.EmitSymbolAttribute(Sym, MCSA_Local);
+ OutStreamer.EmitValue(Imm.getExpr(), AlignSize);
+ OutStreamer.EmitCodeAlignment(AlignSize);
+ }
+ }
+ return Sym;
+}
+
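+// Map pseudo and "goodsyntax" instructions onto the real MC instructions they
+// stand for before emission, e.g. selecting a _h/_l variant based on the
+// parity of the encoded register operand.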
+void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
+ const MachineInstr &MI) {
+ MCInst &MappedInst = static_cast <MCInst &>(Inst);
+ const MCRegisterInfo *RI = OutStreamer->getContext().getRegisterInfo();
+
+ switch (Inst.getOpcode()) {
+ default: return;
+
+ // "$dst = CONST64(#$src1)",
+ case Hexagon::CONST64_Float_Real:
+ case Hexagon::CONST64_Int_Real:
+ if (!OutStreamer->hasRawTextSupport()) {
+ const MCOperand &Imm = MappedInst.getOperand(1);
+ MCSectionSubPair Current = OutStreamer->getCurrentSection();
+
+ MCSymbol *Sym = smallData(*this, MI, *OutStreamer, Imm, 8);
+
+ OutStreamer->SwitchSection(Current.first, Current.second);
+ MCInst TmpInst;
+ MCOperand &Reg = MappedInst.getOperand(0);
+ TmpInst.setOpcode(Hexagon::L2_loadrdgp);
+ TmpInst.addOperand(Reg);
+ TmpInst.addOperand(MCOperand::createExpr(
+ MCSymbolRefExpr::create(Sym, OutContext)));
+ MappedInst = TmpInst;
+
+ }
+ break;
+ case Hexagon::CONST32:
+ case Hexagon::CONST32_Float_Real:
+ case Hexagon::CONST32_Int_Real:
+ case Hexagon::FCONST32_nsdata:
+ if (!OutStreamer->hasRawTextSupport()) {
+ MCOperand &Imm = MappedInst.getOperand(1);
+ MCSectionSubPair Current = OutStreamer->getCurrentSection();
+ MCSymbol *Sym = smallData(*this, MI, *OutStreamer, Imm, 4);
+ OutStreamer->SwitchSection(Current.first, Current.second);
+ MCInst TmpInst;
+ MCOperand &Reg = MappedInst.getOperand(0);
+ TmpInst.setOpcode(Hexagon::L2_loadrigp);
+ TmpInst.addOperand(Reg);
+ TmpInst.addOperand(MCOperand::createExpr(
+ MCSymbolRefExpr::create(Sym, OutContext)));
+ MappedInst = TmpInst;
+ }
+ break;
+
+ // C2_pxfer_map maps to the C2_or instruction. Although C2_or could be used
+ // during instruction selection itself, doing so results in suboptimal code.
+ case Hexagon::C2_pxfer_map: {
+ MCOperand &Ps = Inst.getOperand(1);
+ MappedInst.setOpcode(Hexagon::C2_or);
+ MappedInst.addOperand(Ps);
+ return;
+ }
+
+ // Vector reduce complex multiply by scalar, Rt & 1 map to :hi else :lo
+ // The insn is mapped from the 4 operand to the 3 operand raw form taking
+ // 3 register pairs.
+ case Hexagon::M2_vrcmpys_acc_s1: {
+ MCOperand &Rt = Inst.getOperand(3);
+ assert (Rt.isReg() && "Expected register and none was found");
+ unsigned Reg = RI->getEncodingValue(Rt.getReg());
+ if (Reg & 1)
+ MappedInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_h);
+ else
+ MappedInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_l);
+ Rt.setReg(getHexagonRegisterPair(Rt.getReg(), RI));
+ return;
+ }
+ case Hexagon::M2_vrcmpys_s1: {
+ MCOperand &Rt = Inst.getOperand(2);
+ assert (Rt.isReg() && "Expected register and none was found");
+ unsigned Reg = RI->getEncodingValue(Rt.getReg());
+ if (Reg & 1)
+ MappedInst.setOpcode(Hexagon::M2_vrcmpys_s1_h);
+ else
+ MappedInst.setOpcode(Hexagon::M2_vrcmpys_s1_l);
+ Rt.setReg(getHexagonRegisterPair(Rt.getReg(), RI));
+ return;
+ }
+
+ case Hexagon::M2_vrcmpys_s1rp: {
+ MCOperand &Rt = Inst.getOperand(2);
+ assert (Rt.isReg() && "Expected register and none was found");
+ unsigned Reg = RI->getEncodingValue(Rt.getReg());
+ if (Reg & 1)
+ MappedInst.setOpcode(Hexagon::M2_vrcmpys_s1rp_h);
+ else
+ MappedInst.setOpcode(Hexagon::M2_vrcmpys_s1rp_l);
+ Rt.setReg(getHexagonRegisterPair(Rt.getReg(), RI));
+ return;
+ }
+
+ case Hexagon::A4_boundscheck: {
+ MCOperand &Rs = Inst.getOperand(1);
+ assert (Rs.isReg() && "Expected register and none was found");
+ unsigned Reg = RI->getEncodingValue(Rs.getReg());
+ if (Reg & 1) // Odd mapped to raw:hi, regpair is rodd:odd-1, like r3:2
+ MappedInst.setOpcode(Hexagon::A4_boundscheck_hi);
+ else // raw:lo
+ MappedInst.setOpcode(Hexagon::A4_boundscheck_lo);
+ Rs.setReg(getHexagonRegisterPair(Rs.getReg(), RI));
+ return;
+ }
+ case Hexagon::S5_asrhub_rnd_sat_goodsyntax: {
+ MCOperand &MO = MappedInst.getOperand(2);
+ int64_t Imm;
+ MCExpr const *Expr = MO.getExpr();
+ bool Success = Expr->evaluateAsAbsolute(Imm);
+ assert (Success && "Expected immediate and none was found");(void)Success;
+ MCInst TmpInst;
+ if (Imm == 0) {
+ TmpInst.setOpcode(Hexagon::S2_vsathub);
+ TmpInst.addOperand(MappedInst.getOperand(0));
+ TmpInst.addOperand(MappedInst.getOperand(1));
+ MappedInst = TmpInst;
+ return;
+ }
+ TmpInst.setOpcode(Hexagon::S5_asrhub_rnd_sat);
+ TmpInst.addOperand(MappedInst.getOperand(0));
+ TmpInst.addOperand(MappedInst.getOperand(1));
+ const MCExpr *One = MCConstantExpr::create(1, OutContext);
+ const MCExpr *Sub = MCBinaryExpr::createSub(Expr, One, OutContext);
+ TmpInst.addOperand(MCOperand::createExpr(Sub));
+ MappedInst = TmpInst;
+ return;
+ }
+ case Hexagon::S5_vasrhrnd_goodsyntax:
+ case Hexagon::S2_asr_i_p_rnd_goodsyntax: {
+ MCOperand &MO2 = MappedInst.getOperand(2);
+ MCExpr const *Expr = MO2.getExpr();
+ int64_t Imm;
+ bool Success = Expr->evaluateAsAbsolute(Imm);
+ assert (Success && "Expected immediate and none was found");(void)Success;
+ MCInst TmpInst;
+ if (Imm == 0) {
+ TmpInst.setOpcode(Hexagon::A2_combinew);
+ TmpInst.addOperand(MappedInst.getOperand(0));
+ MCOperand &MO1 = MappedInst.getOperand(1);
+ unsigned High = RI->getSubReg(MO1.getReg(), Hexagon::subreg_hireg);
+ unsigned Low = RI->getSubReg(MO1.getReg(), Hexagon::subreg_loreg);
+ // Add a new operand for the second register in the pair.
+ TmpInst.addOperand(MCOperand::createReg(High));
+ TmpInst.addOperand(MCOperand::createReg(Low));
+ MappedInst = TmpInst;
+ return;
+ }
+
+ if (Inst.getOpcode() == Hexagon::S2_asr_i_p_rnd_goodsyntax)
+ TmpInst.setOpcode(Hexagon::S2_asr_i_p_rnd);
+ else
+ TmpInst.setOpcode(Hexagon::S5_vasrhrnd);
+ TmpInst.addOperand(MappedInst.getOperand(0));
+ TmpInst.addOperand(MappedInst.getOperand(1));
+ const MCExpr *One = MCConstantExpr::create(1, OutContext);
+ const MCExpr *Sub = MCBinaryExpr::createSub(Expr, One, OutContext);
+ TmpInst.addOperand(MCOperand::createExpr(Sub));
+ MappedInst = TmpInst;
+ return;
+ }
+ // If "#u5==0" the assembler maps this to "Rd=Rs"; otherwise to
+ // "Rd=asr(Rs,#u5-1):rnd".
+ case Hexagon::S2_asr_i_r_rnd_goodsyntax: {
+ MCOperand &MO = Inst.getOperand(2);
+ MCExpr const *Expr = MO.getExpr();
+ int64_t Imm;
+ bool Success = Expr->evaluateAsAbsolute(Imm);
+ assert (Success && "Expected immediate and none was found");(void)Success;
+ MCInst TmpInst;
+ if (Imm == 0) {
+ TmpInst.setOpcode(Hexagon::A2_tfr);
+ TmpInst.addOperand(MappedInst.getOperand(0));
+ TmpInst.addOperand(MappedInst.getOperand(1));
+ MappedInst = TmpInst;
+ return;
+ }
+ TmpInst.setOpcode(Hexagon::S2_asr_i_r_rnd);
+ TmpInst.addOperand(MappedInst.getOperand(0));
+ TmpInst.addOperand(MappedInst.getOperand(1));
+ const MCExpr *One = MCConstantExpr::create(1, OutContext);
+ const MCExpr *Sub = MCBinaryExpr::createSub(Expr, One, OutContext);
+ TmpInst.addOperand(MCOperand::createExpr(Sub));
+ MappedInst = TmpInst;
+ return;
+ }
+ case Hexagon::TFRI_f:
+ MappedInst.setOpcode(Hexagon::A2_tfrsi);
+ return;
+ case Hexagon::TFRI_cPt_f:
+ MappedInst.setOpcode(Hexagon::C2_cmoveit);
+ return;
+ case Hexagon::TFRI_cNotPt_f:
+ MappedInst.setOpcode(Hexagon::C2_cmoveif);
+ return;
+ case Hexagon::MUX_ri_f:
+ MappedInst.setOpcode(Hexagon::C2_muxri);
+ return;
+ case Hexagon::MUX_ir_f:
+ MappedInst.setOpcode(Hexagon::C2_muxir);
+ return;
+
+ // Translate a "$Rdd = #imm" to "$Rdd = combine(#[-1,0], #imm)"
+ case Hexagon::A2_tfrpi: {
+ MCInst TmpInst;
+ MCOperand &Rdd = MappedInst.getOperand(0);
+ MCOperand &MO = MappedInst.getOperand(1);
+
+ TmpInst.setOpcode(Hexagon::A2_combineii);
+ TmpInst.addOperand(Rdd);
+ int64_t Imm;
+ bool Success = MO.getExpr()->evaluateAsAbsolute(Imm);
+ if (Success && Imm < 0) {
+ const MCExpr *MOne = MCConstantExpr::create(-1, OutContext);
+ TmpInst.addOperand(MCOperand::createExpr(MOne));
+ } else {
+ const MCExpr *Zero = MCConstantExpr::create(0, OutContext);
+ TmpInst.addOperand(MCOperand::createExpr(Zero));
+ }
+ TmpInst.addOperand(MO);
+ MappedInst = TmpInst;
+ return;
+ }
+ // Translate a "$Rdd = $Rss" to "$Rdd = combine($Rs, $Rt)"
+ case Hexagon::A2_tfrp: {
+ MCOperand &MO = MappedInst.getOperand(1);
+ unsigned High = RI->getSubReg(MO.getReg(), Hexagon::subreg_hireg);
+ unsigned Low = RI->getSubReg(MO.getReg(), Hexagon::subreg_loreg);
+ MO.setReg(High);
+ // Add a new operand for the second register in the pair.
+ MappedInst.addOperand(MCOperand::createReg(Low));
+ MappedInst.setOpcode(Hexagon::A2_combinew);
+ return;
+ }
+
+ case Hexagon::A2_tfrpt:
+ case Hexagon::A2_tfrpf: {
+ MCOperand &MO = MappedInst.getOperand(2);
+ unsigned High = RI->getSubReg(MO.getReg(), Hexagon::subreg_hireg);
+ unsigned Low = RI->getSubReg(MO.getReg(), Hexagon::subreg_loreg);
+ MO.setReg(High);
+ // Add a new operand for the second register in the pair.
+ MappedInst.addOperand(MCOperand::createReg(Low));
+ MappedInst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrpt)
+ ? Hexagon::C2_ccombinewt
+ : Hexagon::C2_ccombinewf);
+ return;
+ }
+ case Hexagon::A2_tfrptnew:
+ case Hexagon::A2_tfrpfnew: {
+ MCOperand &MO = MappedInst.getOperand(2);
+ unsigned High = RI->getSubReg(MO.getReg(), Hexagon::subreg_hireg);
+ unsigned Low = RI->getSubReg(MO.getReg(), Hexagon::subreg_loreg);
+ MO.setReg(High);
+ // Add a new operand for the second register in the pair.
+ MappedInst.addOperand(MCOperand::createReg(Low));
+ MappedInst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrptnew)
+ ? Hexagon::C2_ccombinewnewt
+ : Hexagon::C2_ccombinewnewf);
+ return;
+ }
+
+ case Hexagon::M2_mpysmi: {
+ MCOperand &Imm = MappedInst.getOperand(2);
+ MCExpr const *Expr = Imm.getExpr();
+ int64_t Value;
+ bool Success = Expr->evaluateAsAbsolute(Value);
+ assert(Success);(void)Success;
+ if (Value < 0 && Value > -256) {
+ MappedInst.setOpcode(Hexagon::M2_mpysin);
+ Imm.setExpr(MCUnaryExpr::createMinus(Expr, OutContext));
+ }
+ else
+ MappedInst.setOpcode(Hexagon::M2_mpysip);
+ return;
+ }
+
+ case Hexagon::A2_addsp: {
+ MCOperand &Rt = Inst.getOperand(1);
+ assert (Rt.isReg() && "Expected register and none was found");
+ unsigned Reg = RI->getEncodingValue(Rt.getReg());
+ if (Reg & 1)
+ MappedInst.setOpcode(Hexagon::A2_addsph);
+ else
+ MappedInst.setOpcode(Hexagon::A2_addspl);
+ Rt.setReg(getHexagonRegisterPair(Rt.getReg(), RI));
+ return;
+ }
+ case Hexagon::HEXAGON_V6_vd0_pseudo:
+ case Hexagon::HEXAGON_V6_vd0_pseudo_128B: {
+ MCInst TmpInst;
+ assert (Inst.getOperand(0).isReg() &&
+ "Expected register and none was found");
+
+ TmpInst.setOpcode(Hexagon::V6_vxor);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(0));
+ MappedInst = TmpInst;
+ return;
+ }
+
+ }
+}
+
/// EmitInstruction -- Print out a single Hexagon MI (or bundle) to the current
/// output stream.
///
void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) {
- MCInst MCB;
- MCB.setOpcode(Hexagon::BUNDLE);
- MCB.addOperand(MCOperand::createImm(0));
+ MCInst MCB = HexagonMCInstrInfo::createBundle();
+ const MCInstrInfo &MCII = *Subtarget->getInstrInfo();
if (MI->isBundle()) {
const MachineBasicBlock* MBB = MI->getParent();
- MachineBasicBlock::const_instr_iterator MII = MI;
+ MachineBasicBlock::const_instr_iterator MII = MI->getIterator();
unsigned IgnoreCount = 0;
- for (++MII; MII != MBB->end() && MII->isInsideBundle(); ++MII) {
+ for (++MII; MII != MBB->instr_end() && MII->isInsideBundle(); ++MII)
if (MII->getOpcode() == TargetOpcode::DBG_VALUE ||
MII->getOpcode() == TargetOpcode::IMPLICIT_DEF)
++IgnoreCount;
- else {
- HexagonLowerToMC(MII, MCB, *this);
- }
- }
+ else
+ HexagonLowerToMC(MCII, &*MII, MCB, *this);
}
- else {
- HexagonLowerToMC(MI, MCB, *this);
- HexagonMCInstrInfo::padEndloop(MCB);
- }
- // Examine the packet and try to find instructions that can be converted
- // to compounds.
- HexagonMCInstrInfo::tryCompound(*Subtarget->getInstrInfo(),
- OutStreamer->getContext(), MCB);
- // Examine the packet and convert pairs of instructions to duplex
- // instructions when possible.
- SmallVector<DuplexCandidate, 8> possibleDuplexes;
- possibleDuplexes = HexagonMCInstrInfo::getDuplexPossibilties(
- *Subtarget->getInstrInfo(), MCB);
- HexagonMCShuffle(*Subtarget->getInstrInfo(), *Subtarget,
- OutStreamer->getContext(), MCB, possibleDuplexes);
- EmitToStreamer(*OutStreamer, MCB);
+ else
+ HexagonLowerToMC(MCII, MI, MCB, *this);
+
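+ // canonicalizePacket performs the packet finalization (compounding,
+ // duplexing, padding and shuffling) that was previously done inline here.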
+ bool Ok = HexagonMCInstrInfo::canonicalizePacket(
+ MCII, *Subtarget, OutStreamer->getContext(), MCB, nullptr);
+ assert(Ok);
+ (void)Ok;
+ if(HexagonMCInstrInfo::bundleSize(MCB) == 0)
+ return;
+ OutStreamer->EmitInstruction(MCB, getSubtargetInfo());
}
extern "C" void LLVMInitializeHexagonAsmPrinter() {
diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.h b/lib/Target/Hexagon/HexagonAsmPrinter.h
index 792fc8b7af3a..a78d97e28427 100755
--- a/lib/Target/Hexagon/HexagonAsmPrinter.h
+++ b/lib/Target/Hexagon/HexagonAsmPrinter.h
@@ -42,6 +42,10 @@ namespace llvm {
void EmitInstruction(const MachineInstr *MI) override;
+ void HexagonProcessInstruction(MCInst &Inst,
+ const MachineInstr &MI);
+
+
void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
unsigned AsmVariant, const char *ExtraCode,
diff --git a/lib/Target/Hexagon/HexagonBitSimplify.cpp b/lib/Target/Hexagon/HexagonBitSimplify.cpp
new file mode 100644
index 000000000000..77907b054d54
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -0,0 +1,2778 @@
+//===--- HexagonBitSimplify.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexbit"
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonBitTracker.h"
+
+using namespace llvm;
+
+namespace llvm {
+ void initializeHexagonBitSimplifyPass(PassRegistry& Registry);
+ FunctionPass *createHexagonBitSimplify();
+}
+
+namespace {
+ // Set of virtual registers, based on BitVector.
+ struct RegisterSet : private BitVector {
+ RegisterSet() : BitVector() {}
+ explicit RegisterSet(unsigned s, bool t = false) : BitVector(s, t) {}
+ RegisterSet(const RegisterSet &RS) : BitVector(RS) {}
+
+ using BitVector::clear;
+ using BitVector::count;
+
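+    // find_first/find_next return 0 when the set is exhausted. A virtual
+    // register number is never 0, so 0 can safely act as the sentinel.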
+ unsigned find_first() const {
+ int First = BitVector::find_first();
+ if (First < 0)
+ return 0;
+ return x2v(First);
+ }
+
+ unsigned find_next(unsigned Prev) const {
+ int Next = BitVector::find_next(v2x(Prev));
+ if (Next < 0)
+ return 0;
+ return x2v(Next);
+ }
+
+ RegisterSet &insert(unsigned R) {
+ unsigned Idx = v2x(R);
+ ensure(Idx);
+ return static_cast<RegisterSet&>(BitVector::set(Idx));
+ }
+ RegisterSet &remove(unsigned R) {
+ unsigned Idx = v2x(R);
+ if (Idx >= size())
+ return *this;
+ return static_cast<RegisterSet&>(BitVector::reset(Idx));
+ }
+
+ RegisterSet &insert(const RegisterSet &Rs) {
+ return static_cast<RegisterSet&>(BitVector::operator|=(Rs));
+ }
+ RegisterSet &remove(const RegisterSet &Rs) {
+ return static_cast<RegisterSet&>(BitVector::reset(Rs));
+ }
+
+ reference operator[](unsigned R) {
+ unsigned Idx = v2x(R);
+ ensure(Idx);
+ return BitVector::operator[](Idx);
+ }
+ bool operator[](unsigned R) const {
+ unsigned Idx = v2x(R);
+ assert(Idx < size());
+ return BitVector::operator[](Idx);
+ }
+ bool has(unsigned R) const {
+ unsigned Idx = v2x(R);
+ if (Idx >= size())
+ return false;
+ return BitVector::test(Idx);
+ }
+
+ bool empty() const {
+ return !BitVector::any();
+ }
+ bool includes(const RegisterSet &Rs) const {
+ // A.BitVector::test(B) <=> A-B != {}
+ return !Rs.BitVector::test(*this);
+ }
+ bool intersects(const RegisterSet &Rs) const {
+ return BitVector::anyCommon(Rs);
+ }
+
+ private:
+ void ensure(unsigned Idx) {
+ if (size() <= Idx)
+ resize(std::max(Idx+1, 32U));
+ }
+ static inline unsigned v2x(unsigned v) {
+ return TargetRegisterInfo::virtReg2Index(v);
+ }
+ static inline unsigned x2v(unsigned x) {
+ return TargetRegisterInfo::index2VirtReg(x);
+ }
+ };
+
+
+ struct PrintRegSet {
+ PrintRegSet(const RegisterSet &S, const TargetRegisterInfo *RI)
+ : RS(S), TRI(RI) {}
+ friend raw_ostream &operator<< (raw_ostream &OS,
+ const PrintRegSet &P);
+ private:
+ const RegisterSet &RS;
+ const TargetRegisterInfo *TRI;
+ };
+
+ raw_ostream &operator<< (raw_ostream &OS, const PrintRegSet &P)
+ LLVM_ATTRIBUTE_UNUSED;
+ raw_ostream &operator<< (raw_ostream &OS, const PrintRegSet &P) {
+ OS << '{';
+ for (unsigned R = P.RS.find_first(); R; R = P.RS.find_next(R))
+ OS << ' ' << PrintReg(R, P.TRI);
+ OS << " }";
+ return OS;
+ }
+}
+
+
+namespace {
+ class Transformation;
+
+ class HexagonBitSimplify : public MachineFunctionPass {
+ public:
+ static char ID;
+ HexagonBitSimplify() : MachineFunctionPass(ID), MDT(0) {
+ initializeHexagonBitSimplifyPass(*PassRegistry::getPassRegistry());
+ }
+ virtual const char *getPassName() const {
+ return "Hexagon bit simplification";
+ }
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ static void getInstrDefs(const MachineInstr &MI, RegisterSet &Defs);
+ static void getInstrUses(const MachineInstr &MI, RegisterSet &Uses);
+ static bool isEqual(const BitTracker::RegisterCell &RC1, uint16_t B1,
+ const BitTracker::RegisterCell &RC2, uint16_t B2, uint16_t W);
+ static bool isConst(const BitTracker::RegisterCell &RC, uint16_t B,
+ uint16_t W);
+ static bool isZero(const BitTracker::RegisterCell &RC, uint16_t B,
+ uint16_t W);
+ static bool getConst(const BitTracker::RegisterCell &RC, uint16_t B,
+ uint16_t W, uint64_t &U);
+ static bool replaceReg(unsigned OldR, unsigned NewR,
+ MachineRegisterInfo &MRI);
+ static bool getSubregMask(const BitTracker::RegisterRef &RR,
+ unsigned &Begin, unsigned &Width, MachineRegisterInfo &MRI);
+ static bool replaceRegWithSub(unsigned OldR, unsigned NewR,
+ unsigned NewSR, MachineRegisterInfo &MRI);
+ static bool replaceSubWithSub(unsigned OldR, unsigned OldSR,
+ unsigned NewR, unsigned NewSR, MachineRegisterInfo &MRI);
+ static bool parseRegSequence(const MachineInstr &I,
+ BitTracker::RegisterRef &SL, BitTracker::RegisterRef &SH);
+
+ static bool getUsedBitsInStore(unsigned Opc, BitVector &Bits,
+ uint16_t Begin);
+ static bool getUsedBits(unsigned Opc, unsigned OpN, BitVector &Bits,
+ uint16_t Begin, const HexagonInstrInfo &HII);
+
+ static const TargetRegisterClass *getFinalVRegClass(
+ const BitTracker::RegisterRef &RR, MachineRegisterInfo &MRI);
+ static bool isTransparentCopy(const BitTracker::RegisterRef &RD,
+ const BitTracker::RegisterRef &RS, MachineRegisterInfo &MRI);
+
+ private:
+ MachineDominatorTree *MDT;
+
+ bool visitBlock(MachineBasicBlock &B, Transformation &T, RegisterSet &AVs);
+ };
+
+ char HexagonBitSimplify::ID = 0;
+ typedef HexagonBitSimplify HBS;
+
+
+ // The purpose of this class is to provide a common facility to traverse
+ // the function top-down or bottom-up via the dominator tree, and keep
+ // track of the available registers.
+ class Transformation {
+ public:
+ bool TopDown;
+ Transformation(bool TD) : TopDown(TD) {}
+ virtual bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) = 0;
+ virtual ~Transformation() {}
+ };
+}
+
+INITIALIZE_PASS_BEGIN(HexagonBitSimplify, "hexbit",
+ "Hexagon bit simplification", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(HexagonBitSimplify, "hexbit",
+ "Hexagon bit simplification", false, false)
+
+
+bool HexagonBitSimplify::visitBlock(MachineBasicBlock &B, Transformation &T,
+ RegisterSet &AVs) {
+ MachineDomTreeNode *N = MDT->getNode(&B);
+ typedef GraphTraits<MachineDomTreeNode*> GTN;
+ bool Changed = false;
+
+ if (T.TopDown)
+ Changed = T.processBlock(B, AVs);
+
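+  // Collect the registers defined in this block; together with the incoming
+  // AVs they become the set of registers available in all dominated blocks.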
+ RegisterSet Defs;
+ for (auto &I : B)
+ getInstrDefs(I, Defs);
+ RegisterSet NewAVs = AVs;
+ NewAVs.insert(Defs);
+
+ for (auto I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I) {
+ MachineBasicBlock *SB = (*I)->getBlock();
+ Changed |= visitBlock(*SB, T, NewAVs);
+ }
+ if (!T.TopDown)
+ Changed |= T.processBlock(B, AVs);
+
+ return Changed;
+}
+
+//
+// Utility functions:
+//
+void HexagonBitSimplify::getInstrDefs(const MachineInstr &MI,
+ RegisterSet &Defs) {
+ for (auto &Op : MI.operands()) {
+ if (!Op.isReg() || !Op.isDef())
+ continue;
+ unsigned R = Op.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(R))
+ continue;
+ Defs.insert(R);
+ }
+}
+
+void HexagonBitSimplify::getInstrUses(const MachineInstr &MI,
+ RegisterSet &Uses) {
+ for (auto &Op : MI.operands()) {
+ if (!Op.isReg() || !Op.isUse())
+ continue;
+ unsigned R = Op.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(R))
+ continue;
+ Uses.insert(R);
+ }
+}
+
+// Check if the W bits of RC1 starting at B1 are equal to the W bits of RC2
+// starting at B2.
+bool HexagonBitSimplify::isEqual(const BitTracker::RegisterCell &RC1,
+ uint16_t B1, const BitTracker::RegisterCell &RC2, uint16_t B2,
+ uint16_t W) {
+ for (uint16_t i = 0; i < W; ++i) {
+ // If RC1[i] is "bottom", it cannot be proven equal to RC2[i].
+ if (RC1[B1+i].Type == BitTracker::BitValue::Ref && RC1[B1+i].RefI.Reg == 0)
+ return false;
+ // Same for RC2[i].
+ if (RC2[B2+i].Type == BitTracker::BitValue::Ref && RC2[B2+i].RefI.Reg == 0)
+ return false;
+ if (RC1[B1+i] != RC2[B2+i])
+ return false;
+ }
+ return true;
+}
+
+
+bool HexagonBitSimplify::isConst(const BitTracker::RegisterCell &RC,
+ uint16_t B, uint16_t W) {
+ assert(B < RC.width() && B+W <= RC.width());
+ for (uint16_t i = B; i < B+W; ++i)
+ if (!RC[i].num())
+ return false;
+ return true;
+}
+
+
+bool HexagonBitSimplify::isZero(const BitTracker::RegisterCell &RC,
+ uint16_t B, uint16_t W) {
+ assert(B < RC.width() && B+W <= RC.width());
+ for (uint16_t i = B; i < B+W; ++i)
+ if (!RC[i].is(0))
+ return false;
+ return true;
+}
+
+
+bool HexagonBitSimplify::getConst(const BitTracker::RegisterCell &RC,
+ uint16_t B, uint16_t W, uint64_t &U) {
+ assert(B < RC.width() && B+W <= RC.width());
+ int64_t T = 0;
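+  // Accumulate the bits starting from the most significant one, so that bit
+  // B+W-1 ends up in the highest position of the result.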
+ for (uint16_t i = B+W; i > B; --i) {
+ const BitTracker::BitValue &BV = RC[i-1];
+ T <<= 1;
+ if (BV.is(1))
+ T |= 1;
+ else if (!BV.is(0))
+ return false;
+ }
+ U = T;
+ return true;
+}
+
+
+bool HexagonBitSimplify::replaceReg(unsigned OldR, unsigned NewR,
+ MachineRegisterInfo &MRI) {
+ if (!TargetRegisterInfo::isVirtualRegister(OldR) ||
+ !TargetRegisterInfo::isVirtualRegister(NewR))
+ return false;
+ auto Begin = MRI.use_begin(OldR), End = MRI.use_end();
+ decltype(End) NextI;
+ for (auto I = Begin; I != End; I = NextI) {
+ NextI = std::next(I);
+ I->setReg(NewR);
+ }
+ return Begin != End;
+}
+
+
+bool HexagonBitSimplify::replaceRegWithSub(unsigned OldR, unsigned NewR,
+ unsigned NewSR, MachineRegisterInfo &MRI) {
+ if (!TargetRegisterInfo::isVirtualRegister(OldR) ||
+ !TargetRegisterInfo::isVirtualRegister(NewR))
+ return false;
+ auto Begin = MRI.use_begin(OldR), End = MRI.use_end();
+ decltype(End) NextI;
+ for (auto I = Begin; I != End; I = NextI) {
+ NextI = std::next(I);
+ I->setReg(NewR);
+ I->setSubReg(NewSR);
+ }
+ return Begin != End;
+}
+
+
+bool HexagonBitSimplify::replaceSubWithSub(unsigned OldR, unsigned OldSR,
+ unsigned NewR, unsigned NewSR, MachineRegisterInfo &MRI) {
+ if (!TargetRegisterInfo::isVirtualRegister(OldR) ||
+ !TargetRegisterInfo::isVirtualRegister(NewR))
+ return false;
+ auto Begin = MRI.use_begin(OldR), End = MRI.use_end();
+ decltype(End) NextI;
+ for (auto I = Begin; I != End; I = NextI) {
+ NextI = std::next(I);
+ if (I->getSubReg() != OldSR)
+ continue;
+ I->setReg(NewR);
+ I->setSubReg(NewSR);
+ }
+ return Begin != End;
+}
+
+
+// For a register ref (pair Reg:Sub), set Begin to the position of the LSB
+// of Sub in Reg, and set Width to the size of Sub in bits. Return true
+// if this succeeded, otherwise return false.
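+// For example, for a DoubleRegs register, subreg_hireg gives Begin = 32 and
+// Width = 32, while subreg_loreg gives Begin = 0 and Width = 32.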
+bool HexagonBitSimplify::getSubregMask(const BitTracker::RegisterRef &RR,
+ unsigned &Begin, unsigned &Width, MachineRegisterInfo &MRI) {
+ const TargetRegisterClass *RC = MRI.getRegClass(RR.Reg);
+ if (RC == &Hexagon::IntRegsRegClass) {
+ assert(RR.Sub == 0);
+ Begin = 0;
+ Width = 32;
+ return true;
+ }
+ if (RC == &Hexagon::DoubleRegsRegClass) {
+ if (RR.Sub == 0) {
+ Begin = 0;
+ Width = 64;
+ return true;
+ }
+ assert(RR.Sub == Hexagon::subreg_loreg || RR.Sub == Hexagon::subreg_hireg);
+ Width = 32;
+ Begin = (RR.Sub == Hexagon::subreg_loreg ? 0 : 32);
+ return true;
+ }
+ return false;
+}
+
+
+// For a REG_SEQUENCE, set SL to the low subregister and SH to the high
+// subregister.
+bool HexagonBitSimplify::parseRegSequence(const MachineInstr &I,
+ BitTracker::RegisterRef &SL, BitTracker::RegisterRef &SH) {
+ assert(I.getOpcode() == TargetOpcode::REG_SEQUENCE);
+ unsigned Sub1 = I.getOperand(2).getImm(), Sub2 = I.getOperand(4).getImm();
+ assert(Sub1 != Sub2);
+ if (Sub1 == Hexagon::subreg_loreg && Sub2 == Hexagon::subreg_hireg) {
+ SL = I.getOperand(1);
+ SH = I.getOperand(3);
+ return true;
+ }
+ if (Sub1 == Hexagon::subreg_hireg && Sub2 == Hexagon::subreg_loreg) {
+ SH = I.getOperand(1);
+ SL = I.getOperand(3);
+ return true;
+ }
+ return false;
+}
+
+
+// All stores (except 64-bit stores) take a 32-bit register as the source
+// of the value to be stored. If the instruction stores into a location
+// that is shorter than 32 bits, some bits of the source register are not
+// used. For each store instruction, calculate the set of used bits in
+// the source register, and set appropriate bits in Bits. Return true if
+// the bits are calculated, false otherwise.
+bool HexagonBitSimplify::getUsedBitsInStore(unsigned Opc, BitVector &Bits,
+ uint16_t Begin) {
+ using namespace Hexagon;
+
+ switch (Opc) {
+ // Store byte
+ case S2_storerb_io: // memb(Rs32+#s11:0)=Rt32
+ case S2_storerbnew_io: // memb(Rs32+#s11:0)=Nt8.new
+ case S2_pstorerbt_io: // if (Pv4) memb(Rs32+#u6:0)=Rt32
+ case S2_pstorerbf_io: // if (!Pv4) memb(Rs32+#u6:0)=Rt32
+ case S4_pstorerbtnew_io: // if (Pv4.new) memb(Rs32+#u6:0)=Rt32
+ case S4_pstorerbfnew_io: // if (!Pv4.new) memb(Rs32+#u6:0)=Rt32
+ case S2_pstorerbnewt_io: // if (Pv4) memb(Rs32+#u6:0)=Nt8.new
+ case S2_pstorerbnewf_io: // if (!Pv4) memb(Rs32+#u6:0)=Nt8.new
+ case S4_pstorerbnewtnew_io: // if (Pv4.new) memb(Rs32+#u6:0)=Nt8.new
+ case S4_pstorerbnewfnew_io: // if (!Pv4.new) memb(Rs32+#u6:0)=Nt8.new
+ case S2_storerb_pi: // memb(Rx32++#s4:0)=Rt32
+ case S2_storerbnew_pi: // memb(Rx32++#s4:0)=Nt8.new
+ case S2_pstorerbt_pi: // if (Pv4) memb(Rx32++#s4:0)=Rt32
+ case S2_pstorerbf_pi: // if (!Pv4) memb(Rx32++#s4:0)=Rt32
+ case S2_pstorerbtnew_pi: // if (Pv4.new) memb(Rx32++#s4:0)=Rt32
+ case S2_pstorerbfnew_pi: // if (!Pv4.new) memb(Rx32++#s4:0)=Rt32
+ case S2_pstorerbnewt_pi: // if (Pv4) memb(Rx32++#s4:0)=Nt8.new
+ case S2_pstorerbnewf_pi: // if (!Pv4) memb(Rx32++#s4:0)=Nt8.new
+ case S2_pstorerbnewtnew_pi: // if (Pv4.new) memb(Rx32++#s4:0)=Nt8.new
+ case S2_pstorerbnewfnew_pi: // if (!Pv4.new) memb(Rx32++#s4:0)=Nt8.new
+ case S4_storerb_ap: // memb(Re32=#U6)=Rt32
+ case S4_storerbnew_ap: // memb(Re32=#U6)=Nt8.new
+ case S2_storerb_pr: // memb(Rx32++Mu2)=Rt32
+ case S2_storerbnew_pr: // memb(Rx32++Mu2)=Nt8.new
+ case S4_storerb_ur: // memb(Ru32<<#u2+#U6)=Rt32
+ case S4_storerbnew_ur: // memb(Ru32<<#u2+#U6)=Nt8.new
+ case S2_storerb_pbr: // memb(Rx32++Mu2:brev)=Rt32
+ case S2_storerbnew_pbr: // memb(Rx32++Mu2:brev)=Nt8.new
+ case S2_storerb_pci: // memb(Rx32++#s4:0:circ(Mu2))=Rt32
+ case S2_storerbnew_pci: // memb(Rx32++#s4:0:circ(Mu2))=Nt8.new
+ case S2_storerb_pcr: // memb(Rx32++I:circ(Mu2))=Rt32
+ case S2_storerbnew_pcr: // memb(Rx32++I:circ(Mu2))=Nt8.new
+ case S4_storerb_rr: // memb(Rs32+Ru32<<#u2)=Rt32
+ case S4_storerbnew_rr: // memb(Rs32+Ru32<<#u2)=Nt8.new
+ case S4_pstorerbt_rr: // if (Pv4) memb(Rs32+Ru32<<#u2)=Rt32
+ case S4_pstorerbf_rr: // if (!Pv4) memb(Rs32+Ru32<<#u2)=Rt32
+ case S4_pstorerbtnew_rr: // if (Pv4.new) memb(Rs32+Ru32<<#u2)=Rt32
+ case S4_pstorerbfnew_rr: // if (!Pv4.new) memb(Rs32+Ru32<<#u2)=Rt32
+ case S4_pstorerbnewt_rr: // if (Pv4) memb(Rs32+Ru32<<#u2)=Nt8.new
+ case S4_pstorerbnewf_rr: // if (!Pv4) memb(Rs32+Ru32<<#u2)=Nt8.new
+ case S4_pstorerbnewtnew_rr: // if (Pv4.new) memb(Rs32+Ru32<<#u2)=Nt8.new
+ case S4_pstorerbnewfnew_rr: // if (!Pv4.new) memb(Rs32+Ru32<<#u2)=Nt8.new
+ case S2_storerbgp: // memb(gp+#u16:0)=Rt32
+ case S2_storerbnewgp: // memb(gp+#u16:0)=Nt8.new
+ case S4_pstorerbt_abs: // if (Pv4) memb(#u6)=Rt32
+ case S4_pstorerbf_abs: // if (!Pv4) memb(#u6)=Rt32
+ case S4_pstorerbtnew_abs: // if (Pv4.new) memb(#u6)=Rt32
+ case S4_pstorerbfnew_abs: // if (!Pv4.new) memb(#u6)=Rt32
+ case S4_pstorerbnewt_abs: // if (Pv4) memb(#u6)=Nt8.new
+ case S4_pstorerbnewf_abs: // if (!Pv4) memb(#u6)=Nt8.new
+ case S4_pstorerbnewtnew_abs: // if (Pv4.new) memb(#u6)=Nt8.new
+ case S4_pstorerbnewfnew_abs: // if (!Pv4.new) memb(#u6)=Nt8.new
+ Bits.set(Begin, Begin+8);
+ return true;
+
+ // Store low half
+ case S2_storerh_io: // memh(Rs32+#s11:1)=Rt32
+ case S2_storerhnew_io: // memh(Rs32+#s11:1)=Nt8.new
+ case S2_pstorerht_io: // if (Pv4) memh(Rs32+#u6:1)=Rt32
+ case S2_pstorerhf_io: // if (!Pv4) memh(Rs32+#u6:1)=Rt32
+ case S4_pstorerhtnew_io: // if (Pv4.new) memh(Rs32+#u6:1)=Rt32
+ case S4_pstorerhfnew_io: // if (!Pv4.new) memh(Rs32+#u6:1)=Rt32
+ case S2_pstorerhnewt_io: // if (Pv4) memh(Rs32+#u6:1)=Nt8.new
+ case S2_pstorerhnewf_io: // if (!Pv4) memh(Rs32+#u6:1)=Nt8.new
+ case S4_pstorerhnewtnew_io: // if (Pv4.new) memh(Rs32+#u6:1)=Nt8.new
+ case S4_pstorerhnewfnew_io: // if (!Pv4.new) memh(Rs32+#u6:1)=Nt8.new
+ case S2_storerh_pi: // memh(Rx32++#s4:1)=Rt32
+ case S2_storerhnew_pi: // memh(Rx32++#s4:1)=Nt8.new
+ case S2_pstorerht_pi: // if (Pv4) memh(Rx32++#s4:1)=Rt32
+ case S2_pstorerhf_pi: // if (!Pv4) memh(Rx32++#s4:1)=Rt32
+ case S2_pstorerhtnew_pi: // if (Pv4.new) memh(Rx32++#s4:1)=Rt32
+ case S2_pstorerhfnew_pi: // if (!Pv4.new) memh(Rx32++#s4:1)=Rt32
+ case S2_pstorerhnewt_pi: // if (Pv4) memh(Rx32++#s4:1)=Nt8.new
+ case S2_pstorerhnewf_pi: // if (!Pv4) memh(Rx32++#s4:1)=Nt8.new
+ case S2_pstorerhnewtnew_pi: // if (Pv4.new) memh(Rx32++#s4:1)=Nt8.new
+ case S2_pstorerhnewfnew_pi: // if (!Pv4.new) memh(Rx32++#s4:1)=Nt8.new
+ case S4_storerh_ap: // memh(Re32=#U6)=Rt32
+ case S4_storerhnew_ap: // memh(Re32=#U6)=Nt8.new
+ case S2_storerh_pr: // memh(Rx32++Mu2)=Rt32
+ case S2_storerhnew_pr: // memh(Rx32++Mu2)=Nt8.new
+ case S4_storerh_ur: // memh(Ru32<<#u2+#U6)=Rt32
+ case S4_storerhnew_ur: // memh(Ru32<<#u2+#U6)=Nt8.new
+ case S2_storerh_pbr: // memh(Rx32++Mu2:brev)=Rt32
+ case S2_storerhnew_pbr: // memh(Rx32++Mu2:brev)=Nt8.new
+ case S2_storerh_pci: // memh(Rx32++#s4:1:circ(Mu2))=Rt32
+ case S2_storerhnew_pci: // memh(Rx32++#s4:1:circ(Mu2))=Nt8.new
+ case S2_storerh_pcr: // memh(Rx32++I:circ(Mu2))=Rt32
+ case S2_storerhnew_pcr: // memh(Rx32++I:circ(Mu2))=Nt8.new
+ case S4_storerh_rr: // memh(Rs32+Ru32<<#u2)=Rt32
+ case S4_pstorerht_rr: // if (Pv4) memh(Rs32+Ru32<<#u2)=Rt32
+ case S4_pstorerhf_rr: // if (!Pv4) memh(Rs32+Ru32<<#u2)=Rt32
+ case S4_pstorerhtnew_rr: // if (Pv4.new) memh(Rs32+Ru32<<#u2)=Rt32
+ case S4_pstorerhfnew_rr: // if (!Pv4.new) memh(Rs32+Ru32<<#u2)=Rt32
+ case S4_storerhnew_rr: // memh(Rs32+Ru32<<#u2)=Nt8.new
+ case S4_pstorerhnewt_rr: // if (Pv4) memh(Rs32+Ru32<<#u2)=Nt8.new
+ case S4_pstorerhnewf_rr: // if (!Pv4) memh(Rs32+Ru32<<#u2)=Nt8.new
+ case S4_pstorerhnewtnew_rr: // if (Pv4.new) memh(Rs32+Ru32<<#u2)=Nt8.new
+ case S4_pstorerhnewfnew_rr: // if (!Pv4.new) memh(Rs32+Ru32<<#u2)=Nt8.new
+ case S2_storerhgp: // memh(gp+#u16:1)=Rt32
+ case S2_storerhnewgp: // memh(gp+#u16:1)=Nt8.new
+ case S4_pstorerht_abs: // if (Pv4) memh(#u6)=Rt32
+ case S4_pstorerhf_abs: // if (!Pv4) memh(#u6)=Rt32
+ case S4_pstorerhtnew_abs: // if (Pv4.new) memh(#u6)=Rt32
+ case S4_pstorerhfnew_abs: // if (!Pv4.new) memh(#u6)=Rt32
+ case S4_pstorerhnewt_abs: // if (Pv4) memh(#u6)=Nt8.new
+ case S4_pstorerhnewf_abs: // if (!Pv4) memh(#u6)=Nt8.new
+ case S4_pstorerhnewtnew_abs: // if (Pv4.new) memh(#u6)=Nt8.new
+ case S4_pstorerhnewfnew_abs: // if (!Pv4.new) memh(#u6)=Nt8.new
+ Bits.set(Begin, Begin+16);
+ return true;
+
+ // Store high half
+ case S2_storerf_io: // memh(Rs32+#s11:1)=Rt.H32
+ case S2_pstorerft_io: // if (Pv4) memh(Rs32+#u6:1)=Rt.H32
+ case S2_pstorerff_io: // if (!Pv4) memh(Rs32+#u6:1)=Rt.H32
+ case S4_pstorerftnew_io: // if (Pv4.new) memh(Rs32+#u6:1)=Rt.H32
+ case S4_pstorerffnew_io: // if (!Pv4.new) memh(Rs32+#u6:1)=Rt.H32
+ case S2_storerf_pi: // memh(Rx32++#s4:1)=Rt.H32
+ case S2_pstorerft_pi: // if (Pv4) memh(Rx32++#s4:1)=Rt.H32
+ case S2_pstorerff_pi: // if (!Pv4) memh(Rx32++#s4:1)=Rt.H32
+ case S2_pstorerftnew_pi: // if (Pv4.new) memh(Rx32++#s4:1)=Rt.H32
+ case S2_pstorerffnew_pi: // if (!Pv4.new) memh(Rx32++#s4:1)=Rt.H32
+ case S4_storerf_ap: // memh(Re32=#U6)=Rt.H32
+ case S2_storerf_pr: // memh(Rx32++Mu2)=Rt.H32
+ case S4_storerf_ur: // memh(Ru32<<#u2+#U6)=Rt.H32
+ case S2_storerf_pbr: // memh(Rx32++Mu2:brev)=Rt.H32
+ case S2_storerf_pci: // memh(Rx32++#s4:1:circ(Mu2))=Rt.H32
+ case S2_storerf_pcr: // memh(Rx32++I:circ(Mu2))=Rt.H32
+ case S4_storerf_rr: // memh(Rs32+Ru32<<#u2)=Rt.H32
+ case S4_pstorerft_rr: // if (Pv4) memh(Rs32+Ru32<<#u2)=Rt.H32
+ case S4_pstorerff_rr: // if (!Pv4) memh(Rs32+Ru32<<#u2)=Rt.H32
+ case S4_pstorerftnew_rr: // if (Pv4.new) memh(Rs32+Ru32<<#u2)=Rt.H32
+ case S4_pstorerffnew_rr: // if (!Pv4.new) memh(Rs32+Ru32<<#u2)=Rt.H32
+ case S2_storerfgp: // memh(gp+#u16:1)=Rt.H32
+ case S4_pstorerft_abs: // if (Pv4) memh(#u6)=Rt.H32
+ case S4_pstorerff_abs: // if (!Pv4) memh(#u6)=Rt.H32
+ case S4_pstorerftnew_abs: // if (Pv4.new) memh(#u6)=Rt.H32
+ case S4_pstorerffnew_abs: // if (!Pv4.new) memh(#u6)=Rt.H32
+ Bits.set(Begin+16, Begin+32);
+ return true;
+ }
+
+ return false;
+}
+
+
+// For an instruction with opcode Opc, calculate the set of bits that it
+// uses in a register in operand OpN. This only calculates the set of used
+// bits for cases where it does not depend on any operands (as is the case
+// in shifts, for example). For concrete instructions from a program, the
+// operand may be a subregister of a larger register, while Bits would
+// correspond to the larger register in its entirety. Because of that,
+// the parameter Begin can be used to indicate which bit of Bits should be
+// considered the LSB of the operand.
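+// For example, A2_sxtb uses only bits [0-7] of its source register, so for
+// OpN == 1 the bits [Begin, Begin+8) are marked as used.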
+bool HexagonBitSimplify::getUsedBits(unsigned Opc, unsigned OpN,
+ BitVector &Bits, uint16_t Begin, const HexagonInstrInfo &HII) {
+ using namespace Hexagon;
+
+ const MCInstrDesc &D = HII.get(Opc);
+ if (D.mayStore()) {
+ if (OpN == D.getNumOperands()-1)
+ return getUsedBitsInStore(Opc, Bits, Begin);
+ return false;
+ }
+
+ switch (Opc) {
+ // One register source. Used bits: R1[0-7].
+ case A2_sxtb:
+ case A2_zxtb:
+ case A4_cmpbeqi:
+ case A4_cmpbgti:
+ case A4_cmpbgtui:
+ if (OpN == 1) {
+ Bits.set(Begin, Begin+8);
+ return true;
+ }
+ break;
+
+ // One register source. Used bits: R1[0-15].
+ case A2_aslh:
+ case A2_sxth:
+ case A2_zxth:
+ case A4_cmpheqi:
+ case A4_cmphgti:
+ case A4_cmphgtui:
+ if (OpN == 1) {
+ Bits.set(Begin, Begin+16);
+ return true;
+ }
+ break;
+
+ // One register source. Used bits: R1[16-31].
+ case A2_asrh:
+ if (OpN == 1) {
+ Bits.set(Begin+16, Begin+32);
+ return true;
+ }
+ break;
+
+ // Two register sources. Used bits: R1[0-7], R2[0-7].
+ case A4_cmpbeq:
+ case A4_cmpbgt:
+ case A4_cmpbgtu:
+ if (OpN == 1) {
+ Bits.set(Begin, Begin+8);
+ return true;
+ }
+ break;
+
+ // Two register sources. Used bits: R1[0-15], R2[0-15].
+ case A4_cmpheq:
+ case A4_cmphgt:
+ case A4_cmphgtu:
+ case A2_addh_h16_ll:
+ case A2_addh_h16_sat_ll:
+ case A2_addh_l16_ll:
+ case A2_addh_l16_sat_ll:
+ case A2_combine_ll:
+ case A2_subh_h16_ll:
+ case A2_subh_h16_sat_ll:
+ case A2_subh_l16_ll:
+ case A2_subh_l16_sat_ll:
+ case M2_mpy_acc_ll_s0:
+ case M2_mpy_acc_ll_s1:
+ case M2_mpy_acc_sat_ll_s0:
+ case M2_mpy_acc_sat_ll_s1:
+ case M2_mpy_ll_s0:
+ case M2_mpy_ll_s1:
+ case M2_mpy_nac_ll_s0:
+ case M2_mpy_nac_ll_s1:
+ case M2_mpy_nac_sat_ll_s0:
+ case M2_mpy_nac_sat_ll_s1:
+ case M2_mpy_rnd_ll_s0:
+ case M2_mpy_rnd_ll_s1:
+ case M2_mpy_sat_ll_s0:
+ case M2_mpy_sat_ll_s1:
+ case M2_mpy_sat_rnd_ll_s0:
+ case M2_mpy_sat_rnd_ll_s1:
+ case M2_mpyd_acc_ll_s0:
+ case M2_mpyd_acc_ll_s1:
+ case M2_mpyd_ll_s0:
+ case M2_mpyd_ll_s1:
+ case M2_mpyd_nac_ll_s0:
+ case M2_mpyd_nac_ll_s1:
+ case M2_mpyd_rnd_ll_s0:
+ case M2_mpyd_rnd_ll_s1:
+ case M2_mpyu_acc_ll_s0:
+ case M2_mpyu_acc_ll_s1:
+ case M2_mpyu_ll_s0:
+ case M2_mpyu_ll_s1:
+ case M2_mpyu_nac_ll_s0:
+ case M2_mpyu_nac_ll_s1:
+ case M2_mpyud_acc_ll_s0:
+ case M2_mpyud_acc_ll_s1:
+ case M2_mpyud_ll_s0:
+ case M2_mpyud_ll_s1:
+ case M2_mpyud_nac_ll_s0:
+ case M2_mpyud_nac_ll_s1:
+ if (OpN == 1 || OpN == 2) {
+ Bits.set(Begin, Begin+16);
+ return true;
+ }
+ break;
+
+ // Two register sources. Used bits: R1[0-15], R2[16-31].
+ case A2_addh_h16_lh:
+ case A2_addh_h16_sat_lh:
+ case A2_combine_lh:
+ case A2_subh_h16_lh:
+ case A2_subh_h16_sat_lh:
+ case M2_mpy_acc_lh_s0:
+ case M2_mpy_acc_lh_s1:
+ case M2_mpy_acc_sat_lh_s0:
+ case M2_mpy_acc_sat_lh_s1:
+ case M2_mpy_lh_s0:
+ case M2_mpy_lh_s1:
+ case M2_mpy_nac_lh_s0:
+ case M2_mpy_nac_lh_s1:
+ case M2_mpy_nac_sat_lh_s0:
+ case M2_mpy_nac_sat_lh_s1:
+ case M2_mpy_rnd_lh_s0:
+ case M2_mpy_rnd_lh_s1:
+ case M2_mpy_sat_lh_s0:
+ case M2_mpy_sat_lh_s1:
+ case M2_mpy_sat_rnd_lh_s0:
+ case M2_mpy_sat_rnd_lh_s1:
+ case M2_mpyd_acc_lh_s0:
+ case M2_mpyd_acc_lh_s1:
+ case M2_mpyd_lh_s0:
+ case M2_mpyd_lh_s1:
+ case M2_mpyd_nac_lh_s0:
+ case M2_mpyd_nac_lh_s1:
+ case M2_mpyd_rnd_lh_s0:
+ case M2_mpyd_rnd_lh_s1:
+ case M2_mpyu_acc_lh_s0:
+ case M2_mpyu_acc_lh_s1:
+ case M2_mpyu_lh_s0:
+ case M2_mpyu_lh_s1:
+ case M2_mpyu_nac_lh_s0:
+ case M2_mpyu_nac_lh_s1:
+ case M2_mpyud_acc_lh_s0:
+ case M2_mpyud_acc_lh_s1:
+ case M2_mpyud_lh_s0:
+ case M2_mpyud_lh_s1:
+ case M2_mpyud_nac_lh_s0:
+ case M2_mpyud_nac_lh_s1:
+ // These four are actually LH.
+ case A2_addh_l16_hl:
+ case A2_addh_l16_sat_hl:
+ case A2_subh_l16_hl:
+ case A2_subh_l16_sat_hl:
+ if (OpN == 1) {
+ Bits.set(Begin, Begin+16);
+ return true;
+ }
+ if (OpN == 2) {
+ Bits.set(Begin+16, Begin+32);
+ return true;
+ }
+ break;
+
+ // Two register sources, used bits: R1[16-31], R2[0-15].
+ case A2_addh_h16_hl:
+ case A2_addh_h16_sat_hl:
+ case A2_combine_hl:
+ case A2_subh_h16_hl:
+ case A2_subh_h16_sat_hl:
+ case M2_mpy_acc_hl_s0:
+ case M2_mpy_acc_hl_s1:
+ case M2_mpy_acc_sat_hl_s0:
+ case M2_mpy_acc_sat_hl_s1:
+ case M2_mpy_hl_s0:
+ case M2_mpy_hl_s1:
+ case M2_mpy_nac_hl_s0:
+ case M2_mpy_nac_hl_s1:
+ case M2_mpy_nac_sat_hl_s0:
+ case M2_mpy_nac_sat_hl_s1:
+ case M2_mpy_rnd_hl_s0:
+ case M2_mpy_rnd_hl_s1:
+ case M2_mpy_sat_hl_s0:
+ case M2_mpy_sat_hl_s1:
+ case M2_mpy_sat_rnd_hl_s0:
+ case M2_mpy_sat_rnd_hl_s1:
+ case M2_mpyd_acc_hl_s0:
+ case M2_mpyd_acc_hl_s1:
+ case M2_mpyd_hl_s0:
+ case M2_mpyd_hl_s1:
+ case M2_mpyd_nac_hl_s0:
+ case M2_mpyd_nac_hl_s1:
+ case M2_mpyd_rnd_hl_s0:
+ case M2_mpyd_rnd_hl_s1:
+ case M2_mpyu_acc_hl_s0:
+ case M2_mpyu_acc_hl_s1:
+ case M2_mpyu_hl_s0:
+ case M2_mpyu_hl_s1:
+ case M2_mpyu_nac_hl_s0:
+ case M2_mpyu_nac_hl_s1:
+ case M2_mpyud_acc_hl_s0:
+ case M2_mpyud_acc_hl_s1:
+ case M2_mpyud_hl_s0:
+ case M2_mpyud_hl_s1:
+ case M2_mpyud_nac_hl_s0:
+ case M2_mpyud_nac_hl_s1:
+ if (OpN == 1) {
+ Bits.set(Begin+16, Begin+32);
+ return true;
+ }
+ if (OpN == 2) {
+ Bits.set(Begin, Begin+16);
+ return true;
+ }
+ break;
+
+ // Two register sources, used bits: R1[16-31], R2[16-31].
+ case A2_addh_h16_hh:
+ case A2_addh_h16_sat_hh:
+ case A2_combine_hh:
+ case A2_subh_h16_hh:
+ case A2_subh_h16_sat_hh:
+ case M2_mpy_acc_hh_s0:
+ case M2_mpy_acc_hh_s1:
+ case M2_mpy_acc_sat_hh_s0:
+ case M2_mpy_acc_sat_hh_s1:
+ case M2_mpy_hh_s0:
+ case M2_mpy_hh_s1:
+ case M2_mpy_nac_hh_s0:
+ case M2_mpy_nac_hh_s1:
+ case M2_mpy_nac_sat_hh_s0:
+ case M2_mpy_nac_sat_hh_s1:
+ case M2_mpy_rnd_hh_s0:
+ case M2_mpy_rnd_hh_s1:
+ case M2_mpy_sat_hh_s0:
+ case M2_mpy_sat_hh_s1:
+ case M2_mpy_sat_rnd_hh_s0:
+ case M2_mpy_sat_rnd_hh_s1:
+ case M2_mpyd_acc_hh_s0:
+ case M2_mpyd_acc_hh_s1:
+ case M2_mpyd_hh_s0:
+ case M2_mpyd_hh_s1:
+ case M2_mpyd_nac_hh_s0:
+ case M2_mpyd_nac_hh_s1:
+ case M2_mpyd_rnd_hh_s0:
+ case M2_mpyd_rnd_hh_s1:
+ case M2_mpyu_acc_hh_s0:
+ case M2_mpyu_acc_hh_s1:
+ case M2_mpyu_hh_s0:
+ case M2_mpyu_hh_s1:
+ case M2_mpyu_nac_hh_s0:
+ case M2_mpyu_nac_hh_s1:
+ case M2_mpyud_acc_hh_s0:
+ case M2_mpyud_acc_hh_s1:
+ case M2_mpyud_hh_s0:
+ case M2_mpyud_hh_s1:
+ case M2_mpyud_nac_hh_s0:
+ case M2_mpyud_nac_hh_s1:
+ if (OpN == 1 || OpN == 2) {
+ Bits.set(Begin+16, Begin+32);
+ return true;
+ }
+ break;
+ }
+
+ return false;
+}
+
+
+// Calculate the register class that matches Reg:Sub. For example, if
+// vreg1 is a double register, then vreg1:subreg_hireg would match the
+// "int" register class.
+const TargetRegisterClass *HexagonBitSimplify::getFinalVRegClass(
+ const BitTracker::RegisterRef &RR, MachineRegisterInfo &MRI) {
+ if (!TargetRegisterInfo::isVirtualRegister(RR.Reg))
+ return nullptr;
+ auto *RC = MRI.getRegClass(RR.Reg);
+ if (RR.Sub == 0)
+ return RC;
+
+ auto VerifySR = [] (unsigned Sub) -> void {
+ assert(Sub == Hexagon::subreg_hireg || Sub == Hexagon::subreg_loreg);
+ };
+
+ switch (RC->getID()) {
+ case Hexagon::DoubleRegsRegClassID:
+ VerifySR(RR.Sub);
+ return &Hexagon::IntRegsRegClass;
+ }
+ return nullptr;
+}
+
+
+// Check if RD could be replaced with RS at any possible use of RD.
+// For example, a predicate register cannot be replaced with an integer
+// register, but a 64-bit register with a subregister can be replaced
+// with a 32-bit register.
+bool HexagonBitSimplify::isTransparentCopy(const BitTracker::RegisterRef &RD,
+ const BitTracker::RegisterRef &RS, MachineRegisterInfo &MRI) {
+ if (!TargetRegisterInfo::isVirtualRegister(RD.Reg) ||
+ !TargetRegisterInfo::isVirtualRegister(RS.Reg))
+ return false;
+ // Return false if one (or both) classes are nullptr.
+ auto *DRC = getFinalVRegClass(RD, MRI);
+ if (!DRC)
+ return false;
+
+ return DRC == getFinalVRegClass(RS, MRI);
+}
+
+
+//
+// Dead code elimination
+//
+namespace {
+ class DeadCodeElimination {
+ public:
+ DeadCodeElimination(MachineFunction &mf, MachineDominatorTree &mdt)
+ : MF(mf), HII(*MF.getSubtarget<HexagonSubtarget>().getInstrInfo()),
+ MDT(mdt), MRI(mf.getRegInfo()) {}
+
+ bool run() {
+ return runOnNode(MDT.getRootNode());
+ }
+
+ private:
+ bool isDead(unsigned R) const;
+ bool runOnNode(MachineDomTreeNode *N);
+
+ MachineFunction &MF;
+ const HexagonInstrInfo &HII;
+ MachineDominatorTree &MDT;
+ MachineRegisterInfo &MRI;
+ };
+}
+
+
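+// A register is considered dead if its only uses are debug values, or PHIs
+// whose result is the register itself (i.e. self-referencing cycles).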
+bool DeadCodeElimination::isDead(unsigned R) const {
+ for (auto I = MRI.use_begin(R), E = MRI.use_end(); I != E; ++I) {
+ MachineInstr *UseI = I->getParent();
+ if (UseI->isDebugValue())
+ continue;
+ if (UseI->isPHI()) {
+ assert(!UseI->getOperand(0).getSubReg());
+ unsigned DR = UseI->getOperand(0).getReg();
+ if (DR == R)
+ continue;
+ }
+ return false;
+ }
+ return true;
+}
+
+
+bool DeadCodeElimination::runOnNode(MachineDomTreeNode *N) {
+ bool Changed = false;
+ typedef GraphTraits<MachineDomTreeNode*> GTN;
+ for (auto I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I)
+ Changed |= runOnNode(*I);
+
+ MachineBasicBlock *B = N->getBlock();
+ std::vector<MachineInstr*> Instrs;
+ for (auto I = B->rbegin(), E = B->rend(); I != E; ++I)
+ Instrs.push_back(&*I);
+
+ for (auto MI : Instrs) {
+ unsigned Opc = MI->getOpcode();
+ // Do not touch lifetime markers. This is why the target-independent DCE
+ // cannot be used.
+ if (Opc == TargetOpcode::LIFETIME_START ||
+ Opc == TargetOpcode::LIFETIME_END)
+ continue;
+ bool Store = false;
+ if (MI->isInlineAsm())
+ continue;
+ // Delete PHIs if possible.
+ if (!MI->isPHI() && !MI->isSafeToMove(nullptr, Store))
+ continue;
+
+ bool AllDead = true;
+ SmallVector<unsigned,2> Regs;
+ for (auto &Op : MI->operands()) {
+ if (!Op.isReg() || !Op.isDef())
+ continue;
+ unsigned R = Op.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(R) || !isDead(R)) {
+ AllDead = false;
+ break;
+ }
+ Regs.push_back(R);
+ }
+ if (!AllDead)
+ continue;
+
+ B->erase(MI);
+ for (unsigned i = 0, n = Regs.size(); i != n; ++i)
+ MRI.markUsesInDebugValueAsUndef(Regs[i]);
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+
+//
+// Eliminate redundant instructions
+//
+// This transformation will identify instructions where the output register
+// is the same as one of its input registers. This only works on instructions
+// that define a single register (unlike post-increment loads, for example).
+// The equality check is actually more detailed: the code calculates which
+// bits of the output are used, and only compares these bits with the input
+// registers.
+// If the output matches an input, the instruction is replaced with COPY.
+// The copies will be removed by another transformation.
+namespace {
+ class RedundantInstrElimination : public Transformation {
+ public:
+ RedundantInstrElimination(BitTracker &bt, const HexagonInstrInfo &hii,
+ MachineRegisterInfo &mri)
+ : Transformation(true), HII(hii), MRI(mri), BT(bt) {}
+ bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override;
+ private:
+ bool isLossyShiftLeft(const MachineInstr &MI, unsigned OpN,
+ unsigned &LostB, unsigned &LostE);
+ bool isLossyShiftRight(const MachineInstr &MI, unsigned OpN,
+ unsigned &LostB, unsigned &LostE);
+ bool computeUsedBits(unsigned Reg, BitVector &Bits);
+ bool computeUsedBits(const MachineInstr &MI, unsigned OpN, BitVector &Bits,
+ uint16_t Begin);
+ bool usedBitsEqual(BitTracker::RegisterRef RD, BitTracker::RegisterRef RS);
+
+ const HexagonInstrInfo &HII;
+ MachineRegisterInfo &MRI;
+ BitTracker &BT;
+ };
+}
+
+
+// Check if the instruction is a lossy shift left, where the input being
+// shifted is the operand OpN of MI. If true, [LostB, LostE) is the range
+// of bit indices that are lost.
+bool RedundantInstrElimination::isLossyShiftLeft(const MachineInstr &MI,
+ unsigned OpN, unsigned &LostB, unsigned &LostE) {
+ using namespace Hexagon;
+ unsigned Opc = MI.getOpcode();
+ unsigned ImN, RegN, Width;
+ switch (Opc) {
+ case S2_asl_i_p:
+ ImN = 2;
+ RegN = 1;
+ Width = 64;
+ break;
+ case S2_asl_i_p_acc:
+ case S2_asl_i_p_and:
+ case S2_asl_i_p_nac:
+ case S2_asl_i_p_or:
+ case S2_asl_i_p_xacc:
+ ImN = 3;
+ RegN = 2;
+ Width = 64;
+ break;
+ case S2_asl_i_r:
+ ImN = 2;
+ RegN = 1;
+ Width = 32;
+ break;
+ case S2_addasl_rrri:
+ case S4_andi_asl_ri:
+ case S4_ori_asl_ri:
+ case S4_addi_asl_ri:
+ case S4_subi_asl_ri:
+ case S2_asl_i_r_acc:
+ case S2_asl_i_r_and:
+ case S2_asl_i_r_nac:
+ case S2_asl_i_r_or:
+ case S2_asl_i_r_sat:
+ case S2_asl_i_r_xacc:
+ ImN = 3;
+ RegN = 2;
+ Width = 32;
+ break;
+ default:
+ return false;
+ }
+
+ if (RegN != OpN)
+ return false;
+
+ assert(MI.getOperand(ImN).isImm());
+ unsigned S = MI.getOperand(ImN).getImm();
+ if (S == 0)
+ return false;
+ LostB = Width-S;
+ LostE = Width;
+ return true;
+}
+
+
+// Check if the instruction is a lossy shift right, where the input being
+// shifted is the operand OpN of MI. If true, [LostB, LostE) is the range
+// of bit indices that are lost.
+bool RedundantInstrElimination::isLossyShiftRight(const MachineInstr &MI,
+ unsigned OpN, unsigned &LostB, unsigned &LostE) {
+ using namespace Hexagon;
+ unsigned Opc = MI.getOpcode();
+ unsigned ImN, RegN;
+ switch (Opc) {
+ case S2_asr_i_p:
+ case S2_lsr_i_p:
+ ImN = 2;
+ RegN = 1;
+ break;
+ case S2_asr_i_p_acc:
+ case S2_asr_i_p_and:
+ case S2_asr_i_p_nac:
+ case S2_asr_i_p_or:
+ case S2_lsr_i_p_acc:
+ case S2_lsr_i_p_and:
+ case S2_lsr_i_p_nac:
+ case S2_lsr_i_p_or:
+ case S2_lsr_i_p_xacc:
+ ImN = 3;
+ RegN = 2;
+ break;
+ case S2_asr_i_r:
+ case S2_lsr_i_r:
+ ImN = 2;
+ RegN = 1;
+ break;
+ case S4_andi_lsr_ri:
+ case S4_ori_lsr_ri:
+ case S4_addi_lsr_ri:
+ case S4_subi_lsr_ri:
+ case S2_asr_i_r_acc:
+ case S2_asr_i_r_and:
+ case S2_asr_i_r_nac:
+ case S2_asr_i_r_or:
+ case S2_lsr_i_r_acc:
+ case S2_lsr_i_r_and:
+ case S2_lsr_i_r_nac:
+ case S2_lsr_i_r_or:
+ case S2_lsr_i_r_xacc:
+ ImN = 3;
+ RegN = 2;
+ break;
+
+ default:
+ return false;
+ }
+
+ if (RegN != OpN)
+ return false;
+
+ assert(MI.getOperand(ImN).isImm());
+ unsigned S = MI.getOperand(ImN).getImm();
+ LostB = 0;
+ LostE = S;
+ return true;
+}
+
+
+// Calculate the bit vector that corresponds to the used bits of register Reg.
+// The vector Bits has the same size as Reg in bits. If the calculation
+// fails (i.e. the used bits are unknown), it returns false. Otherwise,
+// it returns true and sets the corresponding bits in Bits.
+bool RedundantInstrElimination::computeUsedBits(unsigned Reg, BitVector &Bits) {
+ BitVector Used(Bits.size());
+ RegisterSet Visited;
+ std::vector<unsigned> Pending;
+ Pending.push_back(Reg);
+
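+  // Worklist traversal: follow COPY and PHI uses transitively, and accumulate
+  // the bits used by every other user of the register.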
+ for (unsigned i = 0; i < Pending.size(); ++i) {
+ unsigned R = Pending[i];
+ if (Visited.has(R))
+ continue;
+ Visited.insert(R);
+ for (auto I = MRI.use_begin(R), E = MRI.use_end(); I != E; ++I) {
+ BitTracker::RegisterRef UR = *I;
+ unsigned B, W;
+ if (!HBS::getSubregMask(UR, B, W, MRI))
+ return false;
+ MachineInstr &UseI = *I->getParent();
+ if (UseI.isPHI() || UseI.isCopy()) {
+ unsigned DefR = UseI.getOperand(0).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(DefR))
+ return false;
+ Pending.push_back(DefR);
+ } else {
+ if (!computeUsedBits(UseI, I.getOperandNo(), Used, B))
+ return false;
+ }
+ }
+ }
+ Bits |= Used;
+ return true;
+}
+
+
+// Calculate the bits used by instruction MI in a register in operand OpN.
+// Return true/false if the calculation succeeds/fails. If it succeeds, set
+// used bits in Bits. This function does not reset any bits in Bits, so
+// subsequent calls over different instructions will result in the union
+// of the used bits in all these instructions.
+// The register in question may be used with a sub-register, whereas Bits
+// holds the bits for the entire register. To keep track of that, the
+// argument Begin indicates where in Bits is the lowest-significant bit
+// of the register used in operand OpN. For example, in instruction:
+// vreg1 = S2_lsr_i_r vreg2:subreg_hireg, 10
+// the operand 1 is a 32-bit register, which happens to be a subregister
+// of the 64-bit register vreg2, and that subregister starts at position 32.
+// In this case Begin=32, since Bits[32] would be the lowest-significant bit
+// of vreg2:subreg_hireg.
+bool RedundantInstrElimination::computeUsedBits(const MachineInstr &MI,
+ unsigned OpN, BitVector &Bits, uint16_t Begin) {
+ unsigned Opc = MI.getOpcode();
+ BitVector T(Bits.size());
+ bool GotBits = HBS::getUsedBits(Opc, OpN, T, Begin, HII);
+ // Even if we don't have bits yet, we could still provide some information
+ // if the instruction is a lossy shift: the lost bits will be marked as
+ // not used.
+ unsigned LB, LE;
+ if (isLossyShiftLeft(MI, OpN, LB, LE) || isLossyShiftRight(MI, OpN, LB, LE)) {
+ assert(MI.getOperand(OpN).isReg());
+ BitTracker::RegisterRef RR = MI.getOperand(OpN);
+ const TargetRegisterClass *RC = HBS::getFinalVRegClass(RR, MRI);
+ uint16_t Width = RC->getSize()*8;
+
+ if (!GotBits)
+ T.set(Begin, Begin+Width);
+ assert(LB <= LE && LB < Width && LE <= Width);
+ T.reset(Begin+LB, Begin+LE);
+ GotBits = true;
+ }
+ if (GotBits)
+ Bits |= T;
+ return GotBits;
+}
+
+
+// Calculates the used bits in RD ("defined register"), and checks if these
+// bits in RS ("used register") and RD are identical.
+bool RedundantInstrElimination::usedBitsEqual(BitTracker::RegisterRef RD,
+ BitTracker::RegisterRef RS) {
+ const BitTracker::RegisterCell &DC = BT.lookup(RD.Reg);
+ const BitTracker::RegisterCell &SC = BT.lookup(RS.Reg);
+
+ unsigned DB, DW;
+ if (!HBS::getSubregMask(RD, DB, DW, MRI))
+ return false;
+ unsigned SB, SW;
+ if (!HBS::getSubregMask(RS, SB, SW, MRI))
+ return false;
+ if (SW != DW)
+ return false;
+
+ BitVector Used(DC.width());
+ if (!computeUsedBits(RD.Reg, Used))
+ return false;
+
+ for (unsigned i = 0; i != DW; ++i)
+ if (Used[i+DB] && DC[DB+i] != SC[SB+i])
+ return false;
+ return true;
+}
+
+
+bool RedundantInstrElimination::processBlock(MachineBasicBlock &B,
+ const RegisterSet&) {
+ bool Changed = false;
+
+ for (auto I = B.begin(), E = B.end(), NextI = I; I != E; ++I) {
+ NextI = std::next(I);
+ MachineInstr *MI = &*I;
+
+ if (MI->getOpcode() == TargetOpcode::COPY)
+ continue;
+ if (MI->hasUnmodeledSideEffects() || MI->isInlineAsm())
+ continue;
+ unsigned NumD = MI->getDesc().getNumDefs();
+ if (NumD != 1)
+ continue;
+
+ BitTracker::RegisterRef RD = MI->getOperand(0);
+ if (!BT.has(RD.Reg))
+ continue;
+ const BitTracker::RegisterCell &DC = BT.lookup(RD.Reg);
+
+ // Find a source operand that is equal to the result.
+ for (auto &Op : MI->uses()) {
+ if (!Op.isReg())
+ continue;
+ BitTracker::RegisterRef RS = Op;
+ if (!BT.has(RS.Reg))
+ continue;
+ if (!HBS::isTransparentCopy(RD, RS, MRI))
+ continue;
+
+ unsigned BN, BW;
+ if (!HBS::getSubregMask(RS, BN, BW, MRI))
+ continue;
+
+ const BitTracker::RegisterCell &SC = BT.lookup(RS.Reg);
+ if (!usedBitsEqual(RD, RS) && !HBS::isEqual(DC, 0, SC, BN, BW))
+ continue;
+
+ // If found, replace the instruction with a COPY.
+ DebugLoc DL = MI->getDebugLoc();
+ const TargetRegisterClass *FRC = HBS::getFinalVRegClass(RD, MRI);
+ unsigned NewR = MRI.createVirtualRegister(FRC);
+ BuildMI(B, I, DL, HII.get(TargetOpcode::COPY), NewR)
+ .addReg(RS.Reg, 0, RS.Sub);
+ HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI);
+ BT.put(BitTracker::RegisterRef(NewR), SC);
+ Changed = true;
+ break;
+ }
+ }
+
+ return Changed;
+}
+
+
+//
+// Const generation
+//
+// Recognize instructions that produce constant values known at compile-time.
+// Replace them with register definitions that load these constants directly.
+namespace {
+ class ConstGeneration : public Transformation {
+ public:
+ ConstGeneration(BitTracker &bt, const HexagonInstrInfo &hii,
+ MachineRegisterInfo &mri)
+ : Transformation(true), HII(hii), MRI(mri), BT(bt) {}
+ bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override;
+ private:
+ bool isTfrConst(const MachineInstr *MI) const;
+ bool isConst(unsigned R, int64_t &V) const;
+ unsigned genTfrConst(const TargetRegisterClass *RC, int64_t C,
+ MachineBasicBlock &B, MachineBasicBlock::iterator At, DebugLoc &DL);
+
+ const HexagonInstrInfo &HII;
+ MachineRegisterInfo &MRI;
+ BitTracker &BT;
+ };
+}
+
+bool ConstGeneration::isConst(unsigned R, int64_t &C) const {
+ if (!BT.has(R))
+ return false;
+ const BitTracker::RegisterCell &RC = BT.lookup(R);
+ int64_t T = 0;
+ for (unsigned i = RC.width(); i > 0; --i) {
+ const BitTracker::BitValue &V = RC[i-1];
+ T <<= 1;
+ if (V.is(1))
+ T |= 1;
+ else if (!V.is(0))
+ return false;
+ }
+ C = T;
+ return true;
+}
+
+
+bool ConstGeneration::isTfrConst(const MachineInstr *MI) const {
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ case Hexagon::A2_combineii:
+ case Hexagon::A4_combineii:
+ case Hexagon::A2_tfrsi:
+ case Hexagon::A2_tfrpi:
+ case Hexagon::TFR_PdTrue:
+ case Hexagon::TFR_PdFalse:
+ case Hexagon::CONST32_Int_Real:
+ case Hexagon::CONST64_Int_Real:
+ return true;
+ }
+ return false;
+}
+
+
+// Generate a transfer-immediate instruction that is appropriate for the
+// register class and the actual value being transferred.
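+// For example, a 64-bit constant where neither 32-bit half fits in 8 bits is
+// materialized with CONST64_Int_Real, and a predicate constant must be either
+// zero (TFR_PdFalse) or have all eight bits set (TFR_PdTrue).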
+unsigned ConstGeneration::genTfrConst(const TargetRegisterClass *RC, int64_t C,
+ MachineBasicBlock &B, MachineBasicBlock::iterator At, DebugLoc &DL) {
+ unsigned Reg = MRI.createVirtualRegister(RC);
+ if (RC == &Hexagon::IntRegsRegClass) {
+ BuildMI(B, At, DL, HII.get(Hexagon::A2_tfrsi), Reg)
+ .addImm(int32_t(C));
+ return Reg;
+ }
+
+ if (RC == &Hexagon::DoubleRegsRegClass) {
+ if (isInt<8>(C)) {
+ BuildMI(B, At, DL, HII.get(Hexagon::A2_tfrpi), Reg)
+ .addImm(C);
+ return Reg;
+ }
+
+ unsigned Lo = Lo_32(C), Hi = Hi_32(C);
+ if (isInt<8>(Lo) || isInt<8>(Hi)) {
+ unsigned Opc = isInt<8>(Lo) ? Hexagon::A2_combineii
+ : Hexagon::A4_combineii;
+ BuildMI(B, At, DL, HII.get(Opc), Reg)
+ .addImm(int32_t(Hi))
+ .addImm(int32_t(Lo));
+ return Reg;
+ }
+
+ BuildMI(B, At, DL, HII.get(Hexagon::CONST64_Int_Real), Reg)
+ .addImm(C);
+ return Reg;
+ }
+
+ if (RC == &Hexagon::PredRegsRegClass) {
+ unsigned Opc;
+ if (C == 0)
+ Opc = Hexagon::TFR_PdFalse;
+ else if ((C & 0xFF) == 0xFF)
+ Opc = Hexagon::TFR_PdTrue;
+ else
+ return 0;
+ BuildMI(B, At, DL, HII.get(Opc), Reg);
+ return Reg;
+ }
+
+ return 0;
+}
+
+
+bool ConstGeneration::processBlock(MachineBasicBlock &B, const RegisterSet&) {
+ bool Changed = false;
+ RegisterSet Defs;
+
+ for (auto I = B.begin(), E = B.end(); I != E; ++I) {
+ if (isTfrConst(I))
+ continue;
+ Defs.clear();
+ HBS::getInstrDefs(*I, Defs);
+ if (Defs.count() != 1)
+ continue;
+ unsigned DR = Defs.find_first();
+ if (!TargetRegisterInfo::isVirtualRegister(DR))
+ continue;
+ int64_t C;
+ if (isConst(DR, C)) {
+ DebugLoc DL = I->getDebugLoc();
+ auto At = I->isPHI() ? B.getFirstNonPHI() : I;
+ unsigned ImmReg = genTfrConst(MRI.getRegClass(DR), C, B, At, DL);
+ if (ImmReg) {
+ HBS::replaceReg(DR, ImmReg, MRI);
+ BT.put(ImmReg, BT.lookup(DR));
+ Changed = true;
+ }
+ }
+ }
+ return Changed;
+}
+
+
+//
+// Copy generation
+//
+// Identify pairs of available registers which hold identical values.
+// In such cases, only one of them needs to be calculated, the other one
+// will be defined as a copy of the first.
+//
+// Copy propagation
+//
+// Eliminate register copies RD = RS by replacing the uses of RD with
+// uses of RS.
+namespace {
+ class CopyGeneration : public Transformation {
+ public:
+ CopyGeneration(BitTracker &bt, const HexagonInstrInfo &hii,
+ MachineRegisterInfo &mri)
+ : Transformation(true), HII(hii), MRI(mri), BT(bt) {}
+ bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override;
+ private:
+ bool findMatch(const BitTracker::RegisterRef &Inp,
+ BitTracker::RegisterRef &Out, const RegisterSet &AVs);
+
+ const HexagonInstrInfo &HII;
+ MachineRegisterInfo &MRI;
+ BitTracker &BT;
+ };
+
+ class CopyPropagation : public Transformation {
+ public:
+ CopyPropagation(const HexagonRegisterInfo &hri, MachineRegisterInfo &mri)
+ : Transformation(false), MRI(mri) {}
+ bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override;
+ static bool isCopyReg(unsigned Opc);
+ private:
+ bool propagateRegCopy(MachineInstr &MI);
+
+ MachineRegisterInfo &MRI;
+ };
+
+}
+
+
+/// Check if there is a register in AVs that is identical to Inp. If so,
+/// set Out to the found register. The output may be a pair Reg:Sub.
+bool CopyGeneration::findMatch(const BitTracker::RegisterRef &Inp,
+ BitTracker::RegisterRef &Out, const RegisterSet &AVs) {
+ if (!BT.has(Inp.Reg))
+ return false;
+ const BitTracker::RegisterCell &InpRC = BT.lookup(Inp.Reg);
+ unsigned B, W;
+ if (!HBS::getSubregMask(Inp, B, W, MRI))
+ return false;
+
+ for (unsigned R = AVs.find_first(); R; R = AVs.find_next(R)) {
+ if (!BT.has(R) || !HBS::isTransparentCopy(R, Inp, MRI))
+ continue;
+ const BitTracker::RegisterCell &RC = BT.lookup(R);
+ unsigned RW = RC.width();
+ if (W == RW) {
+ if (MRI.getRegClass(Inp.Reg) != MRI.getRegClass(R))
+ continue;
+ if (!HBS::isEqual(InpRC, B, RC, 0, W))
+ continue;
+ Out.Reg = R;
+ Out.Sub = 0;
+ return true;
+ }
+    // Check if there is a super-register whose part (selected by a
+    // subregister) is equal to the input.
+ // Only do double registers for now.
+ if (W*2 != RW)
+ continue;
+ if (MRI.getRegClass(R) != &Hexagon::DoubleRegsRegClass)
+ continue;
+
+ if (HBS::isEqual(InpRC, B, RC, 0, W))
+ Out.Sub = Hexagon::subreg_loreg;
+ else if (HBS::isEqual(InpRC, B, RC, W, W))
+ Out.Sub = Hexagon::subreg_hireg;
+ else
+ continue;
+ Out.Reg = R;
+ return true;
+ }
+ return false;
+}
+
+
+bool CopyGeneration::processBlock(MachineBasicBlock &B,
+ const RegisterSet &AVs) {
+ RegisterSet AVB(AVs);
+ bool Changed = false;
+ RegisterSet Defs;
+
+ for (auto I = B.begin(), E = B.end(), NextI = I; I != E;
+ ++I, AVB.insert(Defs)) {
+ NextI = std::next(I);
+ Defs.clear();
+ HBS::getInstrDefs(*I, Defs);
+
+ unsigned Opc = I->getOpcode();
+ if (CopyPropagation::isCopyReg(Opc))
+ continue;
+
+ for (unsigned R = Defs.find_first(); R; R = Defs.find_next(R)) {
+ BitTracker::RegisterRef MR;
+ if (!findMatch(R, MR, AVB))
+ continue;
+ DebugLoc DL = I->getDebugLoc();
+ auto *FRC = HBS::getFinalVRegClass(MR, MRI);
+ unsigned NewR = MRI.createVirtualRegister(FRC);
+ auto At = I->isPHI() ? B.getFirstNonPHI() : I;
+ BuildMI(B, At, DL, HII.get(TargetOpcode::COPY), NewR)
+ .addReg(MR.Reg, 0, MR.Sub);
+ BT.put(BitTracker::RegisterRef(NewR), BT.get(MR));
+ }
+ }
+
+ return Changed;
+}
+
+
+bool CopyPropagation::isCopyReg(unsigned Opc) {
+ switch (Opc) {
+ case TargetOpcode::COPY:
+ case TargetOpcode::REG_SEQUENCE:
+ case Hexagon::A2_tfr:
+ case Hexagon::A2_tfrp:
+ case Hexagon::A2_combinew:
+ case Hexagon::A4_combineir:
+ case Hexagon::A4_combineri:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+
+bool CopyPropagation::propagateRegCopy(MachineInstr &MI) {
+ bool Changed = false;
+ unsigned Opc = MI.getOpcode();
+ BitTracker::RegisterRef RD = MI.getOperand(0);
+ assert(MI.getOperand(0).getSubReg() == 0);
+
+ switch (Opc) {
+ case TargetOpcode::COPY:
+ case Hexagon::A2_tfr:
+ case Hexagon::A2_tfrp: {
+ BitTracker::RegisterRef RS = MI.getOperand(1);
+ if (!HBS::isTransparentCopy(RD, RS, MRI))
+ break;
+ if (RS.Sub != 0)
+ Changed = HBS::replaceRegWithSub(RD.Reg, RS.Reg, RS.Sub, MRI);
+ else
+ Changed = HBS::replaceReg(RD.Reg, RS.Reg, MRI);
+ break;
+ }
+ case TargetOpcode::REG_SEQUENCE: {
+ BitTracker::RegisterRef SL, SH;
+ if (HBS::parseRegSequence(MI, SL, SH)) {
+ Changed = HBS::replaceSubWithSub(RD.Reg, Hexagon::subreg_loreg,
+ SL.Reg, SL.Sub, MRI);
+ Changed |= HBS::replaceSubWithSub(RD.Reg, Hexagon::subreg_hireg,
+ SH.Reg, SH.Sub, MRI);
+ }
+ break;
+ }
+ case Hexagon::A2_combinew: {
+ BitTracker::RegisterRef RH = MI.getOperand(1), RL = MI.getOperand(2);
+ Changed = HBS::replaceSubWithSub(RD.Reg, Hexagon::subreg_loreg,
+ RL.Reg, RL.Sub, MRI);
+ Changed |= HBS::replaceSubWithSub(RD.Reg, Hexagon::subreg_hireg,
+ RH.Reg, RH.Sub, MRI);
+ break;
+ }
+ case Hexagon::A4_combineir:
+ case Hexagon::A4_combineri: {
+ unsigned SrcX = (Opc == Hexagon::A4_combineir) ? 2 : 1;
+ unsigned Sub = (Opc == Hexagon::A4_combineir) ? Hexagon::subreg_loreg
+ : Hexagon::subreg_hireg;
+ BitTracker::RegisterRef RS = MI.getOperand(SrcX);
+ Changed = HBS::replaceSubWithSub(RD.Reg, Sub, RS.Reg, RS.Sub, MRI);
+ break;
+ }
+ }
+ return Changed;
+}
+
+
+bool CopyPropagation::processBlock(MachineBasicBlock &B, const RegisterSet&) {
+ std::vector<MachineInstr*> Instrs;
+ for (auto I = B.rbegin(), E = B.rend(); I != E; ++I)
+ Instrs.push_back(&*I);
+
+ bool Changed = false;
+ for (auto I : Instrs) {
+ unsigned Opc = I->getOpcode();
+ if (!CopyPropagation::isCopyReg(Opc))
+ continue;
+ Changed |= propagateRegCopy(*I);
+ }
+
+ return Changed;
+}
+
+
+//
+// Bit simplification
+//
+// Recognize patterns that can be simplified and replace them with the
+// simpler forms.
+// This is by no means complete.
+namespace {
+ class BitSimplification : public Transformation {
+ public:
+ BitSimplification(BitTracker &bt, const HexagonInstrInfo &hii,
+ MachineRegisterInfo &mri)
+ : Transformation(true), HII(hii), MRI(mri), BT(bt) {}
+ bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override;
+ private:
+ struct RegHalf : public BitTracker::RegisterRef {
+ bool Low; // Low/High halfword.
+ };
+
+ bool matchHalf(unsigned SelfR, const BitTracker::RegisterCell &RC,
+ unsigned B, RegHalf &RH);
+
+ bool matchPackhl(unsigned SelfR, const BitTracker::RegisterCell &RC,
+ BitTracker::RegisterRef &Rs, BitTracker::RegisterRef &Rt);
+ unsigned getCombineOpcode(bool HLow, bool LLow);
+
+ bool genStoreUpperHalf(MachineInstr *MI);
+ bool genStoreImmediate(MachineInstr *MI);
+ bool genPackhl(MachineInstr *MI, BitTracker::RegisterRef RD,
+ const BitTracker::RegisterCell &RC);
+ bool genExtractHalf(MachineInstr *MI, BitTracker::RegisterRef RD,
+ const BitTracker::RegisterCell &RC);
+ bool genCombineHalf(MachineInstr *MI, BitTracker::RegisterRef RD,
+ const BitTracker::RegisterCell &RC);
+ bool genExtractLow(MachineInstr *MI, BitTracker::RegisterRef RD,
+ const BitTracker::RegisterCell &RC);
+ bool simplifyTstbit(MachineInstr *MI, BitTracker::RegisterRef RD,
+ const BitTracker::RegisterCell &RC);
+
+ const HexagonInstrInfo &HII;
+ MachineRegisterInfo &MRI;
+ BitTracker &BT;
+ };
+}
+
+
+// Check if the bits [B..B+16) in register cell RC form a valid halfword,
+// i.e. [0..16), [16..32), etc. of some register. If so, return true and
+// set the information about the found register in RH.
+bool BitSimplification::matchHalf(unsigned SelfR,
+ const BitTracker::RegisterCell &RC, unsigned B, RegHalf &RH) {
+ // XXX This could be searching in the set of available registers, in case
+ // the match is not exact.
+
+ // Match 16-bit chunks, where the RC[B..B+15] references exactly one
+ // register and all the bits B..B+15 match between RC and the register.
+ // This is meant to match "v1[0-15]", where v1 = { [0]:0 [1-15]:v1... },
+ // and RC = { [0]:0 [1-15]:v1[1-15]... }.
+ bool Low = false;
+ unsigned I = B;
+ while (I < B+16 && RC[I].num())
+ I++;
+ if (I == B+16)
+ return false;
+
+ unsigned Reg = RC[I].RefI.Reg;
+ unsigned P = RC[I].RefI.Pos; // The RefI.Pos will be advanced by I-B.
+ if (P < I-B)
+ return false;
+ unsigned Pos = P - (I-B);
+
+ if (Reg == 0 || Reg == SelfR) // Don't match "self".
+ return false;
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ return false;
+ if (!BT.has(Reg))
+ return false;
+
+ const BitTracker::RegisterCell &SC = BT.lookup(Reg);
+ if (Pos+16 > SC.width())
+ return false;
+
+ for (unsigned i = 0; i < 16; ++i) {
+ const BitTracker::BitValue &RV = RC[i+B];
+ if (RV.Type == BitTracker::BitValue::Ref) {
+ if (RV.RefI.Reg != Reg)
+ return false;
+ if (RV.RefI.Pos != i+Pos)
+ return false;
+ continue;
+ }
+ if (RC[i+B] != SC[i+Pos])
+ return false;
+ }
+
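+  // Map the starting bit position within the source register to a subregister
+  // and to the low/high halfword within that subregister:
+  //   0 -> loreg.L, 16 -> loreg.H, 32 -> hireg.L, 48 -> hireg.H.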
+ unsigned Sub = 0;
+ switch (Pos) {
+ case 0:
+ Sub = Hexagon::subreg_loreg;
+ Low = true;
+ break;
+ case 16:
+ Sub = Hexagon::subreg_loreg;
+ Low = false;
+ break;
+ case 32:
+ Sub = Hexagon::subreg_hireg;
+ Low = true;
+ break;
+ case 48:
+ Sub = Hexagon::subreg_hireg;
+ Low = false;
+ break;
+ default:
+ return false;
+ }
+
+ RH.Reg = Reg;
+ RH.Sub = Sub;
+ RH.Low = Low;
+ // If the subregister is not valid with the register, set it to 0.
+ if (!HBS::getFinalVRegClass(RH, MRI))
+ RH.Sub = 0;
+
+ return true;
+}
+
+
+// Check if RC matches the pattern of a S2_packhl. If so, return true and
+// set the inputs Rs and Rt.
+bool BitSimplification::matchPackhl(unsigned SelfR,
+ const BitTracker::RegisterCell &RC, BitTracker::RegisterRef &Rs,
+ BitTracker::RegisterRef &Rt) {
+ RegHalf L1, H1, L2, H2;
+
+ if (!matchHalf(SelfR, RC, 0, L2) || !matchHalf(SelfR, RC, 16, L1))
+ return false;
+ if (!matchHalf(SelfR, RC, 32, H2) || !matchHalf(SelfR, RC, 48, H1))
+ return false;
+
+ // Rs = H1.L1, Rt = H2.L2
+ if (H1.Reg != L1.Reg || H1.Sub != L1.Sub || H1.Low || !L1.Low)
+ return false;
+ if (H2.Reg != L2.Reg || H2.Sub != L2.Sub || H2.Low || !L2.Low)
+ return false;
+
+ Rs = H1;
+ Rt = H2;
+ return true;
+}
+
+
+unsigned BitSimplification::getCombineOpcode(bool HLow, bool LLow) {
+ return HLow ? LLow ? Hexagon::A2_combine_ll
+ : Hexagon::A2_combine_lh
+ : LLow ? Hexagon::A2_combine_hl
+ : Hexagon::A2_combine_hh;
+}
+
+
+// If MI stores the upper halfword of a register (potentially obtained via
+// shifts or extracts), replace it with a storerf instruction. This could
+// cause the "extraction" code to become dead.
+bool BitSimplification::genStoreUpperHalf(MachineInstr *MI) {
+ unsigned Opc = MI->getOpcode();
+ if (Opc != Hexagon::S2_storerh_io)
+ return false;
+
+ MachineOperand &ValOp = MI->getOperand(2);
+ BitTracker::RegisterRef RS = ValOp;
+ if (!BT.has(RS.Reg))
+ return false;
+ const BitTracker::RegisterCell &RC = BT.lookup(RS.Reg);
+ RegHalf H;
+ if (!matchHalf(0, RC, 0, H))
+ return false;
+ if (H.Low)
+ return false;
+ MI->setDesc(HII.get(Hexagon::S2_storerf_io));
+ ValOp.setReg(H.Reg);
+ ValOp.setSubReg(H.Sub);
+ return true;
+}
+
+
+// If MI stores a value known at compile-time, and the value is within a range
+// that avoids using constant-extenders, replace it with a store-immediate.
+bool BitSimplification::genStoreImmediate(MachineInstr *MI) {
+ unsigned Opc = MI->getOpcode();
+ unsigned Align = 0;
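+  // The cases below fall through intentionally: Align ends up as the log2 of
+  // the required offset alignment (2 for word, 1 for halfword, 0 for byte).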
+ switch (Opc) {
+ case Hexagon::S2_storeri_io:
+ Align++;
+ case Hexagon::S2_storerh_io:
+ Align++;
+ case Hexagon::S2_storerb_io:
+ break;
+ default:
+ return false;
+ }
+
+ // Avoid stores to frame-indices (due to an unknown offset).
+ if (!MI->getOperand(0).isReg())
+ return false;
+ MachineOperand &OffOp = MI->getOperand(1);
+ if (!OffOp.isImm())
+ return false;
+
+ int64_t Off = OffOp.getImm();
+ // Offset is u6:a. Sadly, there is no isShiftedUInt(n,x).
+ if (!isUIntN(6+Align, Off) || (Off & ((1<<Align)-1)))
+ return false;
+ // Source register:
+ BitTracker::RegisterRef RS = MI->getOperand(2);
+ if (!BT.has(RS.Reg))
+ return false;
+ const BitTracker::RegisterCell &RC = BT.lookup(RS.Reg);
+ uint64_t U;
+ if (!HBS::getConst(RC, 0, RC.width(), U))
+ return false;
+
+ // Only consider 8-bit values to avoid constant-extenders.
+ int V;
+ switch (Opc) {
+ case Hexagon::S2_storerb_io:
+ V = int8_t(U);
+ break;
+ case Hexagon::S2_storerh_io:
+ V = int16_t(U);
+ break;
+ case Hexagon::S2_storeri_io:
+ V = int32_t(U);
+ break;
+ }
+ if (!isInt<8>(V))
+ return false;
+
+ MI->RemoveOperand(2);
+ switch (Opc) {
+ case Hexagon::S2_storerb_io:
+ MI->setDesc(HII.get(Hexagon::S4_storeirb_io));
+ break;
+ case Hexagon::S2_storerh_io:
+ MI->setDesc(HII.get(Hexagon::S4_storeirh_io));
+ break;
+ case Hexagon::S2_storeri_io:
+ MI->setDesc(HII.get(Hexagon::S4_storeiri_io));
+ break;
+ }
+ MI->addOperand(MachineOperand::CreateImm(V));
+ return true;
+}
+
+
+// If MI is equivalent to S2_packhl, generate the S2_packhl. MI could be the
+// last instruction in a sequence that results in something equivalent to
+// the pack-halfwords. The intent is to cause the entire sequence to become
+// dead.
+bool BitSimplification::genPackhl(MachineInstr *MI,
+ BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) {
+ unsigned Opc = MI->getOpcode();
+ if (Opc == Hexagon::S2_packhl)
+ return false;
+ BitTracker::RegisterRef Rs, Rt;
+ if (!matchPackhl(RD.Reg, RC, Rs, Rt))
+ return false;
+
+ MachineBasicBlock &B = *MI->getParent();
+ unsigned NewR = MRI.createVirtualRegister(&Hexagon::DoubleRegsRegClass);
+ DebugLoc DL = MI->getDebugLoc();
+ BuildMI(B, MI, DL, HII.get(Hexagon::S2_packhl), NewR)
+ .addReg(Rs.Reg, 0, Rs.Sub)
+ .addReg(Rt.Reg, 0, Rt.Sub);
+ HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI);
+ BT.put(BitTracker::RegisterRef(NewR), RC);
+ return true;
+}
+
+
+// If MI produces a halfword of the input in the low half of the output,
+// replace it with zero-extend or extractu.
+bool BitSimplification::genExtractHalf(MachineInstr *MI,
+ BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) {
+ RegHalf L;
+ // Check for halfword in low 16 bits, zeros elsewhere.
+ if (!matchHalf(RD.Reg, RC, 0, L) || !HBS::isZero(RC, 16, 16))
+ return false;
+
+ unsigned Opc = MI->getOpcode();
+ MachineBasicBlock &B = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ // Prefer zxth, since zxth can go in any slot, while extractu only in
+ // slots 2 and 3.
+ unsigned NewR = 0;
+ if (L.Low && Opc != Hexagon::A2_zxth) {
+ NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ BuildMI(B, MI, DL, HII.get(Hexagon::A2_zxth), NewR)
+ .addReg(L.Reg, 0, L.Sub);
+ } else if (!L.Low && Opc != Hexagon::S2_extractu) {
+ NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ BuildMI(B, MI, DL, HII.get(Hexagon::S2_extractu), NewR)
+ .addReg(L.Reg, 0, L.Sub)
+ .addImm(16)
+ .addImm(16);
+ }
+ if (NewR == 0)
+ return false;
+ HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI);
+ BT.put(BitTracker::RegisterRef(NewR), RC);
+ return true;
+}
+
+
+// If MI is equivalent to a combine(.L/.H, .L/.H), replace it with the
+// combine.
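+// For example (illustrative registers), if the high halfword of the result
+// is the low halfword of vreg0 and the low halfword of the result is the
+// low halfword of vreg1, the computation can be replaced with
+//   vreg2 = A2_combine_ll vreg0, vreg1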
+bool BitSimplification::genCombineHalf(MachineInstr *MI,
+ BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) {
+ RegHalf L, H;
+ // Check for combine h/l
+ if (!matchHalf(RD.Reg, RC, 0, L) || !matchHalf(RD.Reg, RC, 16, H))
+ return false;
+ // Do nothing if this is just a reg copy.
+ if (L.Reg == H.Reg && L.Sub == H.Sub && !H.Low && L.Low)
+ return false;
+
+ unsigned Opc = MI->getOpcode();
+ unsigned COpc = getCombineOpcode(H.Low, L.Low);
+ if (COpc == Opc)
+ return false;
+
+ MachineBasicBlock &B = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ unsigned NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ BuildMI(B, MI, DL, HII.get(COpc), NewR)
+ .addReg(H.Reg, 0, H.Sub)
+ .addReg(L.Reg, 0, L.Sub);
+ HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI);
+ BT.put(BitTracker::RegisterRef(NewR), RC);
+ return true;
+}
+
+
+// If MI resets high bits of a register and keeps the lower ones, replace it
+// with zero-extend byte/half, and-immediate, or extractu, as appropriate.
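+// The number W of preserved low bits selects the replacement:
+//   W == 8  -> A2_zxtb
+//   W == 16 -> A2_zxth
+//   W < 10  -> A2_andir with mask (1 << W) - 1
+//   else    -> S2_extractu with width W and offset 0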
+bool BitSimplification::genExtractLow(MachineInstr *MI,
+ BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) {
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ case Hexagon::A2_zxtb:
+ case Hexagon::A2_zxth:
+ case Hexagon::S2_extractu:
+ return false;
+ }
+ if (Opc == Hexagon::A2_andir && MI->getOperand(2).isImm()) {
+ int32_t Imm = MI->getOperand(2).getImm();
+ if (isInt<10>(Imm))
+ return false;
+ }
+
+ if (MI->hasUnmodeledSideEffects() || MI->isInlineAsm())
+ return false;
+ unsigned W = RC.width();
+ while (W > 0 && RC[W-1].is(0))
+ W--;
+ if (W == 0 || W == RC.width())
+ return false;
+ unsigned NewOpc = (W == 8) ? Hexagon::A2_zxtb
+ : (W == 16) ? Hexagon::A2_zxth
+ : (W < 10) ? Hexagon::A2_andir
+ : Hexagon::S2_extractu;
+ MachineBasicBlock &B = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ for (auto &Op : MI->uses()) {
+ if (!Op.isReg())
+ continue;
+ BitTracker::RegisterRef RS = Op;
+ if (!BT.has(RS.Reg))
+ continue;
+ const BitTracker::RegisterCell &SC = BT.lookup(RS.Reg);
+ unsigned BN, BW;
+ if (!HBS::getSubregMask(RS, BN, BW, MRI))
+ continue;
+ if (BW < W || !HBS::isEqual(RC, 0, SC, BN, W))
+ continue;
+
+ unsigned NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ auto MIB = BuildMI(B, MI, DL, HII.get(NewOpc), NewR)
+ .addReg(RS.Reg, 0, RS.Sub);
+ if (NewOpc == Hexagon::A2_andir)
+ MIB.addImm((1 << W) - 1);
+ else if (NewOpc == Hexagon::S2_extractu)
+ MIB.addImm(W).addImm(0);
+ HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI);
+ BT.put(BitTracker::RegisterRef(NewR), RC);
+ return true;
+ }
+ return false;
+}
+
+
+// Check for tstbit simplification opportunity, where the bit being checked
+// can be tracked back to another register. For example:
+// vreg2 = S2_lsr_i_r vreg1, 5
+// vreg3 = S2_tstbit_i vreg2, 0
+// =>
+// vreg3 = S2_tstbit_i vreg1, 5
+bool BitSimplification::simplifyTstbit(MachineInstr *MI,
+ BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) {
+ unsigned Opc = MI->getOpcode();
+ if (Opc != Hexagon::S2_tstbit_i)
+ return false;
+
+ unsigned BN = MI->getOperand(2).getImm();
+ BitTracker::RegisterRef RS = MI->getOperand(1);
+ unsigned F, W;
+ DebugLoc DL = MI->getDebugLoc();
+ if (!BT.has(RS.Reg) || !HBS::getSubregMask(RS, F, W, MRI))
+ return false;
+ MachineBasicBlock &B = *MI->getParent();
+
+ const BitTracker::RegisterCell &SC = BT.lookup(RS.Reg);
+ const BitTracker::BitValue &V = SC[F+BN];
+ if (V.Type == BitTracker::BitValue::Ref && V.RefI.Reg != RS.Reg) {
+ const TargetRegisterClass *TC = MRI.getRegClass(V.RefI.Reg);
+ // Need to map V.RefI.Reg to a 32-bit register, i.e. if it is
+ // a double register, need to use a subregister and adjust bit
+ // number.
+ unsigned P = UINT_MAX;
+ BitTracker::RegisterRef RR(V.RefI.Reg, 0);
+ if (TC == &Hexagon::DoubleRegsRegClass) {
+ P = V.RefI.Pos;
+ RR.Sub = Hexagon::subreg_loreg;
+ if (P >= 32) {
+ P -= 32;
+ RR.Sub = Hexagon::subreg_hireg;
+ }
+ } else if (TC == &Hexagon::IntRegsRegClass) {
+ P = V.RefI.Pos;
+ }
+ if (P != UINT_MAX) {
+ unsigned NewR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass);
+ BuildMI(B, MI, DL, HII.get(Hexagon::S2_tstbit_i), NewR)
+ .addReg(RR.Reg, 0, RR.Sub)
+ .addImm(P);
+ HBS::replaceReg(RD.Reg, NewR, MRI);
+ BT.put(NewR, RC);
+ return true;
+ }
+ } else if (V.is(0) || V.is(1)) {
+ unsigned NewR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass);
+ unsigned NewOpc = V.is(0) ? Hexagon::TFR_PdFalse : Hexagon::TFR_PdTrue;
+ BuildMI(B, MI, DL, HII.get(NewOpc), NewR);
+ HBS::replaceReg(RD.Reg, NewR, MRI);
+ return true;
+ }
+
+ return false;
+}
+
+
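+// Visit all instructions in block B. Stores are handled by the store
+// transformations above; other instructions that define exactly one
+// tracked register are dispatched to the generators above based on the
+// register class of the defined register (double, int or predicate).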
+bool BitSimplification::processBlock(MachineBasicBlock &B,
+ const RegisterSet &AVs) {
+ bool Changed = false;
+ RegisterSet AVB = AVs;
+ RegisterSet Defs;
+
+ for (auto I = B.begin(), E = B.end(); I != E; ++I, AVB.insert(Defs)) {
+ MachineInstr *MI = &*I;
+ Defs.clear();
+ HBS::getInstrDefs(*MI, Defs);
+
+ unsigned Opc = MI->getOpcode();
+ if (Opc == TargetOpcode::COPY || Opc == TargetOpcode::REG_SEQUENCE)
+ continue;
+
+ if (MI->mayStore()) {
+ bool T = genStoreUpperHalf(MI);
+ T = T || genStoreImmediate(MI);
+ Changed |= T;
+ continue;
+ }
+
+ if (Defs.count() != 1)
+ continue;
+ const MachineOperand &Op0 = MI->getOperand(0);
+ if (!Op0.isReg() || !Op0.isDef())
+ continue;
+ BitTracker::RegisterRef RD = Op0;
+ if (!BT.has(RD.Reg))
+ continue;
+ const TargetRegisterClass *FRC = HBS::getFinalVRegClass(RD, MRI);
+ const BitTracker::RegisterCell &RC = BT.lookup(RD.Reg);
+
+ if (FRC->getID() == Hexagon::DoubleRegsRegClassID) {
+ bool T = genPackhl(MI, RD, RC);
+ Changed |= T;
+ continue;
+ }
+
+ if (FRC->getID() == Hexagon::IntRegsRegClassID) {
+ bool T = genExtractHalf(MI, RD, RC);
+ T = T || genCombineHalf(MI, RD, RC);
+ T = T || genExtractLow(MI, RD, RC);
+ Changed |= T;
+ continue;
+ }
+
+ if (FRC->getID() == Hexagon::PredRegsRegClassID) {
+ bool T = simplifyTstbit(MI, RD, RC);
+ Changed |= T;
+ continue;
+ }
+ }
+ return Changed;
+}
+
+
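+// Run the individual phases in sequence: dead code elimination, constant
+// generation, redundant instruction elimination, copy generation and copy
+// propagation; then, after re-running the bit tracker, the bit
+// simplification itself, with more dead code elimination at the end.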
+bool HexagonBitSimplify::runOnMachineFunction(MachineFunction &MF) {
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ auto &HRI = *HST.getRegisterInfo();
+ auto &HII = *HST.getInstrInfo();
+
+ MDT = &getAnalysis<MachineDominatorTree>();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ bool Changed;
+
+ Changed = DeadCodeElimination(MF, *MDT).run();
+
+ const HexagonEvaluator HE(HRI, MRI, HII, MF);
+ BitTracker BT(HE, MF);
+ DEBUG(BT.trace(true));
+ BT.run();
+
+ MachineBasicBlock &Entry = MF.front();
+
+ RegisterSet AIG; // Available registers for IG.
+ ConstGeneration ImmG(BT, HII, MRI);
+ Changed |= visitBlock(Entry, ImmG, AIG);
+
+ RegisterSet ARE; // Available registers for RIE.
+ RedundantInstrElimination RIE(BT, HII, MRI);
+ Changed |= visitBlock(Entry, RIE, ARE);
+
+ RegisterSet ACG; // Available registers for CG.
+ CopyGeneration CopyG(BT, HII, MRI);
+ Changed |= visitBlock(Entry, CopyG, ACG);
+
+ RegisterSet ACP; // Available registers for CP.
+ CopyPropagation CopyP(HRI, MRI);
+ Changed |= visitBlock(Entry, CopyP, ACP);
+
+ Changed = DeadCodeElimination(MF, *MDT).run() || Changed;
+
+ BT.run();
+ RegisterSet ABS; // Available registers for BS.
+ BitSimplification BitS(BT, HII, MRI);
+ Changed |= visitBlock(Entry, BitS, ABS);
+
+ Changed = DeadCodeElimination(MF, *MDT).run() || Changed;
+
+ if (Changed) {
+ for (auto &B : MF)
+ for (auto &I : B)
+ I.clearKillInfo();
+ DeadCodeElimination(MF, *MDT).run();
+ }
+ return Changed;
+}
+
+
+// Recognize loops where the code at the end of the loop matches the code
+// before the entry of the loop, and the matching code is such that it can
+// be simplified. This pass relies on the bit simplification above and only
+// prepares code in a way that can be handled by the bit simplification.
+//
+// This is the motivating testcase (and explanation):
+//
+// {
+// loop0(.LBB0_2, r1) // %for.body.preheader
+// r5:4 = memd(r0++#8)
+// }
+// {
+// r3 = lsr(r4, #16)
+// r7:6 = combine(r5, r5)
+// }
+// {
+// r3 = insert(r5, #16, #16)
+// r7:6 = vlsrw(r7:6, #16)
+// }
+// .LBB0_2:
+// {
+// memh(r2+#4) = r5
+// memh(r2+#6) = r6 # R6 is really R5.H
+// }
+// {
+// r2 = add(r2, #8)
+// memh(r2+#0) = r4
+// memh(r2+#2) = r3 # R3 is really R4.H
+// }
+// {
+// r5:4 = memd(r0++#8)
+// }
+// { # "Shuffling" code that sets up R3 and R6
+// r3 = lsr(r4, #16) # so that their halves can be stored in the
+// r7:6 = combine(r5, r5) # next iteration. This could be folded into
+// } # the stores if the code was at the beginning
+// { # of the loop iteration. Since the same code
+// r3 = insert(r5, #16, #16) # precedes the loop, it can actually be moved
+// r7:6 = vlsrw(r7:6, #16) # there.
+// }:endloop0
+//
+//
+// The outcome:
+//
+// {
+// loop0(.LBB0_2, r1)
+// r5:4 = memd(r0++#8)
+// }
+// .LBB0_2:
+// {
+// memh(r2+#4) = r5
+// memh(r2+#6) = r5.h
+// }
+// {
+// r2 = add(r2, #8)
+// memh(r2+#0) = r4
+// memh(r2+#2) = r4.h
+// }
+// {
+// r5:4 = memd(r0++#8)
+// }:endloop0
+
+namespace llvm {
+ FunctionPass *createHexagonLoopRescheduling();
+ void initializeHexagonLoopReschedulingPass(PassRegistry&);
+}
+
+namespace {
+ class HexagonLoopRescheduling : public MachineFunctionPass {
+ public:
+ static char ID;
+ HexagonLoopRescheduling() : MachineFunctionPass(ID),
+ HII(0), HRI(0), MRI(0), BTP(0) {
+ initializeHexagonLoopReschedulingPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ private:
+ const HexagonInstrInfo *HII;
+ const HexagonRegisterInfo *HRI;
+ MachineRegisterInfo *MRI;
+ BitTracker *BTP;
+
+ struct LoopCand {
+ LoopCand(MachineBasicBlock *lb, MachineBasicBlock *pb,
+ MachineBasicBlock *eb) : LB(lb), PB(pb), EB(eb) {}
+ MachineBasicBlock *LB, *PB, *EB;
+ };
+ typedef std::vector<MachineInstr*> InstrList;
+ struct InstrGroup {
+ BitTracker::RegisterRef Inp, Out;
+ InstrList Ins;
+ };
+ struct PhiInfo {
+ PhiInfo(MachineInstr &P, MachineBasicBlock &B);
+ unsigned DefR;
+ BitTracker::RegisterRef LR, PR;
+ MachineBasicBlock *LB, *PB;
+ };
+
+ static unsigned getDefReg(const MachineInstr *MI);
+ bool isConst(unsigned Reg) const;
+ bool isBitShuffle(const MachineInstr *MI, unsigned DefR) const;
+ bool isStoreInput(const MachineInstr *MI, unsigned DefR) const;
+ bool isShuffleOf(unsigned OutR, unsigned InpR) const;
+ bool isSameShuffle(unsigned OutR1, unsigned InpR1, unsigned OutR2,
+ unsigned &InpR2) const;
+ void moveGroup(InstrGroup &G, MachineBasicBlock &LB, MachineBasicBlock &PB,
+ MachineBasicBlock::iterator At, unsigned OldPhiR, unsigned NewPredR);
+ bool processLoop(LoopCand &C);
+ };
+}
+
+char HexagonLoopRescheduling::ID = 0;
+
+INITIALIZE_PASS(HexagonLoopRescheduling, "hexagon-loop-resched",
+ "Hexagon Loop Rescheduling", false, false)
+
+
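+// Collect, for a PHI node P in the loop header B, the register defined by
+// the PHI (DefR), the value coming from the loop itself (LR), and the
+// value coming from outside of the loop (PR), together with the
+// corresponding predecessor blocks (LB and PB).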
+HexagonLoopRescheduling::PhiInfo::PhiInfo(MachineInstr &P,
+ MachineBasicBlock &B) {
+ DefR = HexagonLoopRescheduling::getDefReg(&P);
+ LB = &B;
+ PB = nullptr;
+ for (unsigned i = 1, n = P.getNumOperands(); i < n; i += 2) {
+ const MachineOperand &OpB = P.getOperand(i+1);
+ if (OpB.getMBB() == &B) {
+ LR = P.getOperand(i);
+ continue;
+ }
+ PB = OpB.getMBB();
+ PR = P.getOperand(i);
+ }
+}
+
+
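+// Return the register defined by MI, if MI defines exactly one register;
+// otherwise return 0.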
+unsigned HexagonLoopRescheduling::getDefReg(const MachineInstr *MI) {
+ RegisterSet Defs;
+ HBS::getInstrDefs(*MI, Defs);
+ if (Defs.count() != 1)
+ return 0;
+ return Defs.find_first();
+}
+
+
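+// Check if, according to the bit tracker, every bit of Reg has a known
+// constant value (0 or 1).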
+bool HexagonLoopRescheduling::isConst(unsigned Reg) const {
+ if (!BTP->has(Reg))
+ return false;
+ const BitTracker::RegisterCell &RC = BTP->lookup(Reg);
+ for (unsigned i = 0, w = RC.width(); i < w; ++i) {
+ const BitTracker::BitValue &V = RC[i];
+ if (!V.is(0) && !V.is(1))
+ return false;
+ }
+ return true;
+}
+
+
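+// Check if MI is one of the instruction kinds that this pass treats as
+// "bit shuffling": copies, immediate shifts, inserts, and/or, and the
+// various combines.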
+bool HexagonLoopRescheduling::isBitShuffle(const MachineInstr *MI,
+ unsigned DefR) const {
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ case TargetOpcode::COPY:
+ case Hexagon::S2_lsr_i_r:
+ case Hexagon::S2_asr_i_r:
+ case Hexagon::S2_asl_i_r:
+ case Hexagon::S2_lsr_i_p:
+ case Hexagon::S2_asr_i_p:
+ case Hexagon::S2_asl_i_p:
+ case Hexagon::S2_insert:
+ case Hexagon::A2_or:
+ case Hexagon::A2_orp:
+ case Hexagon::A2_and:
+ case Hexagon::A2_andp:
+ case Hexagon::A2_combinew:
+ case Hexagon::A4_combineri:
+ case Hexagon::A4_combineir:
+ case Hexagon::A2_combineii:
+ case Hexagon::A4_combineii:
+ case Hexagon::A2_combine_ll:
+ case Hexagon::A2_combine_lh:
+ case Hexagon::A2_combine_hl:
+ case Hexagon::A2_combine_hh:
+ return true;
+ }
+ return false;
+}
+
+
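+// Check if register InpR is the value being stored by MI, i.e. if it
+// appears as the last operand of the (store) instruction.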
+bool HexagonLoopRescheduling::isStoreInput(const MachineInstr *MI,
+ unsigned InpR) const {
+ for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) {
+ const MachineOperand &Op = MI->getOperand(i);
+ if (!Op.isReg())
+ continue;
+ if (Op.getReg() == InpR)
+ return i == n-1;
+ }
+ return false;
+}
+
+
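+// Check if every bit of OutR that is traced back to another register
+// comes from InpR, i.e. OutR is a rearrangement of bits of InpR
+// (possibly mixed with constant bits).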
+bool HexagonLoopRescheduling::isShuffleOf(unsigned OutR, unsigned InpR) const {
+ if (!BTP->has(OutR) || !BTP->has(InpR))
+ return false;
+ const BitTracker::RegisterCell &OutC = BTP->lookup(OutR);
+ for (unsigned i = 0, w = OutC.width(); i < w; ++i) {
+ const BitTracker::BitValue &V = OutC[i];
+ if (V.Type != BitTracker::BitValue::Ref)
+ continue;
+ if (V.RefI.Reg != InpR)
+ return false;
+ }
+ return true;
+}
+
+
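+// Check if OutR2 is obtained from some single register by the same bit
+// rearrangement that produces OutR1 from InpR1. If so, return that source
+// register through InpR2.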
+bool HexagonLoopRescheduling::isSameShuffle(unsigned OutR1, unsigned InpR1,
+ unsigned OutR2, unsigned &InpR2) const {
+ if (!BTP->has(OutR1) || !BTP->has(InpR1) || !BTP->has(OutR2))
+ return false;
+ const BitTracker::RegisterCell &OutC1 = BTP->lookup(OutR1);
+ const BitTracker::RegisterCell &OutC2 = BTP->lookup(OutR2);
+ unsigned W = OutC1.width();
+ unsigned MatchR = 0;
+ if (W != OutC2.width())
+ return false;
+ for (unsigned i = 0; i < W; ++i) {
+ const BitTracker::BitValue &V1 = OutC1[i], &V2 = OutC2[i];
+ if (V1.Type != V2.Type || V1.Type == BitTracker::BitValue::One)
+ return false;
+ if (V1.Type != BitTracker::BitValue::Ref)
+ continue;
+ if (V1.RefI.Pos != V2.RefI.Pos)
+ return false;
+ if (V1.RefI.Reg != InpR1)
+ return false;
+ if (V2.RefI.Reg == 0 || V2.RefI.Reg == OutR2)
+ return false;
+ if (!MatchR)
+ MatchR = V2.RefI.Reg;
+ else if (V2.RefI.Reg != MatchR)
+ return false;
+ }
+ InpR2 = MatchR;
+ return true;
+}
+
+
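+// Copy the group G of bit-shuffling instructions to the beginning of the
+// loop block LB (at position At). A new PHI is created to carry the
+// group's input across iterations (NewPredR from the preheader PB, and
+// the group's original input from LB), and the uses of the old PHI result
+// OldPhiR are rewritten to use the copied group's output.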
+void HexagonLoopRescheduling::moveGroup(InstrGroup &G, MachineBasicBlock &LB,
+ MachineBasicBlock &PB, MachineBasicBlock::iterator At, unsigned OldPhiR,
+ unsigned NewPredR) {
+ DenseMap<unsigned,unsigned> RegMap;
+
+ const TargetRegisterClass *PhiRC = MRI->getRegClass(NewPredR);
+ unsigned PhiR = MRI->createVirtualRegister(PhiRC);
+ BuildMI(LB, At, At->getDebugLoc(), HII->get(TargetOpcode::PHI), PhiR)
+ .addReg(NewPredR)
+ .addMBB(&PB)
+ .addReg(G.Inp.Reg)
+ .addMBB(&LB);
+ RegMap.insert(std::make_pair(G.Inp.Reg, PhiR));
+
+ for (unsigned i = G.Ins.size(); i > 0; --i) {
+ const MachineInstr *SI = G.Ins[i-1];
+ unsigned DR = getDefReg(SI);
+ const TargetRegisterClass *RC = MRI->getRegClass(DR);
+ unsigned NewDR = MRI->createVirtualRegister(RC);
+ DebugLoc DL = SI->getDebugLoc();
+
+ auto MIB = BuildMI(LB, At, DL, HII->get(SI->getOpcode()), NewDR);
+ for (unsigned j = 0, m = SI->getNumOperands(); j < m; ++j) {
+ const MachineOperand &Op = SI->getOperand(j);
+ if (!Op.isReg()) {
+ MIB.addOperand(Op);
+ continue;
+ }
+ if (!Op.isUse())
+ continue;
+ unsigned UseR = RegMap[Op.getReg()];
+ MIB.addReg(UseR, 0, Op.getSubReg());
+ }
+ RegMap.insert(std::make_pair(DR, NewDR));
+ }
+
+ HBS::replaceReg(OldPhiR, RegMap[G.Out.Reg], *MRI);
+}
+
+
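+// Process a single loop candidate: find PHI nodes in the loop header whose
+// loop-carried value is produced by bit-shuffling code at the end of the
+// loop body, and, if the value coming from the preheader is produced by an
+// equivalent shuffle (or is a zero constant), move that code to the
+// beginning of the loop so that the bit simplification above can later
+// fold it.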
+bool HexagonLoopRescheduling::processLoop(LoopCand &C) {
+ DEBUG(dbgs() << "Processing loop in BB#" << C.LB->getNumber() << "\n");
+ std::vector<PhiInfo> Phis;
+ for (auto &I : *C.LB) {
+ if (!I.isPHI())
+ break;
+ unsigned PR = getDefReg(&I);
+ if (isConst(PR))
+ continue;
+ bool BadUse = false, GoodUse = false;
+ for (auto UI = MRI->use_begin(PR), UE = MRI->use_end(); UI != UE; ++UI) {
+ MachineInstr *UseI = UI->getParent();
+ if (UseI->getParent() != C.LB) {
+ BadUse = true;
+ break;
+ }
+ if (isBitShuffle(UseI, PR) || isStoreInput(UseI, PR))
+ GoodUse = true;
+ }
+ if (BadUse || !GoodUse)
+ continue;
+
+ Phis.push_back(PhiInfo(I, *C.LB));
+ }
+
+ DEBUG({
+ dbgs() << "Phis: {";
+ for (auto &I : Phis) {
+ dbgs() << ' ' << PrintReg(I.DefR, HRI) << "=phi("
+ << PrintReg(I.PR.Reg, HRI, I.PR.Sub) << ":b" << I.PB->getNumber()
+ << ',' << PrintReg(I.LR.Reg, HRI, I.LR.Sub) << ":b"
+ << I.LB->getNumber() << ')';
+ }
+ dbgs() << " }\n";
+ });
+
+ if (Phis.empty())
+ return false;
+
+ bool Changed = false;
+ InstrList ShufIns;
+
+ // Go backwards in the block: for each bit shuffling instruction, check
+ // if that instruction could potentially be moved to the front of the loop:
+ // the output of the instruction cannot be used in a non-shuffling instruction
+ // in this loop.
+ for (auto I = C.LB->rbegin(), E = C.LB->rend(); I != E; ++I) {
+ if (I->isTerminator())
+ continue;
+ if (I->isPHI())
+ break;
+
+ RegisterSet Defs;
+ HBS::getInstrDefs(*I, Defs);
+ if (Defs.count() != 1)
+ continue;
+ unsigned DefR = Defs.find_first();
+ if (!TargetRegisterInfo::isVirtualRegister(DefR))
+ continue;
+ if (!isBitShuffle(&*I, DefR))
+ continue;
+
+ bool BadUse = false;
+ for (auto UI = MRI->use_begin(DefR), UE = MRI->use_end(); UI != UE; ++UI) {
+ MachineInstr *UseI = UI->getParent();
+ if (UseI->getParent() == C.LB) {
+ if (UseI->isPHI()) {
+ // If the use is in a phi node in this loop, then it should be
+ // the value corresponding to the back edge.
+ unsigned Idx = UI.getOperandNo();
+ if (UseI->getOperand(Idx+1).getMBB() != C.LB)
+ BadUse = true;
+ } else {
+ auto F = std::find(ShufIns.begin(), ShufIns.end(), UseI);
+ if (F == ShufIns.end())
+ BadUse = true;
+ }
+ } else {
+ // There is a use outside of the loop, but there is no epilog block
+ // suitable for a copy-out.
+ if (C.EB == nullptr)
+ BadUse = true;
+ }
+ if (BadUse)
+ break;
+ }
+
+ if (BadUse)
+ continue;
+ ShufIns.push_back(&*I);
+ }
+
+ // Partition the list of shuffling instructions into instruction groups,
+ // where each group has to be moved as a whole (i.e. a group is a chain of
+ // dependent instructions). A group produces a single live output register,
+ // which is meant to be the input of the loop phi node (although this is
+ // not checked here yet). It also uses a single register as its input,
+ // which is some value produced in the loop body. After moving the group
+ // to the beginning of the loop, that input register would need to be
+ // the loop-carried register (through a phi node) instead of the (currently
+ // loop-carried) output register.
+ typedef std::vector<InstrGroup> InstrGroupList;
+ InstrGroupList Groups;
+
+ for (unsigned i = 0, n = ShufIns.size(); i < n; ++i) {
+ MachineInstr *SI = ShufIns[i];
+ if (SI == nullptr)
+ continue;
+
+ InstrGroup G;
+ G.Ins.push_back(SI);
+ G.Out.Reg = getDefReg(SI);
+ RegisterSet Inputs;
+ HBS::getInstrUses(*SI, Inputs);
+
+ for (unsigned j = i+1; j < n; ++j) {
+ MachineInstr *MI = ShufIns[j];
+ if (MI == nullptr)
+ continue;
+ RegisterSet Defs;
+ HBS::getInstrDefs(*MI, Defs);
+ // If this instruction does not define any pending inputs, skip it.
+ if (!Defs.intersects(Inputs))
+ continue;
+ // Otherwise, add it to the current group and remove the inputs that
+ // are defined by MI.
+ G.Ins.push_back(MI);
+ Inputs.remove(Defs);
+ // Then add all registers used by MI.
+ HBS::getInstrUses(*MI, Inputs);
+ ShufIns[j] = nullptr;
+ }
+
+ // Only add a group if it requires at most one register.
+ if (Inputs.count() > 1)
+ continue;
+ auto LoopInpEq = [G] (const PhiInfo &P) -> bool {
+ return G.Out.Reg == P.LR.Reg;
+ };
+ if (std::find_if(Phis.begin(), Phis.end(), LoopInpEq) == Phis.end())
+ continue;
+
+ G.Inp.Reg = Inputs.find_first();
+ Groups.push_back(G);
+ }
+
+ DEBUG({
+ for (unsigned i = 0, n = Groups.size(); i < n; ++i) {
+ InstrGroup &G = Groups[i];
+ dbgs() << "Group[" << i << "] inp: "
+ << PrintReg(G.Inp.Reg, HRI, G.Inp.Sub)
+ << " out: " << PrintReg(G.Out.Reg, HRI, G.Out.Sub) << "\n";
+ for (unsigned j = 0, m = G.Ins.size(); j < m; ++j)
+ dbgs() << " " << *G.Ins[j];
+ }
+ });
+
+ for (unsigned i = 0, n = Groups.size(); i < n; ++i) {
+ InstrGroup &G = Groups[i];
+ if (!isShuffleOf(G.Out.Reg, G.Inp.Reg))
+ continue;
+ auto LoopInpEq = [G] (const PhiInfo &P) -> bool {
+ return G.Out.Reg == P.LR.Reg;
+ };
+ auto F = std::find_if(Phis.begin(), Phis.end(), LoopInpEq);
+ if (F == Phis.end())
+ continue;
+ unsigned PredR = 0;
+ if (!isSameShuffle(G.Out.Reg, G.Inp.Reg, F->PR.Reg, PredR)) {
+ const MachineInstr *DefPredR = MRI->getVRegDef(F->PR.Reg);
+ unsigned Opc = DefPredR->getOpcode();
+ if (Opc != Hexagon::A2_tfrsi && Opc != Hexagon::A2_tfrpi)
+ continue;
+ if (!DefPredR->getOperand(1).isImm())
+ continue;
+ if (DefPredR->getOperand(1).getImm() != 0)
+ continue;
+ const TargetRegisterClass *RC = MRI->getRegClass(G.Inp.Reg);
+ if (RC != MRI->getRegClass(F->PR.Reg)) {
+ PredR = MRI->createVirtualRegister(RC);
+ unsigned TfrI = (RC == &Hexagon::IntRegsRegClass) ? Hexagon::A2_tfrsi
+ : Hexagon::A2_tfrpi;
+ auto T = C.PB->getFirstTerminator();
+ DebugLoc DL = (T != C.PB->end()) ? T->getDebugLoc() : DebugLoc();
+ BuildMI(*C.PB, T, DL, HII->get(TfrI), PredR)
+ .addImm(0);
+ } else {
+ PredR = F->PR.Reg;
+ }
+ }
+ assert(MRI->getRegClass(PredR) == MRI->getRegClass(G.Inp.Reg));
+ moveGroup(G, *F->LB, *F->PB, F->LB->getFirstNonPHI(), F->DefR, PredR);
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+
+bool HexagonLoopRescheduling::runOnMachineFunction(MachineFunction &MF) {
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ HII = HST.getInstrInfo();
+ HRI = HST.getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ const HexagonEvaluator HE(*HRI, *MRI, *HII, MF);
+ BitTracker BT(HE, MF);
+ DEBUG(BT.trace(true));
+ BT.run();
+ BTP = &BT;
+
+ std::vector<LoopCand> Cand;
+
+ for (auto &B : MF) {
+ if (B.pred_size() != 2 || B.succ_size() != 2)
+ continue;
+ MachineBasicBlock *PB = nullptr;
+ bool IsLoop = false;
+ for (auto PI = B.pred_begin(), PE = B.pred_end(); PI != PE; ++PI) {
+ if (*PI != &B)
+ PB = *PI;
+ else
+ IsLoop = true;
+ }
+ if (!IsLoop)
+ continue;
+
+ MachineBasicBlock *EB = nullptr;
+ for (auto SI = B.succ_begin(), SE = B.succ_end(); SI != SE; ++SI) {
+ if (*SI == &B)
+ continue;
+ // Set EB to the epilog block, if it has only 1 predecessor (i.e. the
+ // edge from B to EB is non-critical).
+ if ((*SI)->pred_size() == 1)
+ EB = *SI;
+ break;
+ }
+
+ Cand.push_back(LoopCand(&B, PB, EB));
+ }
+
+ bool Changed = false;
+ for (auto &C : Cand)
+ Changed |= processLoop(C);
+
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createHexagonLoopRescheduling() {
+ return new HexagonLoopRescheduling();
+}
+
+FunctionPass *llvm::createHexagonBitSimplify() {
+ return new HexagonBitSimplify();
+}
+
diff --git a/lib/Target/Hexagon/HexagonBitTracker.cpp b/lib/Target/Hexagon/HexagonBitTracker.cpp
index 021e58a1d08a..d5848dc45a3b 100644
--- a/lib/Target/Hexagon/HexagonBitTracker.cpp
+++ b/lib/Target/Hexagon/HexagonBitTracker.cpp
@@ -84,6 +84,8 @@ BT::BitMask HexagonEvaluator::mask(unsigned Reg, unsigned Sub) const {
uint16_t RW = getRegBitWidth(RegisterRef(Reg, Sub));
switch (ID) {
case DoubleRegsRegClassID:
+ case VecDblRegsRegClassID:
+ case VecDblRegs128BRegClassID:
return (Sub == subreg_loreg) ? BT::BitMask(0, RW-1)
: BT::BitMask(RW, 2*RW-1);
default:
@@ -95,30 +97,29 @@ BT::BitMask HexagonEvaluator::mask(unsigned Reg, unsigned Sub) const {
llvm_unreachable("Unexpected register/subregister");
}
-
namespace {
- struct RegisterRefs : public std::vector<BT::RegisterRef> {
- typedef std::vector<BT::RegisterRef> Base;
- RegisterRefs(const MachineInstr *MI);
- const BT::RegisterRef &operator[](unsigned n) const {
- // The main purpose of this operator is to assert with bad argument.
- assert(n < size());
- return Base::operator[](n);
- }
- };
+class RegisterRefs {
+ std::vector<BT::RegisterRef> Vector;
- RegisterRefs::RegisterRefs(const MachineInstr *MI)
- : Base(MI->getNumOperands()) {
- for (unsigned i = 0, n = size(); i < n; ++i) {
+public:
+ RegisterRefs(const MachineInstr *MI) : Vector(MI->getNumOperands()) {
+ for (unsigned i = 0, n = Vector.size(); i < n; ++i) {
const MachineOperand &MO = MI->getOperand(i);
if (MO.isReg())
- at(i) = BT::RegisterRef(MO);
+ Vector[i] = BT::RegisterRef(MO);
// For indices that don't correspond to registers, the entry will
// remain constructed via the default constructor.
}
}
-}
+ size_t size() const { return Vector.size(); }
+ const BT::RegisterRef &operator[](unsigned n) const {
+ // The main purpose of this operator is to assert with bad argument.
+ assert(n < Vector.size());
+ return Vector[n];
+ }
+};
+}
bool HexagonEvaluator::evaluate(const MachineInstr *MI,
const CellMapType &Inputs, CellMapType &Outputs) const {
@@ -189,7 +190,7 @@ bool HexagonEvaluator::evaluate(const MachineInstr *MI,
return true;
};
// Get the cell corresponding to the N-th operand.
- auto cop = [this,Reg,MI,Inputs] (unsigned N, uint16_t W)
+ auto cop = [this,&Reg,&MI,&Inputs] (unsigned N, uint16_t W)
-> BT::RegisterCell {
const MachineOperand &Op = MI->getOperand(N);
if (Op.isImm())
diff --git a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
index 3753b745657b..efafdd007289 100644
--- a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
+++ b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
@@ -102,7 +102,7 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
// Loop over all of the basic blocks.
for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
MBBb != MBBe; ++MBBb) {
- MachineBasicBlock* MBB = MBBb;
+ MachineBasicBlock *MBB = &*MBBb;
// Traverse the basic block.
MachineBasicBlock::iterator MII = MBB->getFirstTerminator();
@@ -186,13 +186,11 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
if (case1 || case2) {
InvertAndChangeJumpTarget(MI, UncondTarget);
- MBB->removeSuccessor(JumpAroundTarget);
- MBB->addSuccessor(UncondTarget);
+ MBB->replaceSuccessor(JumpAroundTarget, UncondTarget);
// Remove the unconditional branch in LayoutSucc.
LayoutSucc->erase(LayoutSucc->begin());
- LayoutSucc->removeSuccessor(UncondTarget);
- LayoutSucc->addSuccessor(JumpAroundTarget);
+ LayoutSucc->replaceSuccessor(UncondTarget, JumpAroundTarget);
// This code performs the conversion for case 2, which moves
// the block to the fall-thru case (BB3 in the code above).
@@ -210,16 +208,15 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
// The live-in to LayoutSucc is now all values live-in to
// JumpAroundTarget.
//
- std::vector<unsigned> OrigLiveIn(LayoutSucc->livein_begin(),
- LayoutSucc->livein_end());
- std::vector<unsigned> NewLiveIn(JumpAroundTarget->livein_begin(),
- JumpAroundTarget->livein_end());
- for (unsigned i = 0; i < OrigLiveIn.size(); ++i) {
- LayoutSucc->removeLiveIn(OrigLiveIn[i]);
- }
- for (unsigned i = 0; i < NewLiveIn.size(); ++i) {
- LayoutSucc->addLiveIn(NewLiveIn[i]);
- }
+ std::vector<MachineBasicBlock::RegisterMaskPair> OrigLiveIn(
+ LayoutSucc->livein_begin(), LayoutSucc->livein_end());
+ std::vector<MachineBasicBlock::RegisterMaskPair> NewLiveIn(
+ JumpAroundTarget->livein_begin(),
+ JumpAroundTarget->livein_end());
+ for (const auto &OrigLI : OrigLiveIn)
+ LayoutSucc->removeLiveIn(OrigLI.PhysReg);
+ for (const auto &NewLI : NewLiveIn)
+ LayoutSucc->addLiveIn(NewLI);
}
}
}
diff --git a/lib/Target/Hexagon/HexagonCommonGEP.cpp b/lib/Target/Hexagon/HexagonCommonGEP.cpp
index 9f5fac156527..931db6687bf8 100644
--- a/lib/Target/Hexagon/HexagonCommonGEP.cpp
+++ b/lib/Target/Hexagon/HexagonCommonGEP.cpp
@@ -59,30 +59,23 @@ namespace {
// Numbering map for gep nodes. Used to keep track of ordering for
// gep nodes.
- struct NodeNumbering : public std::map<const GepNode*,unsigned> {
- };
-
- struct NodeOrdering : public NodeNumbering {
+ struct NodeOrdering {
NodeOrdering() : LastNum(0) {}
-#ifdef _MSC_VER
- void special_insert_for_special_msvc(const GepNode *N)
-#else
- using NodeNumbering::insert;
- void insert(const GepNode* N)
-#endif
- {
- insert(std::make_pair(N, ++LastNum));
- }
- bool operator() (const GepNode* N1, const GepNode *N2) const {
- const_iterator F1 = find(N1), F2 = find(N2);
- assert(F1 != end() && F2 != end());
+
+ void insert(const GepNode *N) { Map.insert(std::make_pair(N, ++LastNum)); }
+ void clear() { Map.clear(); }
+
+ bool operator()(const GepNode *N1, const GepNode *N2) const {
+ auto F1 = Map.find(N1), F2 = Map.find(N2);
+ assert(F1 != Map.end() && F2 != Map.end());
return F1->second < F2->second;
}
+
private:
+ std::map<const GepNode *, unsigned> Map;
unsigned LastNum;
};
-
class HexagonCommonGEP : public FunctionPass {
public:
static char ID;
@@ -360,11 +353,7 @@ void HexagonCommonGEP::processGepInst(GetElementPtrInst *GepI,
Us.insert(&UI.getUse());
}
Nodes.push_back(N);
-#ifdef _MSC_VER
- NodeOrder.special_insert_for_special_msvc(N);
-#else
NodeOrder.insert(N);
-#endif
// Skip the first index operand, since we only handle 0. This dereferences
// the pointer operand.
@@ -379,11 +368,7 @@ void HexagonCommonGEP::processGepInst(GetElementPtrInst *GepI,
Nx->PTy = PtrTy;
Nx->Idx = Op;
Nodes.push_back(Nx);
-#ifdef _MSC_VER
- NodeOrder.special_insert_for_special_msvc(Nx);
-#else
NodeOrder.insert(Nx);
-#endif
PN = Nx;
PtrTy = next_type(PtrTy, Op);
@@ -404,7 +389,7 @@ void HexagonCommonGEP::processGepInst(GetElementPtrInst *GepI,
void HexagonCommonGEP::collect() {
// Establish depth-first traversal order of the dominator tree.
ValueVect BO;
- getBlockTraversalOrder(Fn->begin(), BO);
+ getBlockTraversalOrder(&Fn->front(), BO);
// The creation of gep nodes requires DT-traversal. When processing a GEP
// instruction that uses another GEP instruction as the base pointer, the
@@ -737,7 +722,7 @@ namespace {
Instruction *In = cast<Instruction>(V);
if (In->getParent() != B)
continue;
- BasicBlock::iterator It = In;
+ BasicBlock::iterator It = In->getIterator();
if (std::distance(FirstUse, BEnd) < std::distance(It, BEnd))
FirstUse = It;
}
@@ -1135,7 +1120,7 @@ Value *HexagonCommonGEP::fabricateGEP(NodeVect &NA, BasicBlock::iterator At,
ArrayRef<Value*> A(IdxList, IdxC);
Type *InpTy = Input->getType();
Type *ElTy = cast<PointerType>(InpTy->getScalarType())->getElementType();
- NewInst = GetElementPtrInst::Create(ElTy, Input, A, "cgep", At);
+ NewInst = GetElementPtrInst::Create(ElTy, Input, A, "cgep", &*At);
DEBUG(dbgs() << "new GEP: " << *NewInst << '\n');
Input = NewInst;
} while (nax <= Num);
@@ -1213,7 +1198,7 @@ void HexagonCommonGEP::materialize(NodeToValueMap &Loc) {
Last = Child;
} while (true);
- BasicBlock::iterator InsertAt = LastB->getTerminator();
+ BasicBlock::iterator InsertAt = LastB->getTerminator()->getIterator();
if (LastUsed || LastCN > 0) {
ValueVect Urs;
getAllUsersForNode(Root, Urs, NCM);
diff --git a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
new file mode 100644
index 000000000000..ee0c318ffb5d
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
@@ -0,0 +1,1063 @@
+//===--- HexagonEarlyIfConv.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a Hexagon-specific if-conversion pass that runs on the
+// SSA form.
+// In SSA it is not straightforward to represent instructions that
+// conditionally define registers, since a conditionally-defined register
+// may only be used under the same condition on which the definition was
+// based. To avoid complications of this nature, this pass will only
+// generate predicated stores, and speculate other instructions from the
+// "if-converted" block.
+// The code will recognize CFG patterns where a block with a conditional
+// branch "splits" into a "true block" and a "false block". Either of these
+// could be omitted (in case of a triangle, for example).
+// If after conversion of the side block(s) the CFG allows it, the resulting
+// blocks may be merged. If the "join" block contained PHI nodes, they
+// will be replaced with MUX (or MUX-like) instructions to maintain the
+// semantics of the PHI.
+//
+// Example:
+//
+// %vreg40<def> = L2_loadrub_io %vreg39<kill>, 1
+// %vreg41<def> = S2_tstbit_i %vreg40<kill>, 0
+// J2_jumpt %vreg41<kill>, <BB#5>, %PC<imp-def,dead>
+// J2_jump <BB#4>, %PC<imp-def,dead>
+// Successors according to CFG: BB#4(62) BB#5(62)
+//
+// BB#4: derived from LLVM BB %if.then
+// Predecessors according to CFG: BB#3
+// %vreg11<def> = A2_addp %vreg6, %vreg10
+// S2_storerd_io %vreg32, 16, %vreg11
+// Successors according to CFG: BB#5
+//
+// BB#5: derived from LLVM BB %if.end
+// Predecessors according to CFG: BB#3 BB#4
+// %vreg12<def> = PHI %vreg6, <BB#3>, %vreg11, <BB#4>
+// %vreg13<def> = A2_addp %vreg7, %vreg12
+// %vreg42<def> = C2_cmpeqi %vreg9, 10
+// J2_jumpf %vreg42<kill>, <BB#3>, %PC<imp-def,dead>
+// J2_jump <BB#6>, %PC<imp-def,dead>
+// Successors according to CFG: BB#6(4) BB#3(124)
+//
+// would become:
+//
+// %vreg40<def> = L2_loadrub_io %vreg39<kill>, 1
+// %vreg41<def> = S2_tstbit_i %vreg40<kill>, 0
+// spec-> %vreg11<def> = A2_addp %vreg6, %vreg10
+// pred-> S2_pstorerdf_io %vreg41, %vreg32, 16, %vreg11
+// %vreg46<def> = MUX64_rr %vreg41, %vreg6, %vreg11
+// %vreg13<def> = A2_addp %vreg7, %vreg46
+// %vreg42<def> = C2_cmpeqi %vreg9, 10
+// J2_jumpf %vreg42<kill>, <BB#3>, %PC<imp-def,dead>
+// J2_jump <BB#6>, %PC<imp-def,dead>
+// Successors according to CFG: BB#6 BB#3
+
+#define DEBUG_TYPE "hexagon-eif"
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "HexagonTargetMachine.h"
+
+#include <functional>
+#include <set>
+#include <vector>
+
+using namespace llvm;
+
+namespace llvm {
+ FunctionPass *createHexagonEarlyIfConversion();
+ void initializeHexagonEarlyIfConversionPass(PassRegistry& Registry);
+}
+
+namespace {
+ cl::opt<bool> EnableHexagonBP("enable-hexagon-br-prob", cl::Hidden,
+ cl::init(false), cl::desc("Enable branch probability info"));
+ cl::opt<unsigned> SizeLimit("eif-limit", cl::init(6), cl::Hidden,
+ cl::desc("Size limit in Hexagon early if-conversion"));
+
+ struct PrintMB {
+ PrintMB(const MachineBasicBlock *B) : MB(B) {}
+ const MachineBasicBlock *MB;
+ };
+ raw_ostream &operator<< (raw_ostream &OS, const PrintMB &P) {
+ if (!P.MB)
+ return OS << "<none>";
+ return OS << '#' << P.MB->getNumber();
+ }
+
+ struct FlowPattern {
+ FlowPattern() : SplitB(0), TrueB(0), FalseB(0), JoinB(0), PredR(0) {}
+ FlowPattern(MachineBasicBlock *B, unsigned PR, MachineBasicBlock *TB,
+ MachineBasicBlock *FB, MachineBasicBlock *JB)
+ : SplitB(B), TrueB(TB), FalseB(FB), JoinB(JB), PredR(PR) {}
+
+ MachineBasicBlock *SplitB;
+ MachineBasicBlock *TrueB, *FalseB, *JoinB;
+ unsigned PredR;
+ };
+ struct PrintFP {
+ PrintFP(const FlowPattern &P, const TargetRegisterInfo &T)
+ : FP(P), TRI(T) {}
+ const FlowPattern &FP;
+ const TargetRegisterInfo &TRI;
+ friend raw_ostream &operator<< (raw_ostream &OS, const PrintFP &P);
+ };
+ raw_ostream &operator<<(raw_ostream &OS,
+ const PrintFP &P) LLVM_ATTRIBUTE_UNUSED;
+ raw_ostream &operator<<(raw_ostream &OS, const PrintFP &P) {
+ OS << "{ SplitB:" << PrintMB(P.FP.SplitB)
+ << ", PredR:" << PrintReg(P.FP.PredR, &P.TRI)
+ << ", TrueB:" << PrintMB(P.FP.TrueB) << ", FalseB:"
+ << PrintMB(P.FP.FalseB)
+ << ", JoinB:" << PrintMB(P.FP.JoinB) << " }";
+ return OS;
+ }
+
+ class HexagonEarlyIfConversion : public MachineFunctionPass {
+ public:
+ static char ID;
+ HexagonEarlyIfConversion() : MachineFunctionPass(ID),
+ TII(0), TRI(0), MFN(0), MRI(0), MDT(0), MLI(0) {
+ initializeHexagonEarlyIfConversionPass(*PassRegistry::getPassRegistry());
+ }
+ const char *getPassName() const override {
+ return "Hexagon early if conversion";
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ private:
+ typedef DenseSet<MachineBasicBlock*> BlockSetType;
+
+ bool isPreheader(const MachineBasicBlock *B) const;
+ bool matchFlowPattern(MachineBasicBlock *B, MachineLoop *L,
+ FlowPattern &FP);
+ bool visitBlock(MachineBasicBlock *B, MachineLoop *L);
+ bool visitLoop(MachineLoop *L);
+
+ bool hasEHLabel(const MachineBasicBlock *B) const;
+ bool hasUncondBranch(const MachineBasicBlock *B) const;
+ bool isValidCandidate(const MachineBasicBlock *B) const;
+ bool usesUndefVReg(const MachineInstr *MI) const;
+ bool isValid(const FlowPattern &FP) const;
+ unsigned countPredicateDefs(const MachineBasicBlock *B) const;
+ unsigned computePhiCost(MachineBasicBlock *B) const;
+ bool isProfitable(const FlowPattern &FP) const;
+ bool isPredicableStore(const MachineInstr *MI) const;
+ bool isSafeToSpeculate(const MachineInstr *MI) const;
+
+ unsigned getCondStoreOpcode(unsigned Opc, bool IfTrue) const;
+ void predicateInstr(MachineBasicBlock *ToB, MachineBasicBlock::iterator At,
+ MachineInstr *MI, unsigned PredR, bool IfTrue);
+ void predicateBlockNB(MachineBasicBlock *ToB,
+ MachineBasicBlock::iterator At, MachineBasicBlock *FromB,
+ unsigned PredR, bool IfTrue);
+
+ void updatePhiNodes(MachineBasicBlock *WhereB, const FlowPattern &FP);
+ void convert(const FlowPattern &FP);
+
+ void removeBlock(MachineBasicBlock *B);
+ void eliminatePhis(MachineBasicBlock *B);
+ void replacePhiEdges(MachineBasicBlock *OldB, MachineBasicBlock *NewB);
+ void mergeBlocks(MachineBasicBlock *PredB, MachineBasicBlock *SuccB);
+ void simplifyFlowGraph(const FlowPattern &FP);
+
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MachineFunction *MFN;
+ MachineRegisterInfo *MRI;
+ MachineDominatorTree *MDT;
+ MachineLoopInfo *MLI;
+ BlockSetType Deleted;
+ const MachineBranchProbabilityInfo *MBPI;
+ };
+
+ char HexagonEarlyIfConversion::ID = 0;
+}
+
+INITIALIZE_PASS(HexagonEarlyIfConversion, "hexagon-eif",
+ "Hexagon early if conversion", false, false)
+
+bool HexagonEarlyIfConversion::isPreheader(const MachineBasicBlock *B) const {
+ if (B->succ_size() != 1)
+ return false;
+ MachineBasicBlock *SB = *B->succ_begin();
+ MachineLoop *L = MLI->getLoopFor(SB);
+ return L && SB == L->getHeader();
+}
+
+
+bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B,
+ MachineLoop *L, FlowPattern &FP) {
+ DEBUG(dbgs() << "Checking flow pattern at BB#" << B->getNumber() << "\n");
+
+ // Interested only in conditional branches, no .new, no new-value, etc.
+ // Check the terminators directly; it's easier than handling all responses
+ // from AnalyzeBranch.
+ MachineBasicBlock *TB = 0, *FB = 0;
+ MachineBasicBlock::const_iterator T1I = B->getFirstTerminator();
+ if (T1I == B->end())
+ return false;
+ unsigned Opc = T1I->getOpcode();
+ if (Opc != Hexagon::J2_jumpt && Opc != Hexagon::J2_jumpf)
+ return false;
+ unsigned PredR = T1I->getOperand(0).getReg();
+
+ // Get the layout successor, or 0 if B does not have one.
+ MachineFunction::iterator NextBI = std::next(MachineFunction::iterator(B));
+ MachineBasicBlock *NextB = (NextBI != MFN->end()) ? &*NextBI : 0;
+
+ MachineBasicBlock *T1B = T1I->getOperand(1).getMBB();
+ MachineBasicBlock::const_iterator T2I = std::next(T1I);
+ // The second terminator should be an unconditional branch.
+ assert(T2I == B->end() || T2I->getOpcode() == Hexagon::J2_jump);
+ MachineBasicBlock *T2B = (T2I == B->end()) ? NextB
+ : T2I->getOperand(0).getMBB();
+ if (T1B == T2B) {
+ // XXX merge if T1B == NextB, or convert branch to unconditional.
+ // mark as diamond with both sides equal?
+ return false;
+ }
+ // Loop could be null for both.
+ if (MLI->getLoopFor(T1B) != L || MLI->getLoopFor(T2B) != L)
+ return false;
+
+ // Record the true/false blocks in such a way that "true" means "if (PredR)",
+ // and "false" means "if (!PredR)".
+ if (Opc == Hexagon::J2_jumpt)
+ TB = T1B, FB = T2B;
+ else
+ TB = T2B, FB = T1B;
+
+ if (!MDT->properlyDominates(B, TB) || !MDT->properlyDominates(B, FB))
+ return false;
+
+ // Detect triangle first. In case of a triangle, one of the blocks TB/FB
+ // can fall through into the other; in other words, it will be executed
+ // in both cases. We only want to predicate the block that is executed
+ // conditionally.
+ unsigned TNP = TB->pred_size(), FNP = FB->pred_size();
+ unsigned TNS = TB->succ_size(), FNS = FB->succ_size();
+
+ // A block is predicable if it has one predecessor (it must be B), and
+ // it has a single successor. In fact, the block has to end either with
+ // an unconditional branch (which can be predicated), or with a fall-
+ // through.
+ bool TOk = (TNP == 1) && (TNS == 1);
+ bool FOk = (FNP == 1) && (FNS == 1);
+
+ // If neither is predicable, there is nothing interesting.
+ if (!TOk && !FOk)
+ return false;
+
+ MachineBasicBlock *TSB = (TNS > 0) ? *TB->succ_begin() : 0;
+ MachineBasicBlock *FSB = (FNS > 0) ? *FB->succ_begin() : 0;
+ MachineBasicBlock *JB = 0;
+
+ if (TOk) {
+ if (FOk) {
+ if (TSB == FSB)
+ JB = TSB;
+ // Diamond: "if (P) then TB; else FB;".
+ } else {
+ // TOk && !FOk
+ if (TSB == FB) {
+ JB = FB;
+ FB = 0;
+ }
+ }
+ } else {
+ // !TOk && FOk (at least one must be true by now).
+ if (FSB == TB) {
+ JB = TB;
+ TB = 0;
+ }
+ }
+ // Don't try to predicate loop preheaders.
+ if ((TB && isPreheader(TB)) || (FB && isPreheader(FB))) {
+ DEBUG(dbgs() << "One of blocks " << PrintMB(TB) << ", " << PrintMB(FB)
+ << " is a loop preheader. Skipping.\n");
+ return false;
+ }
+
+ FP = FlowPattern(B, PredR, TB, FB, JB);
+ DEBUG(dbgs() << "Detected " << PrintFP(FP, *TRI) << "\n");
+ return true;
+}
+
+
+// KLUDGE: HexagonInstrInfo::AnalyzeBranch won't work on a block that
+// contains EH_LABEL.
+bool HexagonEarlyIfConversion::hasEHLabel(const MachineBasicBlock *B) const {
+ for (auto &I : *B)
+ if (I.isEHLabel())
+ return true;
+ return false;
+}
+
+
+// KLUDGE: HexagonInstrInfo::AnalyzeBranch may be unable to recognize
+// that a block can never fall-through.
+bool HexagonEarlyIfConversion::hasUncondBranch(const MachineBasicBlock *B)
+ const {
+ MachineBasicBlock::const_iterator I = B->getFirstTerminator(), E = B->end();
+ while (I != E) {
+ if (I->isBarrier())
+ return true;
+ ++I;
+ }
+ return false;
+}
+
+
+bool HexagonEarlyIfConversion::isValidCandidate(const MachineBasicBlock *B)
+ const {
+ if (!B)
+ return true;
+ if (B->isEHPad() || B->hasAddressTaken())
+ return false;
+ if (B->succ_size() == 0)
+ return false;
+
+ for (auto &MI : *B) {
+ if (MI.isDebugValue())
+ continue;
+ if (MI.isConditionalBranch())
+ return false;
+ unsigned Opc = MI.getOpcode();
+ bool IsJMP = (Opc == Hexagon::J2_jump);
+ if (!isPredicableStore(&MI) && !IsJMP && !isSafeToSpeculate(&MI))
+ return false;
+ // Look for predicate registers defined by this instruction. It's ok
+ // to speculate such an instruction, but the predicate register cannot
+ // be used outside of this block (or else it won't be possible to
+ // update the use of it after predication). PHI uses will be updated
+ // to use a result of a MUX, and a MUX cannot be created for predicate
+ // registers.
+ for (ConstMIOperands MO(&MI); MO.isValid(); ++MO) {
+ if (!MO->isReg() || !MO->isDef())
+ continue;
+ unsigned R = MO->getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(R))
+ continue;
+ if (MRI->getRegClass(R) != &Hexagon::PredRegsRegClass)
+ continue;
+ for (auto U = MRI->use_begin(R); U != MRI->use_end(); ++U)
+ if (U->getParent()->isPHI())
+ return false;
+ }
+ }
+ return true;
+}
+
+
+bool HexagonEarlyIfConversion::usesUndefVReg(const MachineInstr *MI) const {
+ for (ConstMIOperands MO(MI); MO.isValid(); ++MO) {
+ if (!MO->isReg() || !MO->isUse())
+ continue;
+ unsigned R = MO->getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(R))
+ continue;
+ const MachineInstr *DefI = MRI->getVRegDef(R);
+ // "Undefined" virtual registers are actually defined via IMPLICIT_DEF.
+ assert(DefI && "Expecting a reaching def in MRI");
+ if (DefI->isImplicitDef())
+ return true;
+ }
+ return false;
+}
+
+
+bool HexagonEarlyIfConversion::isValid(const FlowPattern &FP) const {
+ if (hasEHLabel(FP.SplitB)) // KLUDGE: see function definition
+ return false;
+ if (FP.TrueB && !isValidCandidate(FP.TrueB))
+ return false;
+ if (FP.FalseB && !isValidCandidate(FP.FalseB))
+ return false;
+ // Check the PHIs in the join block. If any of them use a register
+ // that is defined as IMPLICIT_DEF, do not convert this. This can
+ // legitimately happen if one side of the split never executes, but
+ // the compiler is unable to prove it. That side may then seem to
+ // provide an "undef" value to the join block, however it will never
+ // execute at run-time. If we convert this case, the "undef" will
+ // be used in a MUX instruction, and that may seem like actually
+ // using an undefined value to other optimizations. This could lead
+ // to trouble further down the optimization stream, cause assertions
+ // to fail, etc.
+ if (FP.JoinB) {
+ const MachineBasicBlock &B = *FP.JoinB;
+ for (auto &MI : B) {
+ if (!MI.isPHI())
+ break;
+ if (usesUndefVReg(&MI))
+ return false;
+ unsigned DefR = MI.getOperand(0).getReg();
+ const TargetRegisterClass *RC = MRI->getRegClass(DefR);
+ if (RC == &Hexagon::PredRegsRegClass)
+ return false;
+ }
+ }
+ return true;
+}
+
+
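+// Estimate the number of MUX instructions that would be needed to replace
+// the PHI nodes in block B: a PHI whose two incoming values are produced
+// by predicable instructions (and use no subregisters) is assumed not to
+// need one.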
+unsigned HexagonEarlyIfConversion::computePhiCost(MachineBasicBlock *B) const {
+ assert(B->pred_size() <= 2);
+ if (B->pred_size() < 2)
+ return 0;
+
+ unsigned Cost = 0;
+ MachineBasicBlock::const_iterator I, E = B->getFirstNonPHI();
+ for (I = B->begin(); I != E; ++I) {
+ const MachineOperand &RO1 = I->getOperand(1);
+ const MachineOperand &RO3 = I->getOperand(3);
+ assert(RO1.isReg() && RO3.isReg());
+ // Must have a MUX if the phi uses a subregister.
+ if (RO1.getSubReg() != 0 || RO3.getSubReg() != 0) {
+ Cost++;
+ continue;
+ }
+ MachineInstr *Def1 = MRI->getVRegDef(RO1.getReg());
+ MachineInstr *Def3 = MRI->getVRegDef(RO3.getReg());
+ if (!TII->isPredicable(Def1) || !TII->isPredicable(Def3))
+ Cost++;
+ }
+ return Cost;
+}
+
+
+unsigned HexagonEarlyIfConversion::countPredicateDefs(
+ const MachineBasicBlock *B) const {
+ unsigned PredDefs = 0;
+ for (auto &MI : *B) {
+ for (ConstMIOperands MO(&MI); MO.isValid(); ++MO) {
+ if (!MO->isReg() || !MO->isDef())
+ continue;
+ unsigned R = MO->getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(R))
+ continue;
+ if (MRI->getRegClass(R) == &Hexagon::PredRegsRegClass)
+ PredDefs++;
+ }
+ }
+ return PredDefs;
+}
+
+
+bool HexagonEarlyIfConversion::isProfitable(const FlowPattern &FP) const {
+ if (FP.TrueB && FP.FalseB) {
+
+ // Do not if-convert if the branch is one-sided.
+ if (MBPI) {
+ BranchProbability Prob(9, 10);
+ if (MBPI->getEdgeProbability(FP.SplitB, FP.TrueB) > Prob)
+ return false;
+ if (MBPI->getEdgeProbability(FP.SplitB, FP.FalseB) > Prob)
+ return false;
+ }
+
+ // If both sides are predicable, convert them if they join, and the
+ // join block has no other predecessors.
+ MachineBasicBlock *TSB = *FP.TrueB->succ_begin();
+ MachineBasicBlock *FSB = *FP.FalseB->succ_begin();
+ if (TSB != FSB)
+ return false;
+ if (TSB->pred_size() != 2)
+ return false;
+ }
+
+ // Calculate the total size of the predicated blocks.
+ // Assume instruction counts without branches to be the approximation of
+ // the code size. If the predicated blocks are smaller than a packet size,
+ // approximate the spare room in the packet that could be filled with the
+ // predicated/speculated instructions.
+ unsigned TS = 0, FS = 0, Spare = 0;
+ if (FP.TrueB) {
+ TS = std::distance(FP.TrueB->begin(), FP.TrueB->getFirstTerminator());
+ if (TS < HEXAGON_PACKET_SIZE)
+ Spare += HEXAGON_PACKET_SIZE-TS;
+ }
+ if (FP.FalseB) {
+ FS = std::distance(FP.FalseB->begin(), FP.FalseB->getFirstTerminator());
+ if (FS < HEXAGON_PACKET_SIZE)
+ Spare += HEXAGON_PACKET_SIZE-FS;
+ }
+ unsigned TotalIn = TS+FS;
+ DEBUG(dbgs() << "Total number of instructions to be predicated/speculated: "
+ << TotalIn << ", spare room: " << Spare << "\n");
+ if (TotalIn >= SizeLimit+Spare)
+ return false;
+
+ // Count the number of PHI nodes that will need to be updated (converted
+ // to MUX). Those can be later converted to predicated instructions, so
+ // they aren't always adding extra cost.
+ // KLUDGE: Also, count the number of predicate register definitions in
+ // each block. The scheduler may increase the pressure of these and cause
+ // expensive spills (e.g. bitmnp01).
+ unsigned TotalPh = 0;
+ unsigned PredDefs = countPredicateDefs(FP.SplitB);
+ if (FP.JoinB) {
+ TotalPh = computePhiCost(FP.JoinB);
+ PredDefs += countPredicateDefs(FP.JoinB);
+ } else {
+ if (FP.TrueB && FP.TrueB->succ_size() > 0) {
+ MachineBasicBlock *SB = *FP.TrueB->succ_begin();
+ TotalPh += computePhiCost(SB);
+ PredDefs += countPredicateDefs(SB);
+ }
+ if (FP.FalseB && FP.FalseB->succ_size() > 0) {
+ MachineBasicBlock *SB = *FP.FalseB->succ_begin();
+ TotalPh += computePhiCost(SB);
+ PredDefs += countPredicateDefs(SB);
+ }
+ }
+ DEBUG(dbgs() << "Total number of extra muxes from converted phis: "
+ << TotalPh << "\n");
+ if (TotalIn+TotalPh >= SizeLimit+Spare)
+ return false;
+
+ DEBUG(dbgs() << "Total number of predicate registers: " << PredDefs << "\n");
+ if (PredDefs > 4)
+ return false;
+
+ return true;
+}
+
+
+bool HexagonEarlyIfConversion::visitBlock(MachineBasicBlock *B,
+ MachineLoop *L) {
+ bool Changed = false;
+
+ // Visit all dominated blocks from the same loop first, then process B.
+ MachineDomTreeNode *N = MDT->getNode(B);
+ typedef GraphTraits<MachineDomTreeNode*> GTN;
+ // We will change CFG/DT during this traversal, so take precautions to
+ // avoid problems related to invalidated iterators. In fact, processing
+ // a child C of B cannot cause another child to be removed, but it can
+ // cause a new child to be added (which was a child of C before C itself
+ // was removed). Such a new child, however, would have been processed
+ // prior to processing B, so there is no need to process it again.
+ // Simply keep a list of children of B, and traverse that list.
+ typedef SmallVector<MachineDomTreeNode*,4> DTNodeVectType;
+ DTNodeVectType Cn(GTN::child_begin(N), GTN::child_end(N));
+ for (DTNodeVectType::iterator I = Cn.begin(), E = Cn.end(); I != E; ++I) {
+ MachineBasicBlock *SB = (*I)->getBlock();
+ if (!Deleted.count(SB))
+ Changed |= visitBlock(SB, L);
+ }
+ // When walking down the dominator tree, we want to traverse through
+ // blocks from nested (other) loops, because they can dominate blocks
+ // that are in L. Skip the non-L blocks only after the tree traversal.
+ if (MLI->getLoopFor(B) != L)
+ return Changed;
+
+ FlowPattern FP;
+ if (!matchFlowPattern(B, L, FP))
+ return Changed;
+
+ if (!isValid(FP)) {
+ DEBUG(dbgs() << "Conversion is not valid\n");
+ return Changed;
+ }
+ if (!isProfitable(FP)) {
+ DEBUG(dbgs() << "Conversion is not profitable\n");
+ return Changed;
+ }
+
+ convert(FP);
+ simplifyFlowGraph(FP);
+ return true;
+}
+
+
+bool HexagonEarlyIfConversion::visitLoop(MachineLoop *L) {
+ MachineBasicBlock *HB = L ? L->getHeader() : 0;
+ DEBUG((L ? dbgs() << "Visiting loop H:" << PrintMB(HB)
+ : dbgs() << "Visiting function") << "\n");
+ bool Changed = false;
+ if (L) {
+ for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+ Changed |= visitLoop(*I);
+ }
+
+ MachineBasicBlock *EntryB = GraphTraits<MachineFunction*>::getEntryNode(MFN);
+ Changed |= visitBlock(L ? HB : EntryB, L);
+ return Changed;
+}
+
+
+bool HexagonEarlyIfConversion::isPredicableStore(const MachineInstr *MI)
+ const {
+ // Exclude post-increment stores. Those return a value, so we cannot
+ // predicate them.
+ unsigned Opc = MI->getOpcode();
+ using namespace Hexagon;
+ switch (Opc) {
+ // Store byte:
+ case S2_storerb_io: case S4_storerb_rr:
+ case S2_storerbabs: case S4_storeirb_io: case S2_storerbgp:
+ // Store halfword:
+ case S2_storerh_io: case S4_storerh_rr:
+ case S2_storerhabs: case S4_storeirh_io: case S2_storerhgp:
+ // Store upper halfword:
+ case S2_storerf_io: case S4_storerf_rr:
+ case S2_storerfabs: case S2_storerfgp:
+ // Store word:
+ case S2_storeri_io: case S4_storeri_rr:
+ case S2_storeriabs: case S4_storeiri_io: case S2_storerigp:
+ // Store doubleword:
+ case S2_storerd_io: case S4_storerd_rr:
+ case S2_storerdabs: case S2_storerdgp:
+ return true;
+ }
+ return false;
+}
+
+
+bool HexagonEarlyIfConversion::isSafeToSpeculate(const MachineInstr *MI)
+ const {
+ if (MI->mayLoad() || MI->mayStore())
+ return false;
+ if (MI->isCall() || MI->isBarrier() || MI->isBranch())
+ return false;
+ if (MI->hasUnmodeledSideEffects())
+ return false;
+
+ return true;
+}
+
+
+unsigned HexagonEarlyIfConversion::getCondStoreOpcode(unsigned Opc,
+ bool IfTrue) const {
+ // Exclude post-increment stores.
+ using namespace Hexagon;
+ switch (Opc) {
+ case S2_storerb_io:
+ return IfTrue ? S2_pstorerbt_io : S2_pstorerbf_io;
+ case S4_storerb_rr:
+ return IfTrue ? S4_pstorerbt_rr : S4_pstorerbf_rr;
+ case S2_storerbabs:
+ case S2_storerbgp:
+ return IfTrue ? S4_pstorerbt_abs : S4_pstorerbf_abs;
+ case S4_storeirb_io:
+ return IfTrue ? S4_storeirbt_io : S4_storeirbf_io;
+ case S2_storerh_io:
+ return IfTrue ? S2_pstorerht_io : S2_pstorerhf_io;
+ case S4_storerh_rr:
+ return IfTrue ? S4_pstorerht_rr : S4_pstorerhf_rr;
+ case S2_storerhabs:
+ case S2_storerhgp:
+ return IfTrue ? S4_pstorerht_abs : S4_pstorerhf_abs;
+ case S2_storerf_io:
+ return IfTrue ? S2_pstorerft_io : S2_pstorerff_io;
+ case S4_storerf_rr:
+ return IfTrue ? S4_pstorerft_rr : S4_pstorerff_rr;
+ case S2_storerfabs:
+ case S2_storerfgp:
+ return IfTrue ? S4_pstorerft_abs : S4_pstorerff_abs;
+ case S4_storeirh_io:
+ return IfTrue ? S4_storeirht_io : S4_storeirhf_io;
+ case S2_storeri_io:
+ return IfTrue ? S2_pstorerit_io : S2_pstorerif_io;
+ case S4_storeri_rr:
+ return IfTrue ? S4_pstorerit_rr : S4_pstorerif_rr;
+ case S2_storeriabs:
+ case S2_storerigp:
+ return IfTrue ? S4_pstorerit_abs : S4_pstorerif_abs;
+ case S4_storeiri_io:
+ return IfTrue ? S4_storeirit_io : S4_storeirif_io;
+ case S2_storerd_io:
+ return IfTrue ? S2_pstorerdt_io : S2_pstorerdf_io;
+ case S4_storerd_rr:
+ return IfTrue ? S4_pstorerdt_rr : S4_pstorerdf_rr;
+ case S2_storerdabs:
+ case S2_storerdgp:
+ return IfTrue ? S4_pstorerdt_abs : S4_pstorerdf_abs;
+ }
+ llvm_unreachable("Unexpected opcode");
+ return 0;
+}
+
+
+void HexagonEarlyIfConversion::predicateInstr(MachineBasicBlock *ToB,
+ MachineBasicBlock::iterator At, MachineInstr *MI,
+ unsigned PredR, bool IfTrue) {
+ DebugLoc DL;
+ if (At != ToB->end())
+ DL = At->getDebugLoc();
+ else if (!ToB->empty())
+ DL = ToB->back().getDebugLoc();
+
+ unsigned Opc = MI->getOpcode();
+
+ if (isPredicableStore(MI)) {
+ unsigned COpc = getCondStoreOpcode(Opc, IfTrue);
+ assert(COpc);
+ MachineInstrBuilder MIB = BuildMI(*ToB, At, DL, TII->get(COpc))
+ .addReg(PredR);
+ for (MIOperands MO(MI); MO.isValid(); ++MO)
+ MIB.addOperand(*MO);
+
+ // Set memory references.
+ MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+
+ MI->eraseFromParent();
+ return;
+ }
+
+ if (Opc == Hexagon::J2_jump) {
+ MachineBasicBlock *TB = MI->getOperand(0).getMBB();
+ const MCInstrDesc &D = TII->get(IfTrue ? Hexagon::J2_jumpt
+ : Hexagon::J2_jumpf);
+ BuildMI(*ToB, At, DL, D)
+ .addReg(PredR)
+ .addMBB(TB);
+ MI->eraseFromParent();
+ return;
+ }
+
+ // Print the offending instruction unconditionally as we are about to
+ // abort.
+ dbgs() << *MI;
+ llvm_unreachable("Unexpected instruction");
+}
+
+
+// Predicate/speculate non-branch instructions from FromB into block ToB.
+// Leave the branches alone; they will be handled later. Note that at this
+// point
+// FromB should have at most one branch, and it should be unconditional.
+void HexagonEarlyIfConversion::predicateBlockNB(MachineBasicBlock *ToB,
+ MachineBasicBlock::iterator At, MachineBasicBlock *FromB,
+ unsigned PredR, bool IfTrue) {
+ DEBUG(dbgs() << "Predicating block " << PrintMB(FromB) << "\n");
+ MachineBasicBlock::iterator End = FromB->getFirstTerminator();
+ MachineBasicBlock::iterator I, NextI;
+
+ for (I = FromB->begin(); I != End; I = NextI) {
+ assert(!I->isPHI());
+ NextI = std::next(I);
+ if (isSafeToSpeculate(&*I))
+ ToB->splice(At, FromB, I);
+ else
+ predicateInstr(ToB, At, &*I, PredR, IfTrue);
+ }
+}
+
+
+void HexagonEarlyIfConversion::updatePhiNodes(MachineBasicBlock *WhereB,
+ const FlowPattern &FP) {
+ // Visit all PHI nodes in the WhereB block and generate MUX instructions
+ // in the split block. Update the PHI nodes with the values of the MUX.
+ auto NonPHI = WhereB->getFirstNonPHI();
+ for (auto I = WhereB->begin(); I != NonPHI; ++I) {
+ MachineInstr *PN = &*I;
+ // Registers and subregisters corresponding to TrueB, FalseB and SplitB.
+ unsigned TR = 0, TSR = 0, FR = 0, FSR = 0, SR = 0, SSR = 0;
+ for (int i = PN->getNumOperands()-2; i > 0; i -= 2) {
+ const MachineOperand &RO = PN->getOperand(i), &BO = PN->getOperand(i+1);
+ if (BO.getMBB() == FP.SplitB)
+ SR = RO.getReg(), SSR = RO.getSubReg();
+ else if (BO.getMBB() == FP.TrueB)
+ TR = RO.getReg(), TSR = RO.getSubReg();
+ else if (BO.getMBB() == FP.FalseB)
+ FR = RO.getReg(), FSR = RO.getSubReg();
+ else
+ continue;
+ PN->RemoveOperand(i+1);
+ PN->RemoveOperand(i);
+ }
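+ // If no value was recorded for the TrueB or FalseB edge, the PHI took
+ // the value flowing in from SplitB on that side; use it instead.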
+ if (TR == 0)
+ TR = SR, TSR = SSR;
+ else if (FR == 0)
+ FR = SR, FSR = SSR;
+ assert(TR && FR);
+
+ using namespace Hexagon;
+ unsigned DR = PN->getOperand(0).getReg();
+ const TargetRegisterClass *RC = MRI->getRegClass(DR);
+ const MCInstrDesc &D = RC == &IntRegsRegClass ? TII->get(C2_mux)
+ : TII->get(MUX64_rr);
+
+ MachineBasicBlock::iterator MuxAt = FP.SplitB->getFirstTerminator();
+ DebugLoc DL;
+ if (MuxAt != FP.SplitB->end())
+ DL = MuxAt->getDebugLoc();
+ unsigned MuxR = MRI->createVirtualRegister(RC);
+ BuildMI(*FP.SplitB, MuxAt, DL, D, MuxR)
+ .addReg(FP.PredR)
+ .addReg(TR, 0, TSR)
+ .addReg(FR, 0, FSR);
+
+ PN->addOperand(MachineOperand::CreateReg(MuxR, false));
+ PN->addOperand(MachineOperand::CreateMBB(FP.SplitB));
+ }
+}
+
+
+void HexagonEarlyIfConversion::convert(const FlowPattern &FP) {
+ MachineBasicBlock *TSB = 0, *FSB = 0;
+ MachineBasicBlock::iterator OldTI = FP.SplitB->getFirstTerminator();
+ assert(OldTI != FP.SplitB->end());
+ DebugLoc DL = OldTI->getDebugLoc();
+
+ if (FP.TrueB) {
+ TSB = *FP.TrueB->succ_begin();
+ predicateBlockNB(FP.SplitB, OldTI, FP.TrueB, FP.PredR, true);
+ }
+ if (FP.FalseB) {
+ FSB = *FP.FalseB->succ_begin();
+ MachineBasicBlock::iterator At = FP.SplitB->getFirstTerminator();
+ predicateBlockNB(FP.SplitB, At, FP.FalseB, FP.PredR, false);
+ }
+
+ // Regenerate new terminators in the split block and update the successors.
+ // First, remember any information that may be needed later and remove the
+ // existing terminators/successors from the split block.
+ MachineBasicBlock *SSB = 0;
+ FP.SplitB->erase(OldTI, FP.SplitB->end());
+ while (FP.SplitB->succ_size() > 0) {
+ MachineBasicBlock *T = *FP.SplitB->succ_begin();
+ // It's possible that the split block had a successor that is not a pre-
+ // dicated block. This could only happen if there was only one block to
+ // be predicated. Example:
+ // split_b:
+ // if (p) jump true_b
+ // jump unrelated2_b
+ // unrelated1_b:
+ // ...
+ // unrelated2_b: ; can have other predecessors, so it's not "false_b"
+ // jump other_b
+ // true_b: ; only reachable from split_b, can be predicated
+ // ...
+ //
+ // Find this successor (SSB) if it exists.
+ if (T != FP.TrueB && T != FP.FalseB) {
+ assert(!SSB);
+ SSB = T;
+ }
+ FP.SplitB->removeSuccessor(FP.SplitB->succ_begin());
+ }
+
+ // Insert new branches and update the successors of the split block. This
+ // may create unconditional branches to the layout successor, etc., but
+ // that will be cleaned up later. For now, make sure that correct code is
+ // generated.
+ if (FP.JoinB) {
+ assert(!SSB || SSB == FP.JoinB);
+ BuildMI(*FP.SplitB, FP.SplitB->end(), DL, TII->get(Hexagon::J2_jump))
+ .addMBB(FP.JoinB);
+ FP.SplitB->addSuccessor(FP.JoinB);
+ } else {
+ bool HasBranch = false;
+ if (TSB) {
+ BuildMI(*FP.SplitB, FP.SplitB->end(), DL, TII->get(Hexagon::J2_jumpt))
+ .addReg(FP.PredR)
+ .addMBB(TSB);
+ FP.SplitB->addSuccessor(TSB);
+ HasBranch = true;
+ }
+ if (FSB) {
+ const MCInstrDesc &D = HasBranch ? TII->get(Hexagon::J2_jump)
+ : TII->get(Hexagon::J2_jumpf);
+ MachineInstrBuilder MIB = BuildMI(*FP.SplitB, FP.SplitB->end(), DL, D);
+ if (!HasBranch)
+ MIB.addReg(FP.PredR);
+ MIB.addMBB(FSB);
+ FP.SplitB->addSuccessor(FSB);
+ }
+ if (SSB) {
+ // This cannot happen if both TSB and FSB are set. [TF]SB are the
+ // successor blocks of the TrueB and FalseB (or null if the TrueB
+ // or FalseB block is null). SSB is the potential successor block
+ // of the SplitB that is neither TrueB nor FalseB.
+ BuildMI(*FP.SplitB, FP.SplitB->end(), DL, TII->get(Hexagon::J2_jump))
+ .addMBB(SSB);
+ FP.SplitB->addSuccessor(SSB);
+ }
+ }
+
+ // What is left to do is to update the PHI nodes that could have entries
+ // referring to predicated blocks.
+ if (FP.JoinB) {
+ updatePhiNodes(FP.JoinB, FP);
+ } else {
+ if (TSB)
+ updatePhiNodes(TSB, FP);
+ if (FSB)
+ updatePhiNodes(FSB, FP);
+ // Nothing to update in SSB, since SSB's predecessors haven't changed.
+ }
+}
+
+
+void HexagonEarlyIfConversion::removeBlock(MachineBasicBlock *B) {
+ DEBUG(dbgs() << "Removing block " << PrintMB(B) << "\n");
+
+ // Transfer the immediate dominator information from B to its descendants.
+ MachineDomTreeNode *N = MDT->getNode(B);
+ MachineDomTreeNode *IDN = N->getIDom();
+ if (IDN) {
+ MachineBasicBlock *IDB = IDN->getBlock();
+ typedef GraphTraits<MachineDomTreeNode*> GTN;
+ typedef SmallVector<MachineDomTreeNode*,4> DTNodeVectType;
+ DTNodeVectType Cn(GTN::child_begin(N), GTN::child_end(N));
+ for (DTNodeVectType::iterator I = Cn.begin(), E = Cn.end(); I != E; ++I) {
+ MachineBasicBlock *SB = (*I)->getBlock();
+ MDT->changeImmediateDominator(SB, IDB);
+ }
+ }
+
+ while (B->succ_size() > 0)
+ B->removeSuccessor(B->succ_begin());
+
+ for (auto I = B->pred_begin(), E = B->pred_end(); I != E; ++I)
+ (*I)->removeSuccessor(B, true);
+
+ Deleted.insert(B);
+ MDT->eraseNode(B);
+ MFN->erase(B->getIterator());
+}
+
+
+void HexagonEarlyIfConversion::eliminatePhis(MachineBasicBlock *B) {
+ DEBUG(dbgs() << "Removing phi nodes from block " << PrintMB(B) << "\n");
+ MachineBasicBlock::iterator I, NextI, NonPHI = B->getFirstNonPHI();
+ for (I = B->begin(); I != NonPHI; I = NextI) {
+ NextI = std::next(I);
+ MachineInstr *PN = &*I;
+ assert(PN->getNumOperands() == 3 && "Invalid phi node");
+ MachineOperand &UO = PN->getOperand(1);
+ unsigned UseR = UO.getReg(), UseSR = UO.getSubReg();
+ unsigned DefR = PN->getOperand(0).getReg();
+ unsigned NewR = UseR;
+ if (UseSR) {
+ // MRI.replaceVregUsesWith does not allow updating the subregister,
+ // so instead of doing the use-iteration here, create a copy into a
+ // "non-subregistered" register.
+ DebugLoc DL = PN->getDebugLoc();
+ const TargetRegisterClass *RC = MRI->getRegClass(DefR);
+ NewR = MRI->createVirtualRegister(RC);
+ NonPHI = BuildMI(*B, NonPHI, DL, TII->get(TargetOpcode::COPY), NewR)
+ .addReg(UseR, 0, UseSR);
+ }
+ MRI->replaceRegWith(DefR, NewR);
+ B->erase(I);
+ }
+}
+
+
+void HexagonEarlyIfConversion::replacePhiEdges(MachineBasicBlock *OldB,
+ MachineBasicBlock *NewB) {
+ for (auto I = OldB->succ_begin(), E = OldB->succ_end(); I != E; ++I) {
+ MachineBasicBlock *SB = *I;
+ MachineBasicBlock::iterator P, N = SB->getFirstNonPHI();
+ for (P = SB->begin(); P != N; ++P) {
+ MachineInstr *PN = &*P;
+ for (MIOperands MO(PN); MO.isValid(); ++MO)
+ if (MO->isMBB() && MO->getMBB() == OldB)
+ MO->setMBB(NewB);
+ }
+ }
+}
+
+
+void HexagonEarlyIfConversion::mergeBlocks(MachineBasicBlock *PredB,
+ MachineBasicBlock *SuccB) {
+ DEBUG(dbgs() << "Merging blocks " << PrintMB(PredB) << " and "
+ << PrintMB(SuccB) << "\n");
+ bool TermOk = hasUncondBranch(SuccB);
+ eliminatePhis(SuccB);
+ TII->RemoveBranch(*PredB);
+ PredB->removeSuccessor(SuccB);
+ PredB->splice(PredB->end(), SuccB, SuccB->begin(), SuccB->end());
+ MachineBasicBlock::succ_iterator I, E = SuccB->succ_end();
+ for (I = SuccB->succ_begin(); I != E; ++I)
+ PredB->addSuccessor(*I);
+ PredB->normalizeSuccProbs();
+ replacePhiEdges(SuccB, PredB);
+ removeBlock(SuccB);
+ if (!TermOk)
+ PredB->updateTerminator();
+}
+
+
+void HexagonEarlyIfConversion::simplifyFlowGraph(const FlowPattern &FP) {
+ if (FP.TrueB)
+ removeBlock(FP.TrueB);
+ if (FP.FalseB)
+ removeBlock(FP.FalseB);
+
+ FP.SplitB->updateTerminator();
+ if (FP.SplitB->succ_size() != 1)
+ return;
+
+ MachineBasicBlock *SB = *FP.SplitB->succ_begin();
+ if (SB->pred_size() != 1)
+ return;
+
+ // By now, the split block has only one successor (SB), and SB has only
+ // one predecessor. We can try to merge them. We will need to update ter-
+ // minators in FP.Split+SB, and that requires working AnalyzeBranch, which
+ // fails on Hexagon for blocks that have EH_LABELs. However, if SB ends
+ // with an unconditional branch, we won't need to touch the terminators.
+ if (!hasEHLabel(SB) || hasUncondBranch(SB))
+ mergeBlocks(FP.SplitB, SB);
+}
+
+
+bool HexagonEarlyIfConversion::runOnMachineFunction(MachineFunction &MF) {
+ auto &ST = MF.getSubtarget();
+ TII = ST.getInstrInfo();
+ TRI = ST.getRegisterInfo();
+ MFN = &MF;
+ MRI = &MF.getRegInfo();
+ MDT = &getAnalysis<MachineDominatorTree>();
+ MLI = &getAnalysis<MachineLoopInfo>();
+ MBPI = EnableHexagonBP ? &getAnalysis<MachineBranchProbabilityInfo>() :
+ nullptr;
+
+ Deleted.clear();
+ bool Changed = false;
+
+ for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end(); I != E; ++I)
+ Changed |= visitLoop(*I);
+ Changed |= visitLoop(0);
+
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+FunctionPass *llvm::createHexagonEarlyIfConversion() {
+ return new HexagonEarlyIfConversion();
+}
+
diff --git a/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp b/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp
index e4c8d8f7b28c..6e2dbc06b124 100644
--- a/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp
+++ b/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp
@@ -74,7 +74,7 @@ bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) {
// Loop over all of the basic blocks.
for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
MBBb != MBBe; ++MBBb) {
- MachineBasicBlock* MBB = MBBb;
+ MachineBasicBlock *MBB = &*MBBb;
// Traverse the basic block.
for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end();
++MII) {
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 21a8996b1159..7a52a1c9eaec 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -147,6 +147,48 @@ static cl::opt<unsigned> ShrinkLimit("shrink-frame-limit", cl::init(UINT_MAX),
cl::Hidden, cl::ZeroOrMore, cl::desc("Max count of stack frame "
"shrink-wraps"));
+static cl::opt<bool> UseAllocframe("use-allocframe", cl::init(true),
+ cl::Hidden, cl::desc("Use allocframe more conservatively"));
+
+
+namespace llvm {
+ void initializeHexagonCallFrameInformationPass(PassRegistry&);
+ FunctionPass *createHexagonCallFrameInformation();
+}
+
+namespace {
+ class HexagonCallFrameInformation : public MachineFunctionPass {
+ public:
+ static char ID;
+ HexagonCallFrameInformation() : MachineFunctionPass(ID) {
+ PassRegistry &PR = *PassRegistry::getPassRegistry();
+ initializeHexagonCallFrameInformationPass(PR);
+ }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ };
+
+ char HexagonCallFrameInformation::ID = 0;
+}
+
+bool HexagonCallFrameInformation::runOnMachineFunction(MachineFunction &MF) {
+ auto &HFI = *MF.getSubtarget<HexagonSubtarget>().getFrameLowering();
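+ // CFI is only needed when debug info is emitted or when the function
+ // needs an unwind table entry.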
+ bool NeedCFI = MF.getMMI().hasDebugInfo() ||
+ MF.getFunction()->needsUnwindTableEntry();
+
+ if (!NeedCFI)
+ return false;
+ HFI.insertCFIInstructions(MF);
+ return true;
+}
+
+INITIALIZE_PASS(HexagonCallFrameInformation, "hexagon-cfi",
+ "Hexagon call frame information", false, false)
+
+FunctionPass *llvm::createHexagonCallFrameInformation() {
+ return new HexagonCallFrameInformation();
+}
+
+
namespace {
/// Map a register pair Reg to the subregister that has the greater "number",
/// i.e. D3 (aka R7:6) will be mapped to R7, etc.
@@ -370,11 +412,11 @@ void HexagonFrameLowering::emitPrologue(MachineFunction &MF,
insertEpilogueInBlock(*EpilogB);
} else {
for (auto &B : MF)
- if (!B.empty() && B.back().isReturn())
+ if (B.isReturnBlock())
insertCSRRestoresInBlock(B, CSI, HRI);
for (auto &B : MF)
- if (!B.empty() && B.back().isReturn())
+ if (B.isReturnBlock())
insertEpilogueInBlock(B);
}
}
@@ -383,10 +425,7 @@ void HexagonFrameLowering::emitPrologue(MachineFunction &MF,
void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo *MFI = MF.getFrameInfo();
- MachineModuleInfo &MMI = MF.getMMI();
- MachineBasicBlock::iterator MBBI = MBB.begin();
- auto &HTM = static_cast<const HexagonTargetMachine&>(MF.getTarget());
- auto &HST = static_cast<const HexagonSubtarget&>(MF.getSubtarget());
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
auto &HII = *HST.getInstrInfo();
auto &HRI = *HST.getRegisterInfo();
DebugLoc dl;
@@ -405,10 +444,6 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB) const {
bool AlignStack = (MaxAlign > getStackAlignment());
- // Check if frame moves are needed for EH.
- bool needsFrameMoves = MMI.hasDebugInfo() ||
- MF.getFunction()->needsUnwindTableEntry();
-
// Get the number of bytes to allocate from the FrameInfo.
unsigned NumBytes = MFI->getStackSize();
unsigned SP = HRI.getStackRegister();
@@ -424,14 +459,7 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB) const {
MI->eraseFromParent();
}
- //
- // Only insert ALLOCFRAME if we need to or at -O0 for the debugger. Think
- // that this shouldn't be required, but doing so now because gcc does and
- // gdb can't break at the start of the function without it. Will remove if
- // this turns out to be a gdb bug.
- //
- bool NoOpt = (HTM.getOptLevel() == CodeGenOpt::None);
- if (!NoOpt && !FuncInfo->hasClobberLR() && !hasFP(MF))
+ if (!hasFP(MF))
return;
// Check for overflow.
@@ -469,92 +497,11 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB) const {
.addReg(SP)
.addImm(-int64_t(MaxAlign));
}
-
- if (needsFrameMoves) {
- std::vector<MCCFIInstruction> Instructions = MMI.getFrameInstructions();
- MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
-
- // Advance CFA. DW_CFA_def_cfa
- unsigned DwFPReg = HRI.getDwarfRegNum(HRI.getFrameRegister(), true);
- unsigned DwRAReg = HRI.getDwarfRegNum(HRI.getRARegister(), true);
-
- // CFA = FP + 8
- unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createDefCfa(
- FrameLabel, DwFPReg, -8));
- BuildMI(MBB, MBBI, dl, HII.get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
-
- // R31 (return addr) = CFA - #4
- CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
- FrameLabel, DwRAReg, -4));
- BuildMI(MBB, MBBI, dl, HII.get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
-
- // R30 (frame ptr) = CFA - #8)
- CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
- FrameLabel, DwFPReg, -8));
- BuildMI(MBB, MBBI, dl, HII.get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
-
- unsigned int regsToMove[] = {
- Hexagon::R1, Hexagon::R0, Hexagon::R3, Hexagon::R2,
- Hexagon::R17, Hexagon::R16, Hexagon::R19, Hexagon::R18,
- Hexagon::R21, Hexagon::R20, Hexagon::R23, Hexagon::R22,
- Hexagon::R25, Hexagon::R24, Hexagon::R27, Hexagon::R26,
- Hexagon::D0, Hexagon::D1, Hexagon::D8, Hexagon::D9, Hexagon::D10,
- Hexagon::D11, Hexagon::D12, Hexagon::D13, Hexagon::NoRegister
- };
-
- const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
-
- for (unsigned i = 0; regsToMove[i] != Hexagon::NoRegister; ++i) {
- for (unsigned I = 0, E = CSI.size(); I < E; ++I) {
- if (CSI[I].getReg() == regsToMove[i]) {
- // Subtract 8 to make room for R30 and R31, which are added above.
- int64_t Offset = getFrameIndexOffset(MF, CSI[I].getFrameIdx()) - 8;
-
- if (regsToMove[i] < Hexagon::D0 || regsToMove[i] > Hexagon::D15) {
- unsigned DwarfReg = HRI.getDwarfRegNum(regsToMove[i], true);
- unsigned CFIIndex = MMI.addFrameInst(
- MCCFIInstruction::createOffset(FrameLabel,
- DwarfReg, Offset));
- BuildMI(MBB, MBBI, dl, HII.get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
- } else {
- // Split the double regs into subregs, and generate appropriate
- // cfi_offsets.
- // The only reason, we are split double regs is, llvm-mc does not
- // understand paired registers for cfi_offset.
- // Eg .cfi_offset r1:0, -64
- unsigned HiReg = getMax32BitSubRegister(regsToMove[i], HRI);
- unsigned LoReg = getMax32BitSubRegister(regsToMove[i], HRI, false);
- unsigned HiDwarfReg = HRI.getDwarfRegNum(HiReg, true);
- unsigned LoDwarfReg = HRI.getDwarfRegNum(LoReg, true);
- unsigned HiCFIIndex = MMI.addFrameInst(
- MCCFIInstruction::createOffset(FrameLabel,
- HiDwarfReg, Offset+4));
- BuildMI(MBB, MBBI, dl, HII.get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(HiCFIIndex);
- unsigned LoCFIIndex = MMI.addFrameInst(
- MCCFIInstruction::createOffset(FrameLabel,
- LoDwarfReg, Offset));
- BuildMI(MBB, MBBI, dl, HII.get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(LoCFIIndex);
- }
- break;
- }
- } // for CSI.size()
- } // for regsToMove
- } // needsFrameMoves
}
void HexagonFrameLowering::insertEpilogueInBlock(MachineBasicBlock &MBB) const {
MachineFunction &MF = *MBB.getParent();
- //
- // Only insert deallocframe if we need to. Also at -O0. See comment
- // in insertPrologueInBlock above.
- //
- if (!hasFP(MF) && MF.getTarget().getOptLevel() != CodeGenOpt::None)
+ if (!hasFP(MF))
return;
auto &HST = static_cast<const HexagonSubtarget&>(MF.getSubtarget());
@@ -630,12 +577,172 @@ void HexagonFrameLowering::insertEpilogueInBlock(MachineBasicBlock &MBB) const {
}
+namespace {
+ bool IsAllocFrame(MachineBasicBlock::const_iterator It) {
+ if (!It->isBundle())
+ return It->getOpcode() == Hexagon::S2_allocframe;
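+ // This is a bundle: look through the bundled instructions for an
+ // allocframe.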
+ auto End = It->getParent()->instr_end();
+ MachineBasicBlock::const_instr_iterator I = It.getInstrIterator();
+ while (++I != End && I->isBundled())
+ if (I->getOpcode() == Hexagon::S2_allocframe)
+ return true;
+ return false;
+ }
+
+ MachineBasicBlock::iterator FindAllocFrame(MachineBasicBlock &B) {
+ for (auto &I : B)
+ if (IsAllocFrame(I))
+ return I;
+ return B.end();
+ }
+}
+
+
+void HexagonFrameLowering::insertCFIInstructions(MachineFunction &MF) const {
+ for (auto &B : MF) {
+ auto AF = FindAllocFrame(B);
+ if (AF == B.end())
+ continue;
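+ // Emit the CFI instructions immediately after the allocframe.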
+ insertCFIInstructionsAt(B, ++AF);
+ }
+}
+
+
+void HexagonFrameLowering::insertCFIInstructionsAt(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator At) const {
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineModuleInfo &MMI = MF.getMMI();
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ auto &HII = *HST.getInstrInfo();
+ auto &HRI = *HST.getRegisterInfo();
+
+ // If CFI instructions have debug information attached, something goes
+ // wrong with the final assembly generation: the prolog_end is placed
+ // in the wrong location.
+ DebugLoc DL;
+ const MCInstrDesc &CFID = HII.get(TargetOpcode::CFI_INSTRUCTION);
+
+ MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
+
+ if (hasFP(MF)) {
+ unsigned DwFPReg = HRI.getDwarfRegNum(HRI.getFrameRegister(), true);
+ unsigned DwRAReg = HRI.getDwarfRegNum(HRI.getRARegister(), true);
+
+ // Define CFA via an offset from the value of FP.
+ //
+ // -8 -4 0 (SP)
+ // --+----+----+---------------------
+ // | FP | LR | increasing addresses -->
+ // --+----+----+---------------------
+ // | +-- Old SP (before allocframe)
+ // +-- New FP (after allocframe)
+ //
+ // MCCFIInstruction::createDefCfa subtracts the offset from the register.
+ // MCCFIInstruction::createOffset takes the offset without sign change.
+ auto DefCfa = MCCFIInstruction::createDefCfa(FrameLabel, DwFPReg, -8);
+ BuildMI(MBB, At, DL, CFID)
+ .addCFIIndex(MMI.addFrameInst(DefCfa));
+ // R31 (return addr) = CFA - 4
+ auto OffR31 = MCCFIInstruction::createOffset(FrameLabel, DwRAReg, -4);
+ BuildMI(MBB, At, DL, CFID)
+ .addCFIIndex(MMI.addFrameInst(OffR31));
+ // R30 (frame ptr) = CFA - 8
+ auto OffR30 = MCCFIInstruction::createOffset(FrameLabel, DwFPReg, -8);
+ BuildMI(MBB, At, DL, CFID)
+ .addCFIIndex(MMI.addFrameInst(OffR30));
+ }
+
+ static unsigned int RegsToMove[] = {
+ Hexagon::R1, Hexagon::R0, Hexagon::R3, Hexagon::R2,
+ Hexagon::R17, Hexagon::R16, Hexagon::R19, Hexagon::R18,
+ Hexagon::R21, Hexagon::R20, Hexagon::R23, Hexagon::R22,
+ Hexagon::R25, Hexagon::R24, Hexagon::R27, Hexagon::R26,
+ Hexagon::D0, Hexagon::D1, Hexagon::D8, Hexagon::D9,
+ Hexagon::D10, Hexagon::D11, Hexagon::D12, Hexagon::D13,
+ Hexagon::NoRegister
+ };
+
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+
+ for (unsigned i = 0; RegsToMove[i] != Hexagon::NoRegister; ++i) {
+ unsigned Reg = RegsToMove[i];
+ auto IfR = [Reg] (const CalleeSavedInfo &C) -> bool {
+ return C.getReg() == Reg;
+ };
+ auto F = std::find_if(CSI.begin(), CSI.end(), IfR);
+ if (F == CSI.end())
+ continue;
+
+ // Subtract 8 to make room for R30 and R31, which are added above.
+ unsigned FrameReg;
+ int64_t Offset = getFrameIndexReference(MF, F->getFrameIdx(), FrameReg) - 8;
+
+ if (Reg < Hexagon::D0 || Reg > Hexagon::D15) {
+ unsigned DwarfReg = HRI.getDwarfRegNum(Reg, true);
+ auto OffReg = MCCFIInstruction::createOffset(FrameLabel, DwarfReg,
+ Offset);
+ BuildMI(MBB, At, DL, CFID)
+ .addCFIIndex(MMI.addFrameInst(OffReg));
+ } else {
+ // Split the double regs into subregs, and generate appropriate
+ // cfi_offsets.
+ // The only reason we split double regs is that llvm-mc does not
+ // understand paired registers for cfi_offset.
+ // E.g. .cfi_offset r1:0, -64
+
+ unsigned HiReg = HRI.getSubReg(Reg, Hexagon::subreg_hireg);
+ unsigned LoReg = HRI.getSubReg(Reg, Hexagon::subreg_loreg);
+ unsigned HiDwarfReg = HRI.getDwarfRegNum(HiReg, true);
+ unsigned LoDwarfReg = HRI.getDwarfRegNum(LoReg, true);
+ auto OffHi = MCCFIInstruction::createOffset(FrameLabel, HiDwarfReg,
+ Offset+4);
+ BuildMI(MBB, At, DL, CFID)
+ .addCFIIndex(MMI.addFrameInst(OffHi));
+ auto OffLo = MCCFIInstruction::createOffset(FrameLabel, LoDwarfReg,
+ Offset);
+ BuildMI(MBB, At, DL, CFID)
+ .addCFIIndex(MMI.addFrameInst(OffLo));
+ }
+ }
+}
+
+
bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const {
- const MachineFrameInfo *MFI = MF.getFrameInfo();
- const HexagonMachineFunctionInfo *FuncInfo =
- MF.getInfo<HexagonMachineFunctionInfo>();
- return MFI->hasCalls() || MFI->getStackSize() > 0 ||
- FuncInfo->hasClobberLR();
+ auto &MFI = *MF.getFrameInfo();
+ auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+
+ bool HasFixed = MFI.getNumFixedObjects();
+ bool HasPrealloc = const_cast<MachineFrameInfo&>(MFI)
+ .getLocalFrameObjectCount();
+ bool HasExtraAlign = HRI.needsStackRealignment(MF);
+ bool HasAlloca = MFI.hasVarSizedObjects();
+
+ // Insert ALLOCFRAME if we need to, or at -O0 for the debugger. We think
+ // this shouldn't be required, but do it for now because gcc does and
+ // gdb can't break at the start of the function without it. This will be
+ // removed if it turns out to be a gdb bug.
+ //
+ if (MF.getTarget().getOptLevel() == CodeGenOpt::None)
+ return true;
+
+ // By default we want to use SP (since it's always there). FP requires
+ // some setup (i.e. ALLOCFRAME).
+ // Fixed and preallocated objects need FP if the distance from them to
+ // the SP is unknown (as it is with alloca or aligna).
+ if ((HasFixed || HasPrealloc) && (HasAlloca || HasExtraAlign))
+ return true;
+
+ if (MFI.getStackSize() > 0) {
+ if (UseAllocframe)
+ return true;
+ }
+
+ if (MFI.hasCalls() ||
+ MF.getInfo<HexagonMachineFunctionInfo>()->hasClobberLR())
+ return true;
+
+ return false;
}
@@ -718,9 +825,89 @@ static void addCalleeSaveRegistersAsImpOperand(MachineInstr *Inst,
}
-int HexagonFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
- int FI) const {
- return MF.getFrameInfo()->getObjectOffset(FI);
+int HexagonFrameLowering::getFrameIndexReference(const MachineFunction &MF,
+ int FI, unsigned &FrameReg) const {
+ auto &MFI = *MF.getFrameInfo();
+ auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+
+ // Large parts of this code are shared with HRI::eliminateFrameIndex.
+ int Offset = MFI.getObjectOffset(FI);
+ bool HasAlloca = MFI.hasVarSizedObjects();
+ bool HasExtraAlign = HRI.needsStackRealignment(MF);
+ bool NoOpt = MF.getTarget().getOptLevel() == CodeGenOpt::None;
+
+ unsigned SP = HRI.getStackRegister(), FP = HRI.getFrameRegister();
+ unsigned AP = 0;
+ if (const MachineInstr *AI = getAlignaInstr(MF))
+ AP = AI->getOperand(0).getReg();
+ unsigned FrameSize = MFI.getStackSize();
+
+ bool UseFP = false, UseAP = false; // Default: use SP (except at -O0).
+ // Use FP at -O0, except when there are objects with extra alignment.
+ // That additional alignment requirement may cause a pad to be inserted,
+ // which will make it impossible to use FP to access objects located
+ // past the pad.
+ if (NoOpt && !HasExtraAlign)
+ UseFP = true;
+ if (MFI.isFixedObjectIndex(FI) || MFI.isObjectPreAllocated(FI)) {
+ // Fixed and preallocated objects will be located before any padding
+ // so FP must be used to access them.
+ UseFP |= (HasAlloca || HasExtraAlign);
+ } else {
+ if (HasAlloca) {
+ if (HasExtraAlign)
+ UseAP = true;
+ else
+ UseFP = true;
+ }
+ }
+
+ // If FP was picked, then there had better be FP.
+ bool HasFP = hasFP(MF);
+ assert((HasFP || !UseFP) && "This function must have frame pointer");
+
+ // Having FP implies allocframe. Allocframe will store extra 8 bytes:
+ // FP/LR. If the base register is used to access an object across these
+ // 8 bytes, then the offset will need to be adjusted by 8.
+ //
+ // After allocframe:
+ // HexagonISelLowering adds 8 to ---+
+ // the offsets of all stack-based |
+ // arguments (*) |
+ // |
+ // getObjectOffset < 0 0 8 getObjectOffset >= 8
+ // ------------------------+-----+------------------------> increasing
+ // <local objects> |FP/LR| <input arguments> addresses
+ // -----------------+------+-----+------------------------>
+ // | |
+ // SP/AP point --+ +-- FP points here (**)
+ // somewhere on
+ // this side of FP/LR
+ //
+ // (*) See LowerFormalArguments. The FP/LR is assumed to be present.
+ // (**) *FP == old-FP. FP+0..7 are the bytes of FP/LR.
+
+ // The lowering assumes that FP/LR is present, and so the offsets of
+ // the formal arguments start at 8. If FP/LR is not there we need to
+ // reduce the offset by 8.
+ if (Offset > 0 && !HasFP)
+ Offset -= 8;
+
+ if (UseFP)
+ FrameReg = FP;
+ else if (UseAP)
+ FrameReg = AP;
+ else
+ FrameReg = SP;
+
+ // Calculate the actual offset in the instruction. If there is no FP
+ // (in other words, no allocframe), then SP will not be adjusted (i.e.
+ // there will be no SP -= FrameSize), so the frame size should not be
+ // added to the calculated offset.
+ int RealOffset = Offset;
+ if (!UseFP && !UseAP && HasFP)
+ RealOffset = FrameSize+Offset;
+ return RealOffset;
}
@@ -731,7 +918,7 @@ bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI = MBB.begin();
MachineFunction &MF = *MBB.getParent();
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ auto &HII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
if (useSpillFunction(MF, CSI)) {
unsigned MaxReg = getMaxCalleeSavedReg(CSI, HRI);
@@ -739,7 +926,7 @@ bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB,
// Call spill function.
DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
MachineInstr *SaveRegsCall =
- BuildMI(MBB, MI, DL, TII.get(Hexagon::SAVE_REGISTERS_CALL_V4))
+ BuildMI(MBB, MI, DL, HII.get(Hexagon::SAVE_REGISTERS_CALL_V4))
.addExternalSymbol(SpillFun);
// Add callee-saved registers as use.
addCalleeSaveRegistersAsImpOperand(SaveRegsCall, MaxReg, false);
@@ -757,7 +944,7 @@ bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB,
bool IsKill = !HRI.isEHReturnCalleeSaveReg(Reg);
int FI = CSI[i].getFrameIdx();
const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg);
- TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, FI, RC, &HRI);
+ HII.storeRegToStackSlot(MBB, MI, Reg, IsKill, FI, RC, &HRI);
if (IsKill)
MBB.addLiveIn(Reg);
}
@@ -772,7 +959,7 @@ bool HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI = MBB.getFirstTerminator();
MachineFunction &MF = *MBB.getParent();
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ auto &HII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
if (useRestoreFunction(MF, CSI)) {
bool HasTC = hasTailCall(MBB) || !hasReturn(MBB);
@@ -787,14 +974,14 @@ bool HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB,
if (HasTC) {
unsigned ROpc = Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4;
- DeallocCall = BuildMI(MBB, MI, DL, TII.get(ROpc))
+ DeallocCall = BuildMI(MBB, MI, DL, HII.get(ROpc))
.addExternalSymbol(RestoreFn);
} else {
// The block has a return.
MachineBasicBlock::iterator It = MBB.getFirstTerminator();
assert(It->isReturn() && std::next(It) == MBB.end());
unsigned ROpc = Hexagon::RESTORE_DEALLOC_RET_JMP_V4;
- DeallocCall = BuildMI(MBB, It, DL, TII.get(ROpc))
+ DeallocCall = BuildMI(MBB, It, DL, HII.get(ROpc))
.addExternalSymbol(RestoreFn);
// Transfer the function live-out registers.
DeallocCall->copyImplicitOps(MF, It);
@@ -807,7 +994,7 @@ bool HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB,
unsigned Reg = CSI[i].getReg();
const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg);
int FI = CSI[i].getFrameIdx();
- TII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, &HRI);
+ HII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, &HRI);
}
return true;
}
@@ -832,9 +1019,9 @@ void HexagonFrameLowering::processFunctionBeforeFrameFinalized(
// via AP, which may not be available at the particular place in the program.
MachineFrameInfo *MFI = MF.getFrameInfo();
bool HasAlloca = MFI->hasVarSizedObjects();
- bool HasAligna = (MFI->getMaxAlignment() > getStackAlignment());
+ bool NeedsAlign = (MFI->getMaxAlignment() > getStackAlignment());
- if (!HasAlloca || !HasAligna)
+ if (!HasAlloca || !NeedsAlign)
return;
unsigned LFS = MFI->getLocalFrameSize();
@@ -864,13 +1051,13 @@ static bool needToReserveScavengingSpillSlots(MachineFunction &MF,
// Check for an unused caller-saved register.
for ( ; *CallerSavedRegs; ++CallerSavedRegs) {
MCPhysReg FreeReg = *CallerSavedRegs;
- if (MRI.isPhysRegUsed(FreeReg))
+ if (!MRI.reg_nodbg_empty(FreeReg))
continue;
// Check aliased register usage.
bool IsCurrentRegUsed = false;
for (MCRegAliasIterator AI(FreeReg, &HRI, false); AI.isValid(); ++AI)
- if (MRI.isPhysRegUsed(*AI)) {
+ if (!MRI.reg_nodbg_empty(*AI)) {
IsCurrentRegUsed = true;
break;
}
@@ -896,7 +1083,7 @@ bool HexagonFrameLowering::replacePredRegPseudoSpillCode(MachineFunction &MF)
// Loop over all of the basic blocks.
for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end();
MBBb != MBBe; ++MBBb) {
- MachineBasicBlock* MBB = MBBb;
+ MachineBasicBlock *MBB = &*MBBb;
// Traverse the basic block.
MachineBasicBlock::iterator NextII;
for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end();
@@ -1210,7 +1397,8 @@ bool HexagonFrameLowering::needsAligna(const MachineFunction &MF) const {
}
-MachineInstr *HexagonFrameLowering::getAlignaInstr(MachineFunction &MF) const {
+const MachineInstr *HexagonFrameLowering::getAlignaInstr(
+ const MachineFunction &MF) const {
for (auto &B : MF)
for (auto &I : B)
if (I.getOpcode() == Hexagon::ALIGNA)
@@ -1219,6 +1407,7 @@ MachineInstr *HexagonFrameLowering::getAlignaInstr(MachineFunction &MF) const {
}
+// FIXME: Use Function::optForSize().
inline static bool isOptSize(const MachineFunction &MF) {
AttributeSet AF = MF.getFunction()->getAttributes();
return AF.hasAttribute(AttributeSet::FunctionIndex,
@@ -1226,8 +1415,7 @@ inline static bool isOptSize(const MachineFunction &MF) {
}
inline static bool isMinSize(const MachineFunction &MF) {
- AttributeSet AF = MF.getFunction()->getAttributes();
- return AF.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+ return MF.getFunction()->optForMinSize();
}
@@ -1289,4 +1477,3 @@ bool HexagonFrameLowering::useRestoreFunction(MachineFunction &MF,
: SpillFuncThreshold;
return Threshold < NumCSI;
}
-
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h
index d39ee2c77195..683b303d43ea 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.h
+++ b/lib/Target/Hexagon/HexagonFrameLowering.h
@@ -51,7 +51,8 @@ public:
bool targetHandlesStackFrameRounding() const override {
return true;
}
- int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const override;
bool hasFP(const MachineFunction &MF) const override;
const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries)
@@ -73,7 +74,9 @@ public:
const override;
bool needsAligna(const MachineFunction &MF) const;
- MachineInstr *getAlignaInstr(MachineFunction &MF) const;
+ const MachineInstr *getAlignaInstr(const MachineFunction &MF) const;
+
+ void insertCFIInstructions(MachineFunction &MF) const;
private:
typedef std::vector<CalleeSavedInfo> CSIVect;
@@ -86,6 +89,8 @@ private:
const HexagonRegisterInfo &HRI) const;
bool insertCSRRestoresInBlock(MachineBasicBlock &MBB, const CSIVect &CSI,
const HexagonRegisterInfo &HRI) const;
+ void insertCFIInstructionsAt(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator At) const;
void adjustForCalleeSavedRegsSpillCall(MachineFunction &MF) const;
bool replacePredRegPseudoSpillCode(MachineFunction &MF) const;
@@ -94,7 +99,7 @@ private:
void findShrunkPrologEpilog(MachineFunction &MF, MachineBasicBlock *&PrologB,
MachineBasicBlock *&EpilogB) const;
- bool shouldInlineCSR(llvm::MachineFunction&, const CSIVect&) const;
+ bool shouldInlineCSR(llvm::MachineFunction &MF, const CSIVect &CSI) const;
bool useSpillFunction(MachineFunction &MF, const CSIVect &CSI) const;
bool useRestoreFunction(MachineFunction &MF, const CSIVect &CSI) const;
};
diff --git a/lib/Target/Hexagon/HexagonGenExtract.cpp b/lib/Target/Hexagon/HexagonGenExtract.cpp
index 4d32208bd5aa..f26e2ff764d7 100644
--- a/lib/Target/Hexagon/HexagonGenExtract.cpp
+++ b/lib/Target/Hexagon/HexagonGenExtract.cpp
@@ -195,7 +195,7 @@ bool HexagonGenExtract::convert(Instruction *In) {
return false;
}
- IRBuilder<> IRB(BB, In);
+ IRBuilder<> IRB(In);
Intrinsic::ID IntId = (BW == 32) ? Intrinsic::hexagon_S2_extractu
: Intrinsic::hexagon_S2_extractup;
Module *Mod = BB->getParent()->getParent();
diff --git a/lib/Target/Hexagon/HexagonGenInsert.cpp b/lib/Target/Hexagon/HexagonGenInsert.cpp
index 096da949e77b..64a2b6cec18a 100644
--- a/lib/Target/Hexagon/HexagonGenInsert.cpp
+++ b/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -77,9 +77,8 @@ namespace {
namespace {
// Set of virtual registers, based on BitVector.
struct RegisterSet : private BitVector {
- RegisterSet() : BitVector() {}
+ RegisterSet() = default;
explicit RegisterSet(unsigned s, bool t = false) : BitVector(s, t) {}
- RegisterSet(const RegisterSet &RS) : BitVector(RS) {}
using BitVector::clear;
@@ -1496,7 +1495,7 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) {
// version of DCE that preserves lifetime markers. Without it, merging
// of stack objects can fail to recognize and merge disjoint objects
// leading to unnecessary stack growth.
- Changed |= removeDeadCode(MDT->getRootNode());
+ Changed = removeDeadCode(MDT->getRootNode());
const HexagonEvaluator HE(*HRI, *MRI, *HII, MF);
BitTracker BTLoc(HE, MF);
@@ -1534,7 +1533,7 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) {
}
if (IFMap.empty())
- return false;
+ return Changed;
{
NamedRegionTimer _T("pruning", "hexinsert", TimingDetail);
@@ -1547,7 +1546,7 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) {
}
if (IFMap.empty())
- return false;
+ return Changed;
{
NamedRegionTimer _T("selection", "hexinsert", TimingDetail);
@@ -1572,13 +1571,15 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) {
for (unsigned i = 0, n = Out.size(); i < n; ++i)
IFMap.erase(Out[i]);
}
+ if (IFMap.empty())
+ return Changed;
{
NamedRegionTimer _T("generation", "hexinsert", TimingDetail);
- Changed = generateInserts();
+ generateInserts();
}
- return Changed;
+ return true;
}
diff --git a/lib/Target/Hexagon/HexagonGenMux.cpp b/lib/Target/Hexagon/HexagonGenMux.cpp
new file mode 100644
index 000000000000..c059d566709e
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonGenMux.cpp
@@ -0,0 +1,319 @@
+//===--- HexagonGenMux.cpp ------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// During instruction selection, MUX instructions are generated for
+// conditional assignments. Since such assignments often present an
+// opportunity to predicate instructions, HexagonExpandCondsets
+// expands MUXes into pairs of conditional transfers, and then proceeds
+// with predication of the producers/consumers of the registers involved.
+// This happens after exiting from the SSA form, but before the machine
+// instruction scheduler. After the scheduler and after the register
+// allocation there can be cases of pairs of conditional transfers
+// resulting from a MUX where neither of them was further predicated. If
+// these transfers are now placed far enough from the instruction defining
+// the predicate register, they cannot use the .new form. In such cases it
+// is better to collapse them back to a single MUX instruction.
+
+#define DEBUG_TYPE "hexmux"
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "HexagonTargetMachine.h"
+
+using namespace llvm;
+
+namespace llvm {
+ FunctionPass *createHexagonGenMux();
+ void initializeHexagonGenMuxPass(PassRegistry& Registry);
+}
+
+namespace {
+ class HexagonGenMux : public MachineFunctionPass {
+ public:
+ static char ID;
+ HexagonGenMux() : MachineFunctionPass(ID), HII(0), HRI(0) {
+ initializeHexagonGenMuxPass(*PassRegistry::getPassRegistry());
+ }
+ const char *getPassName() const override {
+ return "Hexagon generate mux instructions";
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ private:
+ const HexagonInstrInfo *HII;
+ const HexagonRegisterInfo *HRI;
+
+ struct CondsetInfo {
+ unsigned PredR;
+ unsigned TrueX, FalseX;
+ CondsetInfo() : PredR(0), TrueX(UINT_MAX), FalseX(UINT_MAX) {}
+ };
+ struct DefUseInfo {
+ BitVector Defs, Uses;
+ DefUseInfo() : Defs(), Uses() {}
+ DefUseInfo(const BitVector &D, const BitVector &U) : Defs(D), Uses(U) {}
+ };
+ struct MuxInfo {
+ MachineBasicBlock::iterator At;
+ unsigned DefR, PredR;
+ MachineOperand *SrcT, *SrcF;
+ MachineInstr *Def1, *Def2;
+ MuxInfo(MachineBasicBlock::iterator It, unsigned DR, unsigned PR,
+ MachineOperand *TOp, MachineOperand *FOp,
+ MachineInstr *D1, MachineInstr *D2)
+ : At(It), DefR(DR), PredR(PR), SrcT(TOp), SrcF(FOp), Def1(D1),
+ Def2(D2) {}
+ };
+ typedef DenseMap<MachineInstr*,unsigned> InstrIndexMap;
+ typedef DenseMap<unsigned,DefUseInfo> DefUseInfoMap;
+ typedef SmallVector<MuxInfo,4> MuxInfoList;
+
+ bool isRegPair(unsigned Reg) const {
+ return Hexagon::DoubleRegsRegClass.contains(Reg);
+ }
+ void getSubRegs(unsigned Reg, BitVector &SRs) const;
+ void expandReg(unsigned Reg, BitVector &Set) const;
+ void getDefsUses(const MachineInstr *MI, BitVector &Defs,
+ BitVector &Uses) const;
+ void buildMaps(MachineBasicBlock &B, InstrIndexMap &I2X,
+ DefUseInfoMap &DUM);
+ bool isCondTransfer(unsigned Opc) const;
+ unsigned getMuxOpcode(const MachineOperand &Src1,
+ const MachineOperand &Src2) const;
+ bool genMuxInBlock(MachineBasicBlock &B);
+ };
+
+ char HexagonGenMux::ID = 0;
+}
+
+INITIALIZE_PASS(HexagonGenMux, "hexagon-mux",
+ "Hexagon generate mux instructions", false, false)
+
+
+void HexagonGenMux::getSubRegs(unsigned Reg, BitVector &SRs) const {
+ for (MCSubRegIterator I(Reg, HRI); I.isValid(); ++I)
+ SRs[*I] = true;
+}
+
+
+void HexagonGenMux::expandReg(unsigned Reg, BitVector &Set) const {
+ if (isRegPair(Reg))
+ getSubRegs(Reg, Set);
+ else
+ Set[Reg] = true;
+}
+
+
+void HexagonGenMux::getDefsUses(const MachineInstr *MI, BitVector &Defs,
+ BitVector &Uses) const {
+ // First, get the implicit defs and uses for this instruction.
+ unsigned Opc = MI->getOpcode();
+ const MCInstrDesc &D = HII->get(Opc);
+ if (const MCPhysReg *R = D.ImplicitDefs)
+ while (*R)
+ expandReg(*R++, Defs);
+ if (const MCPhysReg *R = D.ImplicitUses)
+ while (*R)
+ expandReg(*R++, Uses);
+
+ // Look over all operands, and collect explicit defs and uses.
+ for (ConstMIOperands Mo(MI); Mo.isValid(); ++Mo) {
+ if (!Mo->isReg() || Mo->isImplicit())
+ continue;
+ unsigned R = Mo->getReg();
+ BitVector &Set = Mo->isDef() ? Defs : Uses;
+ expandReg(R, Set);
+ }
+}
+
+
+void HexagonGenMux::buildMaps(MachineBasicBlock &B, InstrIndexMap &I2X,
+ DefUseInfoMap &DUM) {
+ unsigned Index = 0;
+ unsigned NR = HRI->getNumRegs();
+ BitVector Defs(NR), Uses(NR);
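+ // Number the instructions in the block and record, for each index, the
+ // registers defined and used by that instruction.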
+
+ for (MachineBasicBlock::iterator I = B.begin(), E = B.end(); I != E; ++I) {
+ MachineInstr *MI = &*I;
+ I2X.insert(std::make_pair(MI, Index));
+ Defs.reset();
+ Uses.reset();
+ getDefsUses(MI, Defs, Uses);
+ DUM.insert(std::make_pair(Index, DefUseInfo(Defs, Uses)));
+ Index++;
+ }
+}
+
+
+bool HexagonGenMux::isCondTransfer(unsigned Opc) const {
+ switch (Opc) {
+ case Hexagon::A2_tfrt:
+ case Hexagon::A2_tfrf:
+ case Hexagon::C2_cmoveit:
+ case Hexagon::C2_cmoveif:
+ return true;
+ }
+ return false;
+}
+
+
+unsigned HexagonGenMux::getMuxOpcode(const MachineOperand &Src1,
+ const MachineOperand &Src2) const {
+ bool IsReg1 = Src1.isReg(), IsReg2 = Src2.isReg();
+ if (IsReg1)
+ return IsReg2 ? Hexagon::C2_mux : Hexagon::C2_muxir;
+ if (IsReg2)
+ return Hexagon::C2_muxri;
+
+ // Neither is a register. The first source is extendable, but the second
+ // is not (s8).
+ if (Src2.isImm() && isInt<8>(Src2.getImm()))
+ return Hexagon::C2_muxii;
+
+ return 0;
+}
+
+
+bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) {
+ bool Changed = false;
+ InstrIndexMap I2X;
+ DefUseInfoMap DUM;
+ buildMaps(B, I2X, DUM);
+
+ typedef DenseMap<unsigned,CondsetInfo> CondsetMap;
+ CondsetMap CM;
+ MuxInfoList ML;
+
+ MachineBasicBlock::iterator NextI, End = B.end();
+ for (MachineBasicBlock::iterator I = B.begin(); I != End; I = NextI) {
+ MachineInstr *MI = &*I;
+ NextI = std::next(I);
+ unsigned Opc = MI->getOpcode();
+ if (!isCondTransfer(Opc))
+ continue;
+ unsigned DR = MI->getOperand(0).getReg();
+ if (isRegPair(DR))
+ continue;
+
+ unsigned PR = MI->getOperand(1).getReg();
+ unsigned Idx = I2X.lookup(MI);
+ CondsetMap::iterator F = CM.find(DR);
+ bool IfTrue = HII->isPredicatedTrue(Opc);
+
+ // If there is no record of a conditional transfer for this register,
+ // or the predicate register differs, create a new record for it.
+ if (F != CM.end() && F->second.PredR != PR) {
+ CM.erase(F);
+ F = CM.end();
+ }
+ if (F == CM.end()) {
+ auto It = CM.insert(std::make_pair(DR, CondsetInfo()));
+ F = It.first;
+ F->second.PredR = PR;
+ }
+ CondsetInfo &CI = F->second;
+ if (IfTrue)
+ CI.TrueX = Idx;
+ else
+ CI.FalseX = Idx;
+ if (CI.TrueX == UINT_MAX || CI.FalseX == UINT_MAX)
+ continue;
+
+ // There is now a complete definition of DR, i.e. we have the predicate
+ // register, the definition if-true, and definition if-false.
+
+ // First, check if both definitions are far enough from the definition
+ // of the predicate register.
+ unsigned MinX = std::min(CI.TrueX, CI.FalseX);
+ unsigned MaxX = std::max(CI.TrueX, CI.FalseX);
+ unsigned SearchX = (MaxX > 4) ? MaxX-4 : 0;
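+ // Scan the (up to) 4 instructions that immediately precede the later
+ // transfer: if the predicate is defined in that window, the transfers
+ // may still be able to use the .new form, so leave them alone.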
+ bool NearDef = false;
+ for (unsigned X = SearchX; X < MaxX; ++X) {
+ const DefUseInfo &DU = DUM.lookup(X);
+ if (!DU.Defs[PR])
+ continue;
+ NearDef = true;
+ break;
+ }
+ if (NearDef)
+ continue;
+
+ // The predicate register is not defined in the last few instructions.
+ // Check if the conversion to MUX is possible (either "up", i.e. at the
+ // place of the earlier partial definition, or "down", where the later
+ // definition is located). Examine all defs and uses between these two
+ // definitions.
+ // SR1, SR2 - source registers from the first and the second definition.
+ MachineBasicBlock::iterator It1 = B.begin(), It2 = B.begin();
+ std::advance(It1, MinX);
+ std::advance(It2, MaxX);
+ MachineInstr *Def1 = It1, *Def2 = It2;
+ MachineOperand *Src1 = &Def1->getOperand(2), *Src2 = &Def2->getOperand(2);
+ unsigned SR1 = Src1->isReg() ? Src1->getReg() : 0;
+ unsigned SR2 = Src2->isReg() ? Src2->getReg() : 0;
+ bool Failure = false, CanUp = true, CanDown = true;
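+ // A def of the predicate, or any def or use of the destination register
+ // between the two transfers, makes combining impossible. A redefinition
+ // of SR1 prevents placing the MUX "down" (at the later transfer); a
+ // redefinition of SR2 prevents placing it "up" (at the earlier one).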
+ for (unsigned X = MinX+1; X < MaxX; X++) {
+ const DefUseInfo &DU = DUM.lookup(X);
+ if (DU.Defs[PR] || DU.Defs[DR] || DU.Uses[DR]) {
+ Failure = true;
+ break;
+ }
+ if (CanDown && DU.Defs[SR1])
+ CanDown = false;
+ if (CanUp && DU.Defs[SR2])
+ CanUp = false;
+ }
+ if (Failure || (!CanUp && !CanDown))
+ continue;
+
+ MachineOperand *SrcT = (MinX == CI.TrueX) ? Src1 : Src2;
+ MachineOperand *SrcF = (MinX == CI.FalseX) ? Src1 : Src2;
+ // Prefer "down", since this will move the MUX farther away from the
+ // predicate definition.
+ MachineBasicBlock::iterator At = CanDown ? Def2 : Def1;
+ ML.push_back(MuxInfo(At, DR, PR, SrcT, SrcF, Def1, Def2));
+ }
+
+ for (unsigned I = 0, N = ML.size(); I < N; ++I) {
+ MuxInfo &MX = ML[I];
+ MachineBasicBlock &B = *MX.At->getParent();
+ DebugLoc DL = MX.At->getDebugLoc();
+ unsigned MxOpc = getMuxOpcode(*MX.SrcT, *MX.SrcF);
+ if (!MxOpc)
+ continue;
+ BuildMI(B, MX.At, DL, HII->get(MxOpc), MX.DefR)
+ .addReg(MX.PredR)
+ .addOperand(*MX.SrcT)
+ .addOperand(*MX.SrcF);
+ B.erase(MX.Def1);
+ B.erase(MX.Def2);
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+bool HexagonGenMux::runOnMachineFunction(MachineFunction &MF) {
+ HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+ HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+ bool Changed = false;
+ for (auto &I : MF)
+ Changed |= genMuxInBlock(I);
+ return Changed;
+}
+
+FunctionPass *llvm::createHexagonGenMux() {
+ return new HexagonGenMux();
+}
+
diff --git a/lib/Target/Hexagon/HexagonGenPredicate.cpp b/lib/Target/Hexagon/HexagonGenPredicate.cpp
index 6905c4f6d125..d9675b5173d2 100644
--- a/lib/Target/Hexagon/HexagonGenPredicate.cpp
+++ b/lib/Target/Hexagon/HexagonGenPredicate.cpp
@@ -250,7 +250,7 @@ Register HexagonGenPredicate::getPredRegFor(const Register &Reg) {
unsigned NewPR = MRI->createVirtualRegister(PredRC);
// For convertible instructions, do not modify them, so that they can
- // be coverted later. Generate a copy from Reg to NewPR.
+ // be converted later. Generate a copy from Reg to NewPR.
if (isConvertibleToPredForm(DefI)) {
MachineBasicBlock::iterator DefIt = DefI;
BuildMI(B, std::next(DefIt), DL, TII->get(TargetOpcode::COPY), NewPR)
diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index 53b6bf617e8f..d20a809d6c09 100644
--- a/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -727,9 +727,9 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
// Phis that may feed into the loop.
LoopFeederMap LoopFeederPhi;
- // Check if the inital value may be zero and can be decremented in the first
+ // Check if the initial value may be zero and can be decremented in the first
// iteration. If the value is zero, the endloop instruction will not decrement
- // the loop counter, so we shoudn't generate a hardware loop in this case.
+ // the loop counter, so we shouldn't generate a hardware loop in this case.
if (loopCountMayWrapOrUnderFlow(Start, End, Loop->getLoopPreheader(), Loop,
LoopFeederPhi))
return nullptr;
@@ -1288,14 +1288,14 @@ bool HexagonHardwareLoops::orderBumpCompare(MachineInstr *BumpI,
typedef MachineBasicBlock::instr_iterator instr_iterator;
// Check if things are in order to begin with.
- for (instr_iterator I = BumpI, E = BB->instr_end(); I != E; ++I)
+ for (instr_iterator I(BumpI), E = BB->instr_end(); I != E; ++I)
if (&*I == CmpI)
return true;
// Out of order.
unsigned PredR = CmpI->getOperand(0).getReg();
bool FoundBump = false;
- instr_iterator CmpIt = CmpI, NextIt = std::next(CmpIt);
+ instr_iterator CmpIt = CmpI->getIterator(), NextIt = std::next(CmpIt);
for (instr_iterator I = NextIt, E = BB->instr_end(); I != E; ++I) {
MachineInstr *In = &*I;
for (unsigned i = 0, n = In->getNumOperands(); i < n; ++i) {
@@ -1307,9 +1307,7 @@ bool HexagonHardwareLoops::orderBumpCompare(MachineInstr *BumpI,
}
if (In == BumpI) {
- instr_iterator After = BumpI;
- instr_iterator From = CmpI;
- BB->splice(std::next(After), BB, From);
+ BB->splice(++BumpI->getIterator(), BB, CmpI->getIterator());
FoundBump = true;
break;
}
@@ -1440,7 +1438,7 @@ bool HexagonHardwareLoops::loopCountMayWrapOrUnderFlow(
if (Comparison::isSigned(Cmp))
return false;
- // Check if there is a comparison of the inital value. If the initial value
+ // Check if there is a comparison of the initial value. If the initial value
// is greater than or not equal to another value, then assume this is a
// range check.
if ((Cmp & Comparison::G) || Cmp == Comparison::NE)
@@ -1850,7 +1848,7 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop(
}
MachineBasicBlock *NewPH = MF->CreateMachineBasicBlock();
- MF->insert(Header, NewPH);
+ MF->insert(Header->getIterator(), NewPH);
if (Header->pred_size() > 2) {
// Ensure that the header has only two predecessors: the preheader and
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index 9123057e60d1..a0da945e7572 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -50,16 +50,21 @@ namespace {
class HexagonDAGToDAGISel : public SelectionDAGISel {
const HexagonTargetMachine& HTM;
const HexagonSubtarget *HST;
+ const HexagonInstrInfo *HII;
+ const HexagonRegisterInfo *HRI;
public:
explicit HexagonDAGToDAGISel(HexagonTargetMachine &tm,
CodeGenOpt::Level OptLevel)
- : SelectionDAGISel(tm, OptLevel), HTM(tm) {
+ : SelectionDAGISel(tm, OptLevel), HTM(tm), HST(nullptr), HII(nullptr),
+ HRI(nullptr) {
initializeHexagonDAGToDAGISelPass(*PassRegistry::getPassRegistry());
}
bool runOnMachineFunction(MachineFunction &MF) override {
// Reset the subtarget each time through.
HST = &MF.getSubtarget<HexagonSubtarget>();
+ HII = HST->getInstrInfo();
+ HRI = HST->getRegisterInfo();
SelectionDAGISel::runOnMachineFunction(MF);
return true;
}
@@ -104,7 +109,6 @@ public:
SDNode *SelectConstantFP(SDNode *N);
SDNode *SelectAdd(SDNode *N);
SDNode *SelectBitOp(SDNode *N);
- bool isConstExtProfitable(SDNode *N) const;
// XformMskToBitPosU5Imm - Returns the bit position which
// the single bit 32 bit mask represents.
@@ -139,8 +143,8 @@ public:
// type i32 where the negative literal is transformed into a positive literal
// for use in -= memops.
inline SDValue XformM5ToU5Imm(signed Imm, SDLoc DL) {
- assert( (Imm >= -31 && Imm <= -1) && "Constant out of range for Memops");
- return CurDAG->getTargetConstant( - Imm, DL, MVT::i32);
+ assert((Imm >= -31 && Imm <= -1) && "Constant out of range for Memops");
+ return CurDAG->getTargetConstant(-Imm, DL, MVT::i32);
}
// XformU7ToU7M1Imm - Return a target constant decremented by 1, in range
@@ -203,11 +207,10 @@ void llvm::initializeHexagonDAGToDAGISelPass(PassRegistry &Registry) {
// Intrinsics that return a predicate.
-static unsigned doesIntrinsicReturnPredicate(unsigned ID)
-{
+static bool doesIntrinsicReturnPredicate(unsigned ID) {
switch (ID) {
default:
- return 0;
+ return false;
case Intrinsic::hexagon_C2_cmpeq:
case Intrinsic::hexagon_C2_cmpgt:
case Intrinsic::hexagon_C2_cmpgtu:
@@ -244,7 +247,7 @@ static unsigned doesIntrinsicReturnPredicate(unsigned ID)
case Intrinsic::hexagon_C2_tfrrp:
case Intrinsic::hexagon_S2_tstbit_i:
case Intrinsic::hexagon_S2_tstbit_r:
- return 1;
+ return true;
}
}
@@ -258,8 +261,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoadSignExtend64(LoadSDNode *LD,
SDNode *OffsetNode = Offset.getNode();
int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue();
- const HexagonInstrInfo &TII = *HST->getInstrInfo();
- if (TII.isValidAutoIncImm(LoadedVT, Val)) {
+ if (HII->isValidAutoIncImm(LoadedVT, Val)) {
SDValue TargetConst = CurDAG->getTargetConstant(Val, dl, MVT::i32);
SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::i32,
MVT::Other, Base, TargetConst,
@@ -312,8 +314,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoadZeroExtend64(LoadSDNode *LD,
SDNode *OffsetNode = Offset.getNode();
int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue();
- const HexagonInstrInfo &TII = *HST->getInstrInfo();
- if (TII.isValidAutoIncImm(LoadedVT, Val)) {
+ if (HII->isValidAutoIncImm(LoadedVT, Val)) {
SDValue TargetConstVal = CurDAG->getTargetConstant(Val, dl, MVT::i32);
SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32);
SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32,
@@ -378,29 +379,46 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, SDLoc dl) {
// loads.
ISD::LoadExtType ExtType = LD->getExtensionType();
bool IsZeroExt = (ExtType == ISD::ZEXTLOAD || ExtType == ISD::EXTLOAD);
+ bool HasVecOffset = false;
// Figure out the opcode.
- const HexagonInstrInfo &TII = *HST->getInstrInfo();
if (LoadedVT == MVT::i64) {
- if (TII.isValidAutoIncImm(LoadedVT, Val))
+ if (HII->isValidAutoIncImm(LoadedVT, Val))
Opcode = Hexagon::L2_loadrd_pi;
else
Opcode = Hexagon::L2_loadrd_io;
} else if (LoadedVT == MVT::i32) {
- if (TII.isValidAutoIncImm(LoadedVT, Val))
+ if (HII->isValidAutoIncImm(LoadedVT, Val))
Opcode = Hexagon::L2_loadri_pi;
else
Opcode = Hexagon::L2_loadri_io;
} else if (LoadedVT == MVT::i16) {
- if (TII.isValidAutoIncImm(LoadedVT, Val))
+ if (HII->isValidAutoIncImm(LoadedVT, Val))
Opcode = IsZeroExt ? Hexagon::L2_loadruh_pi : Hexagon::L2_loadrh_pi;
else
Opcode = IsZeroExt ? Hexagon::L2_loadruh_io : Hexagon::L2_loadrh_io;
} else if (LoadedVT == MVT::i8) {
- if (TII.isValidAutoIncImm(LoadedVT, Val))
+ if (HII->isValidAutoIncImm(LoadedVT, Val))
Opcode = IsZeroExt ? Hexagon::L2_loadrub_pi : Hexagon::L2_loadrb_pi;
else
Opcode = IsZeroExt ? Hexagon::L2_loadrub_io : Hexagon::L2_loadrb_io;
+ } else if (LoadedVT == MVT::v16i32 || LoadedVT == MVT::v8i64 ||
+ LoadedVT == MVT::v32i16 || LoadedVT == MVT::v64i8) {
+ HasVecOffset = true;
+ if (HII->isValidAutoIncImm(LoadedVT, Val)) {
+ Opcode = Hexagon::V6_vL32b_pi;
+ }
+ else
+ Opcode = Hexagon::V6_vL32b_ai;
+ // 128B
+ } else if (LoadedVT == MVT::v32i32 || LoadedVT == MVT::v16i64 ||
+ LoadedVT == MVT::v64i16 || LoadedVT == MVT::v128i8) {
+ HasVecOffset = true;
+ if (HII->isValidAutoIncImm(LoadedVT, Val)) {
+ Opcode = Hexagon::V6_vL32b_pi_128B;
+ }
+ else
+ Opcode = Hexagon::V6_vL32b_ai_128B;
} else
llvm_unreachable("unknown memory type");
@@ -411,7 +429,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, SDLoc dl) {
if (LD->getValueType(0) == MVT::i64 && ExtType == ISD::SEXTLOAD)
return SelectIndexedLoadSignExtend64(LD, Opcode, dl);
- if (TII.isValidAutoIncImm(LoadedVT, Val)) {
+ if (HII->isValidAutoIncImm(LoadedVT, Val)) {
SDValue TargetConstVal = CurDAG->getTargetConstant(Val, dl, MVT::i32);
SDNode* Result = CurDAG->getMachineNode(Opcode, dl,
LD->getValueType(0),
@@ -420,15 +438,25 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, SDLoc dl) {
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
MemOp[0] = LD->getMemOperand();
cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
- const SDValue Froms[] = { SDValue(LD, 0),
- SDValue(LD, 1),
- SDValue(LD, 2)
- };
- const SDValue Tos[] = { SDValue(Result, 0),
- SDValue(Result, 1),
- SDValue(Result, 2)
- };
- ReplaceUses(Froms, Tos, 3);
+ if (HasVecOffset) {
+ const SDValue Froms[] = { SDValue(LD, 0),
+ SDValue(LD, 2)
+ };
+ const SDValue Tos[] = { SDValue(Result, 0),
+ SDValue(Result, 2)
+ };
+ ReplaceUses(Froms, Tos, 2);
+ } else {
+ const SDValue Froms[] = { SDValue(LD, 0),
+ SDValue(LD, 1),
+ SDValue(LD, 2)
+ };
+ const SDValue Tos[] = { SDValue(Result, 0),
+ SDValue(Result, 1),
+ SDValue(Result, 2)
+ };
+ ReplaceUses(Froms, Tos, 3);
+ }
return Result;
} else {
SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32);
@@ -487,8 +515,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, SDLoc dl) {
// Offset value must be within representable range
// and must have correct alignment properties.
- const HexagonInstrInfo &TII = *HST->getInstrInfo();
- if (TII.isValidAutoIncImm(StoredVT, Val)) {
+ if (HII->isValidAutoIncImm(StoredVT, Val)) {
unsigned Opcode = 0;
// Figure out the post inc version of opcode.
@@ -496,7 +523,15 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, SDLoc dl) {
else if (StoredVT == MVT::i32) Opcode = Hexagon::S2_storeri_pi;
else if (StoredVT == MVT::i16) Opcode = Hexagon::S2_storerh_pi;
else if (StoredVT == MVT::i8) Opcode = Hexagon::S2_storerb_pi;
- else llvm_unreachable("unknown memory type");
+ else if (StoredVT == MVT::v16i32 || StoredVT == MVT::v8i64 ||
+ StoredVT == MVT::v32i16 || StoredVT == MVT::v64i8) {
+ Opcode = Hexagon::V6_vS32b_pi;
+ }
+ // 128B
+ else if (StoredVT == MVT::v32i32 || StoredVT == MVT::v16i64 ||
+ StoredVT == MVT::v64i16 || StoredVT == MVT::v128i8) {
+ Opcode = Hexagon::V6_vS32b_pi_128B;
+ } else llvm_unreachable("unknown memory type");
if (ST->isTruncatingStore() && ValueVT.getSizeInBits() == 64) {
assert(StoredVT.getSizeInBits() < 64 && "Not a truncating store");
@@ -530,6 +565,13 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, SDLoc dl) {
else if (StoredVT == MVT::i32) Opcode = Hexagon::S2_storeri_io;
else if (StoredVT == MVT::i16) Opcode = Hexagon::S2_storerh_io;
else if (StoredVT == MVT::i8) Opcode = Hexagon::S2_storerb_io;
+ else if (StoredVT == MVT::v16i32 || StoredVT == MVT::v8i64 ||
+ StoredVT == MVT::v32i16 || StoredVT == MVT::v64i8)
+ Opcode = Hexagon::V6_vS32b_ai;
+ // 128B
+ else if (StoredVT == MVT::v32i32 || StoredVT == MVT::v16i64 ||
+ StoredVT == MVT::v64i16 || StoredVT == MVT::v128i8)
+ Opcode = Hexagon::V6_vS32b_ai_128B;
else llvm_unreachable("unknown memory type");
// Build regular store.
@@ -1113,14 +1155,12 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) {
}
if (Opc == ISD::AND) {
- if (((ValueVT == MVT::i32) &&
- (!((Val & 0x80000000) || (Val & 0x7fffffff)))) ||
- ((ValueVT == MVT::i64) &&
- (!((Val & 0x8000000000000000) || (Val & 0x7fffffff)))))
- // If it's simple AND, do the normal op.
- return SelectCode(N);
- else
+    // Check if this is a bit-clearing AND; if not, select code the usual way.
+ if ((ValueVT == MVT::i32 && isPowerOf2_32(~Val)) ||
+ (ValueVT == MVT::i64 && isPowerOf2_64(~Val)))
Val = ~Val;
+ else
+ return SelectCode(N);
}
   // If OR or AND is being fed by shl, srl or sra, don't do this change,
@@ -1128,7 +1168,8 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) {
// Traverse the DAG to see if there is shl, srl and sra.
if (Opc == ISD::OR || Opc == ISD::AND) {
switch (N->getOperand(0)->getOpcode()) {
- default: break;
+ default:
+ break;
case ISD::SRA:
case ISD::SRL:
case ISD::SHL:
@@ -1137,23 +1178,24 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) {
}
// Make sure it's power of 2.
- unsigned bitpos = 0;
+ unsigned BitPos = 0;
if (Opc != ISD::FABS && Opc != ISD::FNEG) {
- if (((ValueVT == MVT::i32) && !isPowerOf2_32(Val)) ||
- ((ValueVT == MVT::i64) && !isPowerOf2_64(Val)))
+ if ((ValueVT == MVT::i32 && !isPowerOf2_32(Val)) ||
+ (ValueVT == MVT::i64 && !isPowerOf2_64(Val)))
return SelectCode(N);
// Get the bit position.
- bitpos = countTrailingZeros(uint64_t(Val));
+ BitPos = countTrailingZeros(uint64_t(Val));
} else {
// For fabs and fneg, it's always the 31st bit.
- bitpos = 31;
+ BitPos = 31;
}
unsigned BitOpc = 0;
// Set the right opcode for bitwise operations.
- switch(Opc) {
- default: llvm_unreachable("Only bit-wise/abs/neg operations are allowed.");
+ switch (Opc) {
+ default:
+ llvm_unreachable("Only bit-wise/abs/neg operations are allowed.");
case ISD::AND:
case ISD::FABS:
BitOpc = Hexagon::S2_clrbit_i;
@@ -1169,7 +1211,7 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) {
SDNode *Result;
// Get the right SDVal for the opcode.
- SDValue SDVal = CurDAG->getTargetConstant(bitpos, dl, MVT::i32);
+ SDValue SDVal = CurDAG->getTargetConstant(BitPos, dl, MVT::i32);
if (ValueVT == MVT::i32 || ValueVT == MVT::f32) {
Result = CurDAG->getMachineNode(BitOpc, dl, ValueVT,
@@ -1198,7 +1240,7 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) {
MVT::i32, SDValue(Reg, 0));
// Clear/set/toggle hi or lo registers depending on the bit position.
- if (SubValueVT != MVT::f32 && bitpos < 32) {
+ if (SubValueVT != MVT::f32 && BitPos < 32) {
SDNode *Result0 = CurDAG->getMachineNode(BitOpc, dl, SubValueVT,
SubregLO, SDVal);
const SDValue Ops[] = { RegClass, SubregHI, SubregHiIdx,
@@ -1207,7 +1249,7 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) {
dl, ValueVT, Ops);
} else {
if (Opc != ISD::FABS && Opc != ISD::FNEG)
- SDVal = CurDAG->getTargetConstant(bitpos - 32, dl, MVT::i32);
+ SDVal = CurDAG->getTargetConstant(BitPos-32, dl, MVT::i32);
SDNode *Result0 = CurDAG->getMachineNode(BitOpc, dl, SubValueVT,
SubregHI, SDVal);
const SDValue Ops[] = { RegClass, SDValue(Result0, 0), SubregHiIdx,
@@ -1328,25 +1370,12 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
return false;
}
-bool HexagonDAGToDAGISel::isConstExtProfitable(SDNode *N) const {
- unsigned UseCount = 0;
- unsigned CallCount = 0;
- for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
- // Ignore call instructions.
- if (I->getOpcode() == ISD::CopyToReg)
- ++CallCount;
- UseCount++;
- }
-
- return (UseCount <= 1) || (CallCount > 1);
-
-}
void HexagonDAGToDAGISel::PreprocessISelDAG() {
SelectionDAG &DAG = *CurDAG;
std::vector<SDNode*> Nodes;
- for (auto I = DAG.allnodes_begin(), E = DAG.allnodes_end(); I != E; ++I)
- Nodes.push_back(I);
+ for (SDNode &Node : DAG.allnodes())
+ Nodes.push_back(&Node);
// Simplify: (or (select c x 0) z) -> (select c (or x z) z)
// (or (select c 0 y) z) -> (select c z (or y z))
@@ -1397,11 +1426,10 @@ void HexagonDAGToDAGISel::EmitFunctionEntryCode() {
return;
MachineFrameInfo *MFI = MF->getFrameInfo();
- MachineBasicBlock *EntryBB = MF->begin();
+ MachineBasicBlock *EntryBB = &MF->front();
unsigned AR = FuncInfo->CreateReg(MVT::i32);
unsigned MaxA = MFI->getMaxAlignment();
- auto &HII = *HST.getInstrInfo();
- BuildMI(EntryBB, DebugLoc(), HII.get(Hexagon::ALIGNA), AR)
+ BuildMI(EntryBB, DebugLoc(), HII->get(Hexagon::ALIGNA), AR)
.addImm(MaxA);
MF->getInfo<HexagonMachineFunctionInfo>()->setStackAlignBaseVReg(AR);
}
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index c739afb70c15..01670902e2b0 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -41,8 +41,8 @@ using namespace llvm;
#define DEBUG_TYPE "hexagon-lowering"
-static cl::opt<bool>
-EmitJumpTables("hexagon-emit-jump-tables", cl::init(true), cl::Hidden,
+static cl::opt<bool> EmitJumpTables("hexagon-emit-jump-tables",
+ cl::init(true), cl::Hidden,
cl::desc("Control jump table emission on Hexagon target"));
static cl::opt<bool> EnableHexSDNodeSched("enable-hexagon-sdnode-sched",
@@ -98,6 +98,9 @@ public:
}
// Implement calling convention for Hexagon.
+
+static bool IsHvxVectorType(MVT ty);
+
static bool
CC_Hexagon(unsigned ValNo, MVT ValVT,
MVT LocVT, CCValAssign::LocInfo LocInfo,
@@ -114,6 +117,11 @@ CC_Hexagon64(unsigned ValNo, MVT ValVT,
ISD::ArgFlagsTy ArgFlags, CCState &State);
static bool
+CC_HexagonVector(unsigned ValNo, MVT ValVT,
+ MVT LocVT, CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State);
+
+static bool
RetCC_Hexagon(unsigned ValNo, MVT ValVT,
MVT LocVT, CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State);
@@ -129,6 +137,11 @@ RetCC_Hexagon64(unsigned ValNo, MVT ValVT,
ISD::ArgFlagsTy ArgFlags, CCState &State);
static bool
+RetCC_HexagonVector(unsigned ValNo, MVT ValVT,
+ MVT LocVT, CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State);
+
+static bool
CC_Hexagon_VarArg (unsigned ValNo, MVT ValVT,
MVT LocVT, CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State) {
@@ -169,15 +182,43 @@ CC_Hexagon_VarArg (unsigned ValNo, MVT ValVT,
State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo));
return false;
}
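+  // Vector varargs are always passed on the stack, aligned to the vector size.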
+ if (LocVT == MVT::v2i64 || LocVT == MVT::v4i32 || LocVT == MVT::v8i16 ||
+ LocVT == MVT::v16i8) {
+ ofst = State.AllocateStack(16, 16);
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo));
+ return false;
+ }
+ if (LocVT == MVT::v4i64 || LocVT == MVT::v8i32 || LocVT == MVT::v16i16 ||
+ LocVT == MVT::v32i8) {
+ ofst = State.AllocateStack(32, 32);
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo));
+ return false;
+ }
+ if (LocVT == MVT::v8i64 || LocVT == MVT::v16i32 || LocVT == MVT::v32i16 ||
+ LocVT == MVT::v64i8 || LocVT == MVT::v512i1) {
+ ofst = State.AllocateStack(64, 64);
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo));
+ return false;
+ }
+ if (LocVT == MVT::v16i64 || LocVT == MVT::v32i32 || LocVT == MVT::v64i16 ||
+ LocVT == MVT::v128i8 || LocVT == MVT::v1024i1) {
+ ofst = State.AllocateStack(128, 128);
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo));
+ return false;
+ }
+ if (LocVT == MVT::v32i64 || LocVT == MVT::v64i32 || LocVT == MVT::v128i16 ||
+ LocVT == MVT::v256i8) {
+ ofst = State.AllocateStack(256, 256);
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo));
+ return false;
+ }
+
llvm_unreachable(nullptr);
}
-static bool
-CC_Hexagon (unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
-
+static bool CC_Hexagon (unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) {
if (ArgFlags.isByVal()) {
// Passed on stack.
unsigned Offset = State.AllocateStack(ArgFlags.getByValSize(),
@@ -213,6 +254,17 @@ CC_Hexagon (unsigned ValNo, MVT ValVT,
return false;
}
+ if (LocVT == MVT::v8i32 || LocVT == MVT::v16i16 || LocVT == MVT::v32i8) {
+ unsigned Offset = State.AllocateStack(ArgFlags.getByValSize(), 32);
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+ return false;
+ }
+
+ if (IsHvxVectorType(LocVT)) {
+ if (!CC_HexagonVector(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
+ return false;
+ }
+
return true; // CC didn't match.
}
@@ -260,10 +312,82 @@ static bool CC_Hexagon64(unsigned ValNo, MVT ValVT,
return false;
}
+static bool CC_HexagonVector(unsigned ValNo, MVT ValVT,
+ MVT LocVT, CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+
+ static const MCPhysReg VecLstS[] = { Hexagon::V0, Hexagon::V1,
+ Hexagon::V2, Hexagon::V3,
+ Hexagon::V4, Hexagon::V5,
+ Hexagon::V6, Hexagon::V7,
+ Hexagon::V8, Hexagon::V9,
+ Hexagon::V10, Hexagon::V11,
+ Hexagon::V12, Hexagon::V13,
+ Hexagon::V14, Hexagon::V15};
+ static const MCPhysReg VecLstD[] = { Hexagon::W0, Hexagon::W1,
+ Hexagon::W2, Hexagon::W3,
+ Hexagon::W4, Hexagon::W5,
+ Hexagon::W6, Hexagon::W7};
+ auto &MF = State.getMachineFunction();
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ bool UseHVX = HST.useHVXOps();
+ bool UseHVXDbl = HST.useHVXDblOps();
+
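+  // Try to assign a vector register (single or pair, depending on the type and
+  // HVX mode); if none is available, pass the value on the stack.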
+ if ((UseHVX && !UseHVXDbl) &&
+ (LocVT == MVT::v8i64 || LocVT == MVT::v16i32 || LocVT == MVT::v32i16 ||
+ LocVT == MVT::v64i8 || LocVT == MVT::v512i1)) {
+ if (unsigned Reg = State.AllocateReg(VecLstS)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return false;
+ }
+ unsigned Offset = State.AllocateStack(64, 64);
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+ return false;
+ }
+ if ((UseHVX && !UseHVXDbl) &&
+ (LocVT == MVT::v16i64 || LocVT == MVT::v32i32 || LocVT == MVT::v64i16 ||
+ LocVT == MVT::v128i8)) {
+ if (unsigned Reg = State.AllocateReg(VecLstD)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return false;
+ }
+ unsigned Offset = State.AllocateStack(128, 128);
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+ return false;
+ }
+ // 128B Mode
+ if ((UseHVX && UseHVXDbl) &&
+ (LocVT == MVT::v32i64 || LocVT == MVT::v64i32 || LocVT == MVT::v128i16 ||
+ LocVT == MVT::v256i8)) {
+ if (unsigned Reg = State.AllocateReg(VecLstD)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return false;
+ }
+ unsigned Offset = State.AllocateStack(256, 256);
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+ return false;
+ }
+ if ((UseHVX && UseHVXDbl) &&
+ (LocVT == MVT::v16i64 || LocVT == MVT::v32i32 || LocVT == MVT::v64i16 ||
+ LocVT == MVT::v128i8 || LocVT == MVT::v1024i1)) {
+ if (unsigned Reg = State.AllocateReg(VecLstS)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return false;
+ }
+ unsigned Offset = State.AllocateStack(128, 128);
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+ return false;
+ }
+ return true;
+}
+
static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT,
MVT LocVT, CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State) {
-
+ auto &MF = State.getMachineFunction();
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ bool UseHVX = HST.useHVXOps();
+ bool UseHVXDbl = HST.useHVXDblOps();
if (LocVT == MVT::i1 ||
LocVT == MVT::i8 ||
@@ -282,8 +406,24 @@ static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT,
} else if (LocVT == MVT::v8i8 || LocVT == MVT::v4i16 || LocVT == MVT::v2i32) {
LocVT = MVT::i64;
LocInfo = CCValAssign::BCvt;
+ } else if (LocVT == MVT::v64i8 || LocVT == MVT::v32i16 ||
+ LocVT == MVT::v16i32 || LocVT == MVT::v8i64 ||
+ LocVT == MVT::v512i1) {
+ LocVT = MVT::v16i32;
+ ValVT = MVT::v16i32;
+ LocInfo = CCValAssign::Full;
+ } else if (LocVT == MVT::v128i8 || LocVT == MVT::v64i16 ||
+ LocVT == MVT::v32i32 || LocVT == MVT::v16i64 ||
+ (LocVT == MVT::v1024i1 && UseHVX && UseHVXDbl)) {
+ LocVT = MVT::v32i32;
+ ValVT = MVT::v32i32;
+ LocInfo = CCValAssign::Full;
+ } else if (LocVT == MVT::v256i8 || LocVT == MVT::v128i16 ||
+ LocVT == MVT::v64i32 || LocVT == MVT::v32i64) {
+ LocVT = MVT::v64i32;
+ ValVT = MVT::v64i32;
+ LocInfo = CCValAssign::Full;
}
-
if (LocVT == MVT::i32 || LocVT == MVT::f32) {
if (!RetCC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
return false;
@@ -293,7 +433,10 @@ static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT,
if (!RetCC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
return false;
}
-
+ if (LocVT == MVT::v16i32 || LocVT == MVT::v32i32 || LocVT == MVT::v64i32) {
+ if (!RetCC_HexagonVector(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
+ return false;
+ }
return true; // CC didn't match.
}
@@ -328,6 +471,52 @@ static bool RetCC_Hexagon64(unsigned ValNo, MVT ValVT,
return false;
}
+static bool RetCC_HexagonVector(unsigned ValNo, MVT ValVT,
+ MVT LocVT, CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ auto &MF = State.getMachineFunction();
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ bool UseHVX = HST.useHVXOps();
+ bool UseHVXDbl = HST.useHVXDblOps();
+
+ unsigned OffSiz = 64;
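+  // Stack slot size and alignment to use when no return register is available.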
+ if (LocVT == MVT::v16i32) {
+ if (unsigned Reg = State.AllocateReg(Hexagon::V0)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return false;
+ }
+ } else if (LocVT == MVT::v32i32) {
+ unsigned Req = (UseHVX && UseHVXDbl) ? Hexagon::V0 : Hexagon::W0;
+ if (unsigned Reg = State.AllocateReg(Req)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return false;
+ }
+ OffSiz = 128;
+ } else if (LocVT == MVT::v64i32) {
+ if (unsigned Reg = State.AllocateReg(Hexagon::W0)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return false;
+ }
+ OffSiz = 256;
+ }
+
+ unsigned Offset = State.AllocateStack(OffSiz, OffSiz);
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+ return false;
+}
+
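+// Promote loads and stores of VT to loads and stores of PromotedLdStVT.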
+void HexagonTargetLowering::promoteLdStType(EVT VT, EVT PromotedLdStVT) {
+ if (VT != PromotedLdStVT) {
+ setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
+ AddPromotedToType(ISD::LOAD, VT.getSimpleVT(),
+ PromotedLdStVT.getSimpleVT());
+
+ setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
+ AddPromotedToType(ISD::STORE, VT.getSimpleVT(),
+ PromotedLdStVT.getSimpleVT());
+ }
+}
+
SDValue
HexagonTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG)
const {
@@ -351,6 +540,15 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
MachinePointerInfo(), MachinePointerInfo());
}
+static bool IsHvxVectorType(MVT ty) {
+ return (ty == MVT::v8i64 || ty == MVT::v16i32 || ty == MVT::v32i16 ||
+ ty == MVT::v64i8 ||
+ ty == MVT::v16i64 || ty == MVT::v32i32 || ty == MVT::v64i16 ||
+ ty == MVT::v128i8 ||
+ ty == MVT::v32i64 || ty == MVT::v64i32 || ty == MVT::v128i16 ||
+ ty == MVT::v256i8 ||
+ ty == MVT::v512i1 || ty == MVT::v1024i1);
+}
// LowerReturn - Lower ISD::RET. If a struct is larger than 8 bytes and is
// passed by value, the function prototype is modified to return void and
@@ -463,19 +661,15 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Check for varargs.
int NumNamedVarArgParams = -1;
- if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee))
- {
- const Function* CalleeFn = nullptr;
- Callee = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, MVT::i32);
- if ((CalleeFn = dyn_cast<Function>(GA->getGlobal())))
- {
+ if (GlobalAddressSDNode *GAN = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ const GlobalValue *GV = GAN->getGlobal();
+ Callee = DAG.getTargetGlobalAddress(GV, dl, MVT::i32);
+ if (const Function* F = dyn_cast<Function>(GV)) {
// If a function has zero args and is a vararg function, that's
// disallowed so it must be an undeclared function. Do not assume
// varargs if the callee is undefined.
- if (CalleeFn->isVarArg() &&
- CalleeFn->getFunctionType()->getNumParams() != 0) {
- NumNamedVarArgParams = CalleeFn->getFunctionType()->getNumParams();
- }
+ if (F->isVarArg() && F->getFunctionType()->getNumParams() != 0)
+ NumNamedVarArgParams = F->getFunctionType()->getNumParams();
}
}
@@ -519,11 +713,16 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDValue StackPtr =
DAG.getCopyFromReg(Chain, dl, HRI.getStackRegister(), PtrVT);
+ bool NeedsArgAlign = false;
+ unsigned LargestAlignSeen = 0;
// Walk the register/memloc assignments, inserting copies/loads.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ // Record if we need > 8 byte alignment on an argument.
+ bool ArgAlign = IsHvxVectorType(VA.getValVT());
+ NeedsArgAlign |= ArgAlign;
// Promote the value if needed.
switch (VA.getLocInfo()) {
@@ -549,13 +748,17 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDValue MemAddr = DAG.getConstant(LocMemOffset, dl,
StackPtr.getValueType());
MemAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, MemAddr);
+ if (ArgAlign)
+ LargestAlignSeen = std::max(LargestAlignSeen,
+ VA.getLocVT().getStoreSizeInBits() >> 3);
if (Flags.isByVal()) {
// The argument is a struct passed by value. According to LLVM, "Arg"
         // is a pointer.
MemOpChains.push_back(CreateCopyOfByValArgument(Arg, MemAddr, Chain,
Flags, DAG, dl));
} else {
- MachinePointerInfo LocPI = MachinePointerInfo::getStack(LocMemOffset);
+ MachinePointerInfo LocPI = MachinePointerInfo::getStack(
+ DAG.getMachineFunction(), LocMemOffset);
SDValue S = DAG.getStore(Chain, dl, Arg, MemAddr, LocPI, false,
false, 0);
MemOpChains.push_back(S);
@@ -569,6 +772,17 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
}
+ if (NeedsArgAlign && Subtarget.hasV60TOps()) {
+ DEBUG(dbgs() << "Function needs byte stack align due to call args\n");
+ MachineFrameInfo* MFI = DAG.getMachineFunction().getFrameInfo();
+    // V6 vectors passed by value have 64- or 128-byte alignment, depending on
+    // whether we are in 64-byte or 128-byte vector mode.
+ bool UseHVXDbl = Subtarget.useHVXDblOps();
+ assert(Subtarget.useHVXOps());
+ const unsigned ObjAlign = UseHVXDbl ? 128 : 64;
+ LargestAlignSeen = std::max(LargestAlignSeen, ObjAlign);
+ MFI->ensureMaxAlignment(LargestAlignSeen);
+ }
// Transform all store nodes into one single node because all store
// nodes are independent of each other.
if (!MemOpChains.empty())
@@ -613,12 +827,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
// direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
// node so that legalize doesn't hack it.
- if (flag_aligned_memcpy) {
- const char *MemcpyName =
- "__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes";
- Callee = DAG.getTargetExternalSymbol(MemcpyName, PtrVT);
- flag_aligned_memcpy = false;
- } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, PtrVT);
} else if (ExternalSymbolSDNode *S =
dyn_cast<ExternalSymbolSDNode>(Callee)) {
@@ -668,7 +877,19 @@ static bool getIndexedAddressParts(SDNode *Ptr, EVT VT,
if (Ptr->getOpcode() != ISD::ADD)
return false;
- if (VT == MVT::i64 || VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) {
+ auto &HST = static_cast<const HexagonSubtarget&>(DAG.getSubtarget());
+ bool UseHVX = HST.useHVXOps();
+ bool UseHVXDbl = HST.useHVXDblOps();
+
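+  // Indexed addressing is also allowed for native HVX vector types in the
+  // currently selected (64-byte or 128-byte) mode.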
+ bool ValidHVXDblType =
+ (UseHVX && UseHVXDbl) && (VT == MVT::v32i32 || VT == MVT::v16i64 ||
+ VT == MVT::v64i16 || VT == MVT::v128i8);
+ bool ValidHVXType =
+ UseHVX && !UseHVXDbl && (VT == MVT::v16i32 || VT == MVT::v8i64 ||
+ VT == MVT::v32i16 || VT == MVT::v64i8);
+
+ if (ValidHVXDblType || ValidHVXType ||
+ VT == MVT::i64 || VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) {
isInc = (Ptr->getOpcode() == ISD::ADD);
Base = Ptr->getOperand(0);
Offset = Ptr->getOperand(1);
@@ -679,23 +900,6 @@ static bool getIndexedAddressParts(SDNode *Ptr, EVT VT,
return false;
}
-// TODO: Put this function along with the other isS* functions in
-// HexagonISelDAGToDAG.cpp into a common file. Or better still, use the
-// functions defined in HexagonOperands.td.
-static bool Is_PostInc_S4_Offset(SDNode * S, int ShiftAmount) {
- ConstantSDNode *N = cast<ConstantSDNode>(S);
-
- // immS4 predicate - True if the immediate fits in a 4-bit sign extended.
- // field.
- int64_t v = (int64_t)N->getSExtValue();
- int64_t m = 0;
- if (ShiftAmount > 0) {
- m = v % ShiftAmount;
- v = v >> ShiftAmount;
- }
- return (v <= 7) && (v >= -8) && (m == 0);
-}
-
/// getPostIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if this node can be
/// combined with a load / store to form a post-indexed load / store.
@@ -724,18 +928,20 @@ bool HexagonTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
bool isInc = false;
bool isLegal = getIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
isInc, DAG);
- // ShiftAmount = number of left-shifted bits in the Hexagon instruction.
- int ShiftAmount = VT.getSizeInBits() / 16;
- if (isLegal && Is_PostInc_S4_Offset(Offset.getNode(), ShiftAmount)) {
- AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
- return true;
+ if (isLegal) {
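+    // Accept only offsets that are valid auto-increment immediates for VT.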
+ auto &HII = *Subtarget.getInstrInfo();
+ int32_t OffsetVal = cast<ConstantSDNode>(Offset.getNode())->getSExtValue();
+ if (HII.isValidAutoIncImm(VT, OffsetVal)) {
+ AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
+ return true;
+ }
}
return false;
}
-SDValue HexagonTargetLowering::LowerINLINEASM(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue
+HexagonTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
SDNode *Node = Op.getNode();
MachineFunction &MF = DAG.getMachineFunction();
auto &FuncInfo = *MF.getInfo<HexagonMachineFunctionInfo>();
@@ -784,47 +990,6 @@ SDValue HexagonTargetLowering::LowerINLINEASM(SDValue Op,
return Op;
}
-
-//
-// Taken from the XCore backend.
-//
-SDValue HexagonTargetLowering::
-LowerBR_JT(SDValue Op, SelectionDAG &DAG) const
-{
- SDValue Chain = Op.getOperand(0);
- SDValue Table = Op.getOperand(1);
- SDValue Index = Op.getOperand(2);
- SDLoc dl(Op);
- JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
- unsigned JTI = JT->getIndex();
- MachineFunction &MF = DAG.getMachineFunction();
- const MachineJumpTableInfo *MJTI = MF.getJumpTableInfo();
- SDValue TargetJT = DAG.getTargetJumpTable(JT->getIndex(), MVT::i32);
-
- // Mark all jump table targets as address taken.
- const std::vector<MachineJumpTableEntry> &JTE = MJTI->getJumpTables();
- const std::vector<MachineBasicBlock*> &JTBBs = JTE[JTI].MBBs;
- for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
- MachineBasicBlock *MBB = JTBBs[i];
- MBB->setHasAddressTaken();
- // This line is needed to set the hasAddressTaken flag on the BasicBlock
- // object.
- BlockAddress::get(const_cast<BasicBlock *>(MBB->getBasicBlock()));
- }
-
- SDValue JumpTableBase = DAG.getNode(
- HexagonISD::JT, dl, getPointerTy(DAG.getDataLayout()), TargetJT);
- SDValue ShiftIndex = DAG.getNode(ISD::SHL, dl, MVT::i32, Index,
- DAG.getConstant(2, dl, MVT::i32));
- SDValue JTAddress = DAG.getNode(ISD::ADD, dl, MVT::i32, JumpTableBase,
- ShiftIndex);
- SDValue LoadTarget = DAG.getLoad(MVT::i32, dl, Chain, JTAddress,
- MachinePointerInfo(), false, false, false,
- 0);
- return DAG.getNode(HexagonISD::BR_JT, dl, MVT::Other, Chain, LoadTarget);
-}
-
-
SDValue
HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
@@ -850,7 +1015,10 @@ HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue AC = DAG.getConstant(A, dl, MVT::i32);
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
- return DAG.getNode(HexagonISD::ALLOCA, dl, VTs, Chain, Size, AC);
+ SDValue AA = DAG.getNode(HexagonISD::ALLOCA, dl, VTs, Chain, Size, AC);
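+  // Carry any debug values on the original node over to the new ALLOCA node.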
+ if (Op.getNode()->getHasDebugValue())
+ DAG.TransferDbgValues(Op, AA);
+ return AA;
}
SDValue
@@ -882,7 +1050,8 @@ const {
   // equal to) 8 bytes. If not, no address will be passed into the callee and
   // the callee returns the result directly through R0/R1.
- SmallVector<SDValue, 4> MemOps;
+ SmallVector<SDValue, 8> MemOps;
+ bool UseHVX = Subtarget.useHVXOps(), UseHVXDbl = Subtarget.useHVXDblOps();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
@@ -908,6 +1077,42 @@ const {
RegInfo.createVirtualRegister(&Hexagon::DoubleRegsRegClass);
RegInfo.addLiveIn(VA.getLocReg(), VReg);
InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+
+ // Single Vector
+ } else if ((RegVT == MVT::v8i64 || RegVT == MVT::v16i32 ||
+ RegVT == MVT::v32i16 || RegVT == MVT::v64i8)) {
+ unsigned VReg =
+ RegInfo.createVirtualRegister(&Hexagon::VectorRegsRegClass);
+ RegInfo.addLiveIn(VA.getLocReg(), VReg);
+ InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+ } else if (UseHVX && UseHVXDbl &&
+ ((RegVT == MVT::v16i64 || RegVT == MVT::v32i32 ||
+ RegVT == MVT::v64i16 || RegVT == MVT::v128i8))) {
+ unsigned VReg =
+ RegInfo.createVirtualRegister(&Hexagon::VectorRegs128BRegClass);
+ RegInfo.addLiveIn(VA.getLocReg(), VReg);
+ InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+
+ // Double Vector
+ } else if ((RegVT == MVT::v16i64 || RegVT == MVT::v32i32 ||
+ RegVT == MVT::v64i16 || RegVT == MVT::v128i8)) {
+ unsigned VReg =
+ RegInfo.createVirtualRegister(&Hexagon::VecDblRegsRegClass);
+ RegInfo.addLiveIn(VA.getLocReg(), VReg);
+ InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+ } else if (UseHVX && UseHVXDbl &&
+ ((RegVT == MVT::v32i64 || RegVT == MVT::v64i32 ||
+ RegVT == MVT::v128i16 || RegVT == MVT::v256i8))) {
+ unsigned VReg =
+ RegInfo.createVirtualRegister(&Hexagon::VecDblRegs128BRegClass);
+ RegInfo.addLiveIn(VA.getLocReg(), VReg);
+ InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+ } else if (RegVT == MVT::v512i1 || RegVT == MVT::v1024i1) {
+ assert(0 && "need to support VecPred regs");
+ unsigned VReg =
+ RegInfo.createVirtualRegister(&Hexagon::VecPredRegsRegClass);
+ RegInfo.addLiveIn(VA.getLocReg(), VReg);
+ InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
} else {
assert (0);
}
@@ -1056,8 +1261,8 @@ SDValue HexagonTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
-SDValue HexagonTargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG)
- const {
+SDValue
+HexagonTargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue PredOp = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1), Op2 = Op.getOperand(2);
EVT OpVT = Op1.getValueType();
@@ -1163,16 +1368,33 @@ SDValue HexagonTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue
HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
EVT ValTy = Op.getValueType();
- SDLoc dl(Op);
- ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
- SDValue Res;
- if (CP->isMachineConstantPoolEntry())
- Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), ValTy,
- CP->getAlignment());
+ ConstantPoolSDNode *CPN = cast<ConstantPoolSDNode>(Op);
+ unsigned Align = CPN->getAlignment();
+ Reloc::Model RM = HTM.getRelocationModel();
+ unsigned char TF = (RM == Reloc::PIC_) ? HexagonII::MO_PCREL : 0;
+
+ SDValue T;
+ if (CPN->isMachineConstantPoolEntry())
+ T = DAG.getTargetConstantPool(CPN->getMachineCPVal(), ValTy, Align, TF);
else
- Res = DAG.getTargetConstantPool(CP->getConstVal(), ValTy,
- CP->getAlignment());
- return DAG.getNode(HexagonISD::CP, dl, ValTy, Res);
+ T = DAG.getTargetConstantPool(CPN->getConstVal(), ValTy, Align, TF);
+ if (RM == Reloc::PIC_)
+ return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Op), ValTy, T);
+ return DAG.getNode(HexagonISD::CP, SDLoc(Op), ValTy, T);
+}
+
+SDValue
+HexagonTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ int Idx = cast<JumpTableSDNode>(Op)->getIndex();
+ Reloc::Model RM = HTM.getRelocationModel();
+ if (RM == Reloc::PIC_) {
+ SDValue T = DAG.getTargetJumpTable(Idx, VT, HexagonII::MO_PCREL);
+ return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Op), VT, T);
+ }
+
+ SDValue T = DAG.getTargetJumpTable(Idx, VT);
+ return DAG.getNode(HexagonISD::JT, SDLoc(Op), VT, T);
}
SDValue
@@ -1219,52 +1441,70 @@ HexagonTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
return FrameAddr;
}
-SDValue HexagonTargetLowering::LowerATOMIC_FENCE(SDValue Op,
- SelectionDAG& DAG) const {
+SDValue
+HexagonTargetLowering::LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const {
SDLoc dl(Op);
return DAG.getNode(HexagonISD::BARRIER, dl, MVT::Other, Op.getOperand(0));
}
-SDValue HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op,
- SelectionDAG &DAG) const {
- SDValue Result;
- const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
- int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+SDValue
+HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
+ auto *GAN = cast<GlobalAddressSDNode>(Op);
auto PtrVT = getPointerTy(DAG.getDataLayout());
- Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
+ auto *GV = GAN->getGlobal();
+ int64_t Offset = GAN->getOffset();
+
+ auto &HLOF = *HTM.getObjFileLowering();
+ Reloc::Model RM = HTM.getRelocationModel();
- const HexagonTargetObjectFile *TLOF =
- static_cast<const HexagonTargetObjectFile *>(
- getTargetMachine().getObjFileLowering());
- if (TLOF->IsGlobalInSmallSection(GV, getTargetMachine())) {
- return DAG.getNode(HexagonISD::CONST32_GP, dl, PtrVT, Result);
+ if (RM == Reloc::Static) {
+ SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
+ if (HLOF.IsGlobalInSmallSection(GV, HTM))
+ return DAG.getNode(HexagonISD::CONST32_GP, dl, PtrVT, GA);
+ return DAG.getNode(HexagonISD::CONST32, dl, PtrVT, GA);
}
- return DAG.getNode(HexagonISD::CONST32, dl, PtrVT, Result);
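+  // Symbols that bind locally (internal linkage, hidden visibility, or local
+  // non-function definitions) are addressed PC-relative; everything else is
+  // accessed through the GOT.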
+ bool UsePCRel = GV->hasInternalLinkage() || GV->hasHiddenVisibility() ||
+ (GV->hasLocalLinkage() && !isa<Function>(GV));
+ if (UsePCRel) {
+ SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset,
+ HexagonII::MO_PCREL);
+ return DAG.getNode(HexagonISD::AT_PCREL, dl, PtrVT, GA);
+ }
+
+ // Use GOT index.
+ SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
+ SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, HexagonII::MO_GOT);
+ SDValue Off = DAG.getConstant(Offset, dl, MVT::i32);
+ return DAG.getNode(HexagonISD::AT_GOT, dl, PtrVT, GOT, GA, Off);
}
// Specifies that for loads and stores VT can be promoted to PromotedLdStVT.
-void HexagonTargetLowering::promoteLdStType(EVT VT, EVT PromotedLdStVT) {
- if (VT != PromotedLdStVT) {
- setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
- AddPromotedToType(ISD::LOAD, VT.getSimpleVT(),
- PromotedLdStVT.getSimpleVT());
+SDValue
+HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
+ const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+ SDLoc dl(Op);
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
- setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
- AddPromotedToType(ISD::STORE, VT.getSimpleVT(),
- PromotedLdStVT.getSimpleVT());
+ Reloc::Model RM = HTM.getRelocationModel();
+ if (RM == Reloc::Static) {
+ SDValue A = DAG.getTargetBlockAddress(BA, PtrVT);
+ return DAG.getNode(HexagonISD::CONST32_GP, dl, PtrVT, A);
}
+
+ SDValue A = DAG.getTargetBlockAddress(BA, PtrVT, 0, HexagonII::MO_PCREL);
+ return DAG.getNode(HexagonISD::AT_PCREL, dl, PtrVT, A);
}
SDValue
-HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
- const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
- SDValue BA_SD = DAG.getTargetBlockAddress(BA, MVT::i32);
- SDLoc dl(Op);
- return DAG.getNode(HexagonISD::CONST32_GP, dl,
- getPointerTy(DAG.getDataLayout()), BA_SD);
+HexagonTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG)
+ const {
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue GOTSym = DAG.getTargetExternalSymbol(HEXAGON_GOT_SYM_NAME, PtrVT,
+ HexagonII::MO_PCREL);
+ return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Op), PtrVT, GOTSym);
}
//===----------------------------------------------------------------------===//
@@ -1272,18 +1512,19 @@ HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
//===----------------------------------------------------------------------===//
HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
- const HexagonSubtarget &STI)
+ const HexagonSubtarget &ST)
: TargetLowering(TM), HTM(static_cast<const HexagonTargetMachine&>(TM)),
- Subtarget(STI) {
+ Subtarget(ST) {
bool IsV4 = !Subtarget.hasV5TOps();
auto &HRI = *Subtarget.getRegisterInfo();
+ bool UseHVX = Subtarget.useHVXOps();
+ bool UseHVXSgl = Subtarget.useHVXSglOps();
+ bool UseHVXDbl = Subtarget.useHVXDblOps();
setPrefLoopAlignment(4);
setPrefFunctionAlignment(4);
setMinFunctionAlignment(2);
setInsertFencesForAtomic(false);
- setExceptionPointerRegister(Hexagon::R0);
- setExceptionSelectorRegister(Hexagon::R1);
setStackPointerRegisterToSaveRestore(HRI.getStackRegister());
if (EnableHexSDNodeSched)
@@ -1320,6 +1561,31 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass);
}
+ if (Subtarget.hasV60TOps()) {
+ if (Subtarget.useHVXSglOps()) {
+ addRegisterClass(MVT::v64i8, &Hexagon::VectorRegsRegClass);
+ addRegisterClass(MVT::v32i16, &Hexagon::VectorRegsRegClass);
+ addRegisterClass(MVT::v16i32, &Hexagon::VectorRegsRegClass);
+ addRegisterClass(MVT::v8i64, &Hexagon::VectorRegsRegClass);
+ addRegisterClass(MVT::v128i8, &Hexagon::VecDblRegsRegClass);
+ addRegisterClass(MVT::v64i16, &Hexagon::VecDblRegsRegClass);
+ addRegisterClass(MVT::v32i32, &Hexagon::VecDblRegsRegClass);
+ addRegisterClass(MVT::v16i64, &Hexagon::VecDblRegsRegClass);
+ addRegisterClass(MVT::v512i1, &Hexagon::VecPredRegsRegClass);
+ } else if (Subtarget.useHVXDblOps()) {
+ addRegisterClass(MVT::v128i8, &Hexagon::VectorRegs128BRegClass);
+ addRegisterClass(MVT::v64i16, &Hexagon::VectorRegs128BRegClass);
+ addRegisterClass(MVT::v32i32, &Hexagon::VectorRegs128BRegClass);
+ addRegisterClass(MVT::v16i64, &Hexagon::VectorRegs128BRegClass);
+ addRegisterClass(MVT::v256i8, &Hexagon::VecDblRegs128BRegClass);
+ addRegisterClass(MVT::v128i16, &Hexagon::VecDblRegs128BRegClass);
+ addRegisterClass(MVT::v64i32, &Hexagon::VecDblRegs128BRegClass);
+ addRegisterClass(MVT::v32i64, &Hexagon::VecDblRegs128BRegClass);
+ addRegisterClass(MVT::v1024i1, &Hexagon::VecPredRegs128BRegClass);
+ }
+
+ }
+
//
// Handling of scalar operations.
//
@@ -1336,10 +1602,12 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ConstantFP, MVT::f64, Legal); // Default: expand
setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
+ setOperationAction(ISD::JumpTable, MVT::i32, Custom);
setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
setOperationAction(ISD::INLINEASM, MVT::Other, Custom);
setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
+ setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
// Custom legalize GlobalAddress nodes into CONST32.
@@ -1361,11 +1629,10 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
if (EmitJumpTables)
- setOperationAction(ISD::BR_JT, MVT::Other, Custom);
+ setMinimumJumpTableEntries(2);
else
- setOperationAction(ISD::BR_JT, MVT::Other, Expand);
- // Increase jump tables cutover to 5, was 4.
- setMinimumJumpTableEntries(MinimumJumpTables);
+ setMinimumJumpTableEntries(MinimumJumpTables);
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
// Hexagon has instructions for add/sub with carry. The problem with
// modeling these instructions is that they produce 2 results: Rdd and Px.
@@ -1420,9 +1687,10 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MULHS, MVT::i64, Expand);
for (unsigned IntExpOp :
- {ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM, ISD::SDIVREM, ISD::UDIVREM,
- ISD::ROTL, ISD::ROTR, ISD::BSWAP, ISD::SHL_PARTS, ISD::SRA_PARTS,
- ISD::SRL_PARTS, ISD::SMUL_LOHI, ISD::UMUL_LOHI}) {
+ { ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM,
+ ISD::SDIVREM, ISD::UDIVREM, ISD::ROTL, ISD::ROTR,
+ ISD::BSWAP, ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS,
+ ISD::SMUL_LOHI, ISD::UMUL_LOHI }) {
setOperationAction(IntExpOp, MVT::i32, Expand);
setOperationAction(IntExpOp, MVT::i64, Expand);
}
@@ -1475,7 +1743,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
// Set the action for vector operations to "expand", then override it with
// either "custom" or "legal" for specific cases.
- static unsigned VectExpOps[] = {
+ static const unsigned VectExpOps[] = {
// Integer arithmetic:
ISD::ADD, ISD::SUB, ISD::MUL, ISD::SDIV, ISD::UDIV,
ISD::SREM, ISD::UREM, ISD::SDIVREM, ISD::UDIVREM, ISD::ADDC,
@@ -1539,7 +1807,21 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VSELECT, MVT::v2i16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
-
+ if (UseHVX) {
+ if (UseHVXSgl) {
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v128i8, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i16, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i64, Custom);
+ } else if (UseHVXDbl) {
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v256i8, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v128i16, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i64, Custom);
+ } else {
+ llvm_unreachable("Unrecognized HVX mode");
+ }
+ }
// Subtarget-specific operation actions.
//
if (Subtarget.hasV5TOps()) {
@@ -1586,7 +1868,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
for (ISD::CondCode FPExpCCV4 :
{ISD::SETOEQ, ISD::SETOGT, ISD::SETOLT, ISD::SETOGE, ISD::SETOLE,
- ISD::SETUO, ISD::SETO}) {
+ ISD::SETUO, ISD::SETO}) {
setCondCodeAction(FPExpCCV4, MVT::f32, Expand);
setCondCodeAction(FPExpCCV4, MVT::f64, Expand);
}
@@ -1599,6 +1881,13 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setIndexedStoreAction(ISD::POST_INC, LSXTy, Legal);
}
+ if (UseHVXDbl) {
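+    // Post-increment loads and stores are also legal for the 128B HVX types.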
+ for (MVT VT : {MVT::v128i8, MVT::v64i16, MVT::v32i32, MVT::v16i64}) {
+ setIndexedLoadAction(ISD::POST_INC, VT, Legal);
+ setIndexedStoreAction(ISD::POST_INC, VT, Legal);
+ }
+ }
+
computeRegisterProperties(&HRI);
//
@@ -1720,7 +2009,6 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
case HexagonISD::AT_GOT: return "HexagonISD::AT_GOT";
case HexagonISD::AT_PCREL: return "HexagonISD::AT_PCREL";
case HexagonISD::BARRIER: return "HexagonISD::BARRIER";
- case HexagonISD::BR_JT: return "HexagonISD::BR_JT";
case HexagonISD::CALLR: return "HexagonISD::CALLR";
case HexagonISD::CALLv3nr: return "HexagonISD::CALLv3nr";
case HexagonISD::CALLv3: return "HexagonISD::CALLv3";
@@ -1737,7 +2025,6 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
case HexagonISD::INSERTRP: return "HexagonISD::INSERTRP";
case HexagonISD::JT: return "HexagonISD::JT";
case HexagonISD::PACKHL: return "HexagonISD::PACKHL";
- case HexagonISD::PIC_ADD: return "HexagonISD::PIC_ADD";
case HexagonISD::POPCOUNT: return "HexagonISD::POPCOUNT";
case HexagonISD::RET_FLAG: return "HexagonISD::RET_FLAG";
case HexagonISD::SHUFFEB: return "HexagonISD::SHUFFEB";
@@ -1754,6 +2041,7 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
case HexagonISD::VCMPWEQ: return "HexagonISD::VCMPWEQ";
case HexagonISD::VCMPWGT: return "HexagonISD::VCMPWGT";
case HexagonISD::VCMPWGTU: return "HexagonISD::VCMPWGTU";
+ case HexagonISD::VCOMBINE: return "HexagonISD::VCOMBINE";
case HexagonISD::VSHLH: return "HexagonISD::VSHLH";
case HexagonISD::VSHLW: return "HexagonISD::VSHLW";
case HexagonISD::VSPLATB: return "HexagonISD::VSPLTB";
@@ -1923,8 +2211,7 @@ HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
unsigned Size = VT.getSizeInBits();
- // A vector larger than 64 bits cannot be represented in Hexagon.
- // Expand will split the vector.
+ // Only handle vectors of 64 bits or shorter.
if (Size > 64)
return SDValue();
@@ -2058,58 +2345,61 @@ SDValue
HexagonTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
+ bool UseHVX = Subtarget.useHVXOps();
EVT VT = Op.getValueType();
unsigned NElts = Op.getNumOperands();
- SDValue Vec = Op.getOperand(0);
- EVT VecVT = Vec.getValueType();
- SDValue Width = DAG.getConstant(VecVT.getSizeInBits(), dl, MVT::i64);
- SDValue Shifted = DAG.getNode(ISD::SHL, dl, MVT::i64, Width,
- DAG.getConstant(32, dl, MVT::i64));
- SDValue ConstVal = DAG.getConstant(0, dl, MVT::i64);
-
- ConstantSDNode *W = dyn_cast<ConstantSDNode>(Width);
- ConstantSDNode *S = dyn_cast<ConstantSDNode>(Shifted);
-
- if ((VecVT.getSimpleVT() == MVT::v2i16) && (NElts == 2) && W && S) {
- if ((W->getZExtValue() == 32) && ((S->getZExtValue() >> 32) == 32)) {
- // We are trying to concat two v2i16 to a single v4i16.
- SDValue Vec0 = Op.getOperand(1);
- SDValue Combined = DAG.getNode(HexagonISD::COMBINE, dl, VT, Vec0, Vec);
- return DAG.getNode(ISD::BITCAST, dl, VT, Combined);
+ SDValue Vec0 = Op.getOperand(0);
+ EVT VecVT = Vec0.getValueType();
+ unsigned Width = VecVT.getSizeInBits();
+
+ if (NElts == 2) {
+ MVT ST = VecVT.getSimpleVT();
+ // We are trying to concat two v2i16 to a single v4i16, or two v4i8
+ // into a single v8i8.
+ if (ST == MVT::v2i16 || ST == MVT::v4i8)
+ return DAG.getNode(HexagonISD::COMBINE, dl, VT, Op.getOperand(1), Vec0);
+
+ if (UseHVX) {
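+      // Concatenate two full HVX vectors with VCOMBINE, bitcasting through the
+      // canonical i32 vector types.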
+ assert((Width == 64*8 && Subtarget.useHVXSglOps()) ||
+ (Width == 128*8 && Subtarget.useHVXDblOps()));
+ SDValue Vec1 = Op.getOperand(1);
+ MVT OpTy = Subtarget.useHVXSglOps() ? MVT::v16i32 : MVT::v32i32;
+ MVT ReTy = Subtarget.useHVXSglOps() ? MVT::v32i32 : MVT::v64i32;
+ SDValue B0 = DAG.getNode(ISD::BITCAST, dl, OpTy, Vec0);
+ SDValue B1 = DAG.getNode(ISD::BITCAST, dl, OpTy, Vec1);
+ SDValue VC = DAG.getNode(HexagonISD::VCOMBINE, dl, ReTy, B1, B0);
+ return DAG.getNode(ISD::BITCAST, dl, VT, VC);
}
}
- if ((VecVT.getSimpleVT() == MVT::v4i8) && (NElts == 2) && W && S) {
- if ((W->getZExtValue() == 32) && ((S->getZExtValue() >> 32) == 32)) {
- // We are trying to concat two v4i8 to a single v8i8.
- SDValue Vec0 = Op.getOperand(1);
- SDValue Combined = DAG.getNode(HexagonISD::COMBINE, dl, VT, Vec0, Vec);
- return DAG.getNode(ISD::BITCAST, dl, VT, Combined);
- }
- }
+ if (VT.getSizeInBits() != 32 && VT.getSizeInBits() != 64)
+ return SDValue();
+
+ SDValue C0 = DAG.getConstant(0, dl, MVT::i64);
+ SDValue C32 = DAG.getConstant(32, dl, MVT::i64);
+ SDValue W = DAG.getConstant(Width, dl, MVT::i64);
+ // Create the "width" part of the argument to insert_rp/insertp_rp.
+ SDValue S = DAG.getNode(ISD::SHL, dl, MVT::i64, W, C32);
+ SDValue V = C0;
for (unsigned i = 0, e = NElts; i != e; ++i) {
- unsigned OpIdx = NElts - i - 1;
- SDValue Operand = Op.getOperand(OpIdx);
+ unsigned N = NElts-i-1;
+ SDValue OpN = Op.getOperand(N);
- if (VT.getSizeInBits() == 64 &&
- Operand.getValueType().getSizeInBits() == 32) {
+ if (VT.getSizeInBits() == 64 && OpN.getValueType().getSizeInBits() == 32) {
SDValue C = DAG.getConstant(0, dl, MVT::i32);
- Operand = DAG.getNode(HexagonISD::COMBINE, dl, VT, C, Operand);
+ OpN = DAG.getNode(HexagonISD::COMBINE, dl, VT, C, OpN);
}
-
- SDValue Idx = DAG.getConstant(OpIdx, dl, MVT::i64);
- SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i64, Idx, Width);
- SDValue Combined = DAG.getNode(ISD::OR, dl, MVT::i64, Shifted, Offset);
- const SDValue Ops[] = {ConstVal, Operand, Combined};
-
+ SDValue Idx = DAG.getConstant(N, dl, MVT::i64);
+ SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i64, Idx, W);
+ SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, S, Offset);
if (VT.getSizeInBits() == 32)
- ConstVal = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i32, Ops);
+ V = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i32, {V, OpN, Or});
else
- ConstVal = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i64, Ops);
+ V = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i64, {V, OpN, Or});
}
- return DAG.getNode(ISD::BITCAST, dl, VT, ConstVal);
+ return DAG.getNode(ISD::BITCAST, dl, VT, V);
}
SDValue
@@ -2301,6 +2591,7 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SHL:
case ISD::SRL: return LowerVECTOR_SHIFT(Op, DAG);
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
+ case ISD::JumpTable: return LowerJumpTable(Op, DAG);
case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
// Frame & Return address. Currently unimplemented.
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
@@ -2308,8 +2599,8 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG);
case ISD::GlobalAddress: return LowerGLOBALADDRESS(Op, DAG);
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
+ case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
case ISD::VASTART: return LowerVASTART(Op, DAG);
- case ISD::BR_JT: return LowerBR_JT(Op, DAG);
// Custom lower some vector loads.
case ISD::LOAD: return LowerLOAD(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
@@ -2321,6 +2612,16 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
}
}
+/// Returns relocation base for the given PIC jumptable.
+SDValue
+HexagonTargetLowering::getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const {
+ int Idx = cast<JumpTableSDNode>(Table)->getIndex();
+ EVT VT = Table.getValueType();
+ SDValue T = DAG.getTargetJumpTable(Idx, VT, HexagonII::MO_PCREL);
+ return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Table), VT, T);
+}
+
MachineBasicBlock *
HexagonTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *BB)
@@ -2343,6 +2644,8 @@ HexagonTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
std::pair<unsigned, const TargetRegisterClass *>
HexagonTargetLowering::getRegForInlineAsmConstraint(
const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
+ bool UseHVX = Subtarget.useHVXOps(), UseHVXDbl = Subtarget.useHVXDblOps();
+
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'r': // R0-R31
@@ -2358,6 +2661,42 @@ HexagonTargetLowering::getRegForInlineAsmConstraint(
case MVT::f64:
return std::make_pair(0U, &Hexagon::DoubleRegsRegClass);
}
+ case 'q': // q0-q3
+ switch (VT.SimpleTy) {
+ default:
+ llvm_unreachable("getRegForInlineAsmConstraint Unhandled data type");
+ case MVT::v1024i1:
+ case MVT::v512i1:
+ case MVT::v32i16:
+ case MVT::v16i32:
+ case MVT::v64i8:
+ case MVT::v8i64:
+ return std::make_pair(0U, &Hexagon::VecPredRegsRegClass);
+ }
+ case 'v': // V0-V31
+ switch (VT.SimpleTy) {
+ default:
+ llvm_unreachable("getRegForInlineAsmConstraint Unhandled data type");
+ case MVT::v16i32:
+ case MVT::v32i16:
+ case MVT::v64i8:
+ case MVT::v8i64:
+ return std::make_pair(0U, &Hexagon::VectorRegsRegClass);
+ case MVT::v32i32:
+ case MVT::v64i16:
+ case MVT::v16i64:
+ case MVT::v128i8:
+ if (Subtarget.hasV60TOps() && UseHVX && UseHVXDbl)
+ return std::make_pair(0U, &Hexagon::VectorRegs128BRegClass);
+ else
+ return std::make_pair(0U, &Hexagon::VecDblRegsRegClass);
+ case MVT::v256i8:
+ case MVT::v128i16:
+ case MVT::v64i32:
+ case MVT::v32i64:
+ return std::make_pair(0U, &Hexagon::VecDblRegs128BRegClass);
+ }
+
default:
llvm_unreachable("Unknown asm register class");
}
@@ -2397,6 +2736,14 @@ bool HexagonTargetLowering::isLegalAddressingMode(const DataLayout &DL,
return true;
}
+/// Return true if folding a constant offset with the given GlobalAddress is
+/// legal. It is frequently not legal in PIC relocation models.
+bool HexagonTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA)
+ const {
+ return HTM.getRelocationModel() == Reloc::Static;
+}
+
+
/// isLegalICmpImmediate - Return true if the specified immediate is legal
/// icmp immediate, that is the target has icmp instructions which can compare
/// a register against the immediate without having to materialize the
@@ -2428,8 +2775,8 @@ bool HexagonTargetLowering::IsEligibleForTailCallOptimization(
// ***************************************************************************
// If this is a tail call via a function pointer, then don't do it!
- if (!(dyn_cast<GlobalAddressSDNode>(Callee))
- && !(dyn_cast<ExternalSymbolSDNode>(Callee))) {
+ if (!(isa<GlobalAddressSDNode>(Callee)) &&
+ !(isa<ExternalSymbolSDNode>(Callee))) {
return false;
}
@@ -2467,6 +2814,41 @@ bool llvm::isPositiveHalfWord(SDNode *N) {
}
}
+std::pair<const TargetRegisterClass*, uint8_t>
+HexagonTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
+ MVT VT) const {
+ const TargetRegisterClass *RRC = nullptr;
+
+ uint8_t Cost = 1;
+ switch (VT.SimpleTy) {
+ default:
+ return TargetLowering::findRepresentativeClass(TRI, VT);
+ case MVT::v64i8:
+ case MVT::v32i16:
+ case MVT::v16i32:
+ case MVT::v8i64:
+ RRC = &Hexagon::VectorRegsRegClass;
+ break;
+ case MVT::v128i8:
+ case MVT::v64i16:
+ case MVT::v32i32:
+ case MVT::v16i64:
+ if (Subtarget.hasV60TOps() && Subtarget.useHVXOps() &&
+ Subtarget.useHVXDblOps())
+ RRC = &Hexagon::VectorRegs128BRegClass;
+ else
+ RRC = &Hexagon::VecDblRegsRegClass;
+ break;
+ case MVT::v256i8:
+ case MVT::v128i16:
+ case MVT::v64i32:
+ case MVT::v32i64:
+ RRC = &Hexagon::VecDblRegs128BRegClass;
+ break;
+ }
+ return std::make_pair(RRC, Cost);
+}
+
Value *HexagonTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const {
BasicBlock *BB = Builder.GetInsertBlock();
@@ -2498,13 +2880,15 @@ Value *HexagonTargetLowering::emitStoreConditional(IRBuilder<> &Builder,
return Ext;
}
-bool HexagonTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+TargetLowering::AtomicExpansionKind
+HexagonTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
// Do not expand loads and stores that don't exceed 64 bits.
- return LI->getType()->getPrimitiveSizeInBits() > 64;
+ return LI->getType()->getPrimitiveSizeInBits() > 64
+ ? AtomicExpansionKind::LLOnly
+ : AtomicExpansionKind::None;
}
bool HexagonTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
// Do not expand loads and stores that don't exceed 64 bits.
return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() > 64;
}
-
diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index 2642abffaddd..bf378b922220 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -35,16 +35,14 @@ bool isPositiveHalfWord(SDNode *N);
ALLOCA,
ARGEXTEND,
- PIC_ADD,
- AT_GOT,
- AT_PCREL,
+ AT_GOT, // Index in GOT.
+ AT_PCREL, // Offset relative to PC.
CALLv3, // A V3+ call instruction.
CALLv3nr, // A V3+ call instruction that doesn't return.
CALLR,
RET_FLAG, // Return with a flag operand.
- BR_JT, // Branch through jump table.
BARRIER, // Memory barrier.
JT, // Jump table.
CP, // Constant pool.
@@ -80,6 +78,7 @@ bool isPositiveHalfWord(SDNode *N);
INSERTRP,
EXTRACTU,
EXTRACTURP,
+ VCOMBINE,
TC_RETURN,
EH_RETURN,
DCFETCH,
@@ -127,7 +126,6 @@ bool isPositiveHalfWord(SDNode *N);
SDValue LowerEXTRACT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEH_LABEL(SDValue Op, SelectionDAG &DAG) const;
@@ -137,6 +135,7 @@ bool isPositiveHalfWord(SDNode *N);
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const override;
SDValue LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
@@ -163,8 +162,23 @@ bool isPositiveHalfWord(SDNode *N);
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *BB) const override;
+ /// If a physical register, this returns the register that receives the
+ /// exception address on entry to an EH pad.
+ unsigned
+ getExceptionPointerRegister(const Constant *PersonalityFn) const override {
+ return Hexagon::R0;
+ }
+
+ /// If a physical register, this returns the register that receives the
+ /// exception typeid on entry to a landing pad.
+ unsigned
+ getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
+ return Hexagon::R1;
+ }
+
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
EVT getSetCCResultType(const DataLayout &, LLVMContext &C,
EVT VT) const override {
if (!VT.isVector())
@@ -200,6 +214,10 @@ bool isPositiveHalfWord(SDNode *N);
/// TODO: Handle pre/postinc as well.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
Type *Ty, unsigned AS) const override;
+ /// Return true if folding a constant offset with the given GlobalAddress
+ /// is legal. It is frequently not legal in PIC relocation models.
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
/// isLegalICmpImmediate - Return true if the specified immediate is legal
@@ -208,20 +226,26 @@ bool isPositiveHalfWord(SDNode *N);
/// the immediate into a register.
bool isLegalICmpImmediate(int64_t Imm) const override;
+ /// Returns relocation base for the given PIC jumptable.
+ SDValue getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG)
+ const override;
+
// Handling of atomic RMW instructions.
- bool hasLoadLinkedStoreConditional() const override {
- return true;
- }
Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const override;
Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
Value *Addr, AtomicOrdering Ord) const override;
- bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
+ AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
- AtomicRMWExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI)
- const override {
- return AtomicRMWExpansionKind::LLSC;
+ AtomicExpansionKind
+ shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override {
+ return AtomicExpansionKind::LLSC;
}
+
+ protected:
+ std::pair<const TargetRegisterClass*, uint8_t>
+ findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT)
+ const override;
};
} // end namespace llvm
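
The header half of the same change: hasLoadLinkedStoreConditional() is gone, and shouldExpandAtomicRMWInIR() now answers AtomicExpansionKind::LLSC, i.e. every atomic read-modify-write is rewritten into a load-linked/store-conditional retry loop built from emitLoadLinked/emitStoreConditional. A rough sketch of that loop's shape in portable C++ (compare_exchange_weak stands in for the target's locked load/store, and the function name is made up for illustration; this is not the IR the expansion pass actually emits):

    #include <atomic>
    #include <cstdio>

    // Same "load, compute, try to commit, retry on failure" shape that
    // AtomicExpansionKind::LLSC asks the atomic expansion pass to build.
    int atomic_fetch_add_llsc_shape(std::atomic<int> &Obj, int V) {
      int Old = Obj.load(std::memory_order_relaxed);      // "load-linked"
      while (!Obj.compare_exchange_weak(Old, Old + V))    // "store-conditional"
        ;                                                 // retry until the commit sticks
      return Old;                                         // RMW returns the old value
    }

    int main() {
      std::atomic<int> X{40};
      int Prev = atomic_fetch_add_llsc_shape(X, 2);
      std::printf("prev=%d now=%d\n", Prev, X.load());    // prev=40 now=42
    }
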
diff --git a/lib/Target/Hexagon/HexagonInstrAlias.td b/lib/Target/Hexagon/HexagonInstrAlias.td
new file mode 100644
index 000000000000..5a1a69b40d4d
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonInstrAlias.td
@@ -0,0 +1,462 @@
+//==- HexagonInstrAlias.td - Hexagon Instruction Aliases ---*- tablegen -*--==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Hexagon Instruction Mappings
+//===----------------------------------------------------------------------===//
+
+
+def : InstAlias<"memb({GP}+#$addr) = $Nt.new",
+ (S2_storerbnewgp u16_0Imm:$addr, IntRegs:$Nt)>;
+def : InstAlias<"memh({GP}+#$addr) = $Nt.new",
+ (S2_storerhnewgp u16_1Imm:$addr, IntRegs:$Nt)>;
+def : InstAlias<"memw({GP}+#$addr) = $Nt.new",
+ (S2_storerinewgp u16_2Imm:$addr, IntRegs:$Nt)>;
+def : InstAlias<"memb({GP}+#$addr) = $Nt",
+ (S2_storerbgp u16_0Imm:$addr, IntRegs:$Nt)>;
+def : InstAlias<"memh({GP}+#$addr) = $Nt",
+ (S2_storerhgp u16_1Imm:$addr, IntRegs:$Nt)>;
+def : InstAlias<"memh({GP}+#$addr) = $Nt.h",
+ (S2_storerfgp u16_1Imm:$addr, IntRegs:$Nt)>;
+def : InstAlias<"memw({GP}+#$addr) = $Nt",
+ (S2_storerigp u16_2Imm:$addr, IntRegs:$Nt)>;
+def : InstAlias<"memd({GP}+#$addr) = $Nt",
+ (S2_storerdgp u16_3Imm:$addr, DoubleRegs:$Nt)>;
+
+def : InstAlias<"$Nt = memb({GP}+#$addr)",
+ (L2_loadrbgp IntRegs:$Nt, u16_0Imm:$addr)>;
+def : InstAlias<"$Nt = memub({GP}+#$addr)",
+ (L2_loadrubgp IntRegs:$Nt, u16_0Imm:$addr)>;
+def : InstAlias<"$Nt = memh({GP}+#$addr)",
+ (L2_loadrhgp IntRegs:$Nt, u16_1Imm:$addr)>;
+def : InstAlias<"$Nt = memuh({GP}+#$addr)",
+ (L2_loadruhgp IntRegs:$Nt, u16_1Imm:$addr)>;
+def : InstAlias<"$Nt = memw({GP}+#$addr)",
+ (L2_loadrigp IntRegs:$Nt, u16_2Imm:$addr)>;
+def : InstAlias<"$Nt = memd({GP}+#$addr)",
+ (L2_loadrdgp DoubleRegs:$Nt, u16_3Imm:$addr)>;
+
+// Alias of: memXX($Rs+#XX) = $Rt to memXX($Rs) = $Rt
+def : InstAlias<"memb($Rs) = $Rt",
+ (S2_storerb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"memh($Rs) = $Rt",
+ (S2_storerh_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"memh($Rs) = $Rt.h",
+ (S2_storerf_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"memw($Rs) = $Rt",
+ (S2_storeri_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"memb($Rs) = $Rt.new",
+ (S2_storerbnew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"memh($Rs) = $Rt.new",
+ (S2_storerhnew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"memw($Rs) = $Rt.new",
+ (S2_storerinew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"memb($Rs) = #$S8",
+ (S4_storeirb_io IntRegs:$Rs, 0, s8Ext:$S8), 0>;
+
+def : InstAlias<"memh($Rs) = #$S8",
+ (S4_storeirh_io IntRegs:$Rs, 0, s8Ext:$S8), 0>;
+
+def : InstAlias<"memw($Rs) = #$S8",
+ (S4_storeiri_io IntRegs:$Rs, 0, s8Ext:$S8), 0>;
+
+def : InstAlias<"memd($Rs) = $Rtt",
+ (S2_storerd_io IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
+
+def : InstAlias<"memb($Rs) = setbit(#$U5)",
+ (L4_ior_memopb_io IntRegs:$Rs, 0, u5Imm:$U5), 0>;
+
+def : InstAlias<"memh($Rs) = setbit(#$U5)",
+ (L4_ior_memoph_io IntRegs:$Rs, 0, u5Imm:$U5), 0>;
+
+def : InstAlias<"memw($Rs) = setbit(#$U5)",
+ (L4_ior_memopw_io IntRegs:$Rs, 0, u5Imm:$U5), 0>;
+
+def : InstAlias<"memb($Rs) = clrbit(#$U5)",
+ (L4_iand_memopb_io IntRegs:$Rs, 0, u5Imm:$U5), 0>;
+
+def : InstAlias<"memh($Rs) = clrbit(#$U5)",
+ (L4_iand_memoph_io IntRegs:$Rs, 0, u5Imm:$U5), 0>;
+
+def : InstAlias<"memw($Rs) = clrbit(#$U5)",
+ (L4_iand_memopw_io IntRegs:$Rs, 0, u5Imm:$U5), 0>;
+
+// Alias of: $Rd = memXX($Rs+#XX) to $Rd = memXX($Rs)
+def : InstAlias<"$Rd = memb($Rs)",
+ (L2_loadrb_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rd = memub($Rs)",
+ (L2_loadrub_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rd = memh($Rs)",
+ (L2_loadrh_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rd = memuh($Rs)",
+ (L2_loadruh_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rd = memw($Rs)",
+ (L2_loadri_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rdd = memd($Rs)",
+ (L2_loadrd_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rd = memubh($Rs)",
+ (L2_loadbzw2_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rdd = memubh($Rs)",
+ (L2_loadbzw4_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rd = membh($Rs)",
+ (L2_loadbsw2_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rdd = membh($Rs)",
+ (L2_loadbsw4_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rdd = memb_fifo($Rs)",
+ (L2_loadalignb_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rdd = memh_fifo($Rs)",
+ (L2_loadalignh_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
+
+// Alias of: if ($Pt) $Rd = memXX($Rs + #$u6_X)
+// to: if ($Pt) $Rd = memXX($Rs)
+def : InstAlias<"if ($Pt) $Rd = memb($Rs)",
+ (L2_ploadrbt_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt) $Rd = memub($Rs)",
+ (L2_ploadrubt_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt) $Rd = memh($Rs)",
+ (L2_ploadrht_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt) $Rd = memuh($Rs)",
+ (L2_ploadruht_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt) $Rd = memw($Rs)",
+ (L2_ploadrit_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt) $Rdd = memd($Rs)",
+ (L2_ploadrdt_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+// Alias of: if ($Pt) memXX($Rs + #$u6_X) = $Rt
+// to: if ($Pt) memXX($Rs) = $Rt
+def : InstAlias<"if ($Pt) memb($Rs) = $Rt",
+ (S2_pstorerbt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt) memh($Rs) = $Rt",
+ (S2_pstorerht_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt) memh($Rs) = $Rt.h",
+ (S2_pstorerft_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt) memw($Rs) = $Rt",
+ (S2_pstorerit_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt) memd($Rs) = $Rtt",
+ (S2_pstorerdt_io PredRegs:$Pt, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
+
+def : InstAlias<"if ($Pt) memb($Rs) = $Rt.new",
+ (S2_pstorerbnewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt) memh($Rs) = $Rt.new",
+ (S2_pstorerhnewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt) memw($Rs) = $Rt.new",
+ (S2_pstorerinewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt.new) memb($Rs) = $Rt.new",
+ (S4_pstorerbnewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt.new) memh($Rs) = $Rt.new",
+ (S4_pstorerhnewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt.new) memw($Rs) = $Rt.new",
+ (S4_pstorerinewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+
+// Alias of: if (!$Pt) $Rd = memXX($Rs + #$u6_X)
+// to: if (!$Pt) $Rd = memXX($Rs)
+def : InstAlias<"if (!$Pt) $Rd = memb($Rs)",
+ (L2_ploadrbf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt) $Rd = memub($Rs)",
+ (L2_ploadrubf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt) $Rd = memh($Rs)",
+ (L2_ploadrhf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt) $Rd = memuh($Rs)",
+ (L2_ploadruhf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt) $Rd = memw($Rs)",
+ (L2_ploadrif_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt) $Rdd = memd($Rs)",
+ (L2_ploadrdf_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+// Alias of: if (!$Pt) memXX($Rs + #$u6_X) = $Rt
+// to: if (!$Pt) memXX($Rs) = $Rt
+def : InstAlias<"if (!$Pt) memb($Rs) = $Rt",
+ (S2_pstorerbf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt) memh($Rs) = $Rt",
+ (S2_pstorerhf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt) memh($Rs) = $Rt.h",
+ (S2_pstorerff_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt) memw($Rs) = $Rt",
+ (S2_pstorerif_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt) memd($Rs) = $Rtt",
+ (S2_pstorerdf_io PredRegs:$Pt, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
+
+def : InstAlias<"if (!$Pt) memb($Rs) = $Rt.new",
+ (S2_pstorerbnewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt) memh($Rs) = $Rt.new",
+ (S2_pstorerhnewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt) memw($Rs) = $Rt.new",
+ (S2_pstorerinewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt.new) memb($Rs) = $Rt.new",
+ (S4_pstorerbnewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt.new) memh($Rs) = $Rt.new",
+ (S4_pstorerhnewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt.new) memw($Rs) = $Rt.new",
+ (S4_pstorerinewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt) memb($Rs) = #$S6",
+ (S4_storeirbt_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>;
+
+def : InstAlias<"if ($Pt) memh($Rs) = #$S6",
+ (S4_storeirht_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>;
+
+def : InstAlias<"if ($Pt) memw($Rs) = #$S6",
+ (S4_storeirit_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>;
+
+def : InstAlias<"if ($Pt.new) memb($Rs) = #$S6",
+ (S4_storeirbtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>;
+
+def : InstAlias<"if ($Pt.new) memh($Rs) = #$S6",
+ (S4_storeirhtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>;
+
+def : InstAlias<"if ($Pt.new) memw($Rs) = #$S6",
+ (S4_storeiritnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>;
+
+def : InstAlias<"if (!$Pt) memb($Rs) = #$S6",
+ (S4_storeirbf_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>;
+
+def : InstAlias<"if (!$Pt) memh($Rs) = #$S6",
+ (S4_storeirhf_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>;
+
+def : InstAlias<"if (!$Pt) memw($Rs) = #$S6",
+ (S4_storeirif_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>;
+
+def : InstAlias<"if (!$Pt.new) memb($Rs) = #$S6",
+ (S4_storeirbfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>;
+
+def : InstAlias<"if (!$Pt.new) memh($Rs) = #$S6",
+ (S4_storeirhfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>;
+
+def : InstAlias<"if (!$Pt.new) memw($Rs) = #$S6",
+ (S4_storeirifnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>;
+
+// Alias of: memXX($Rs + $u6_X) |= $Rt, also &=, +=, -=
+// to: memXX($Rs) |= $Rt
+def : InstAlias<"memb($Rs) &= $Rt",
+ (L4_and_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memb($Rs) |= $Rt",
+ (L4_or_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memb($Rs) += $Rt",
+ (L4_add_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memb($Rs) -= $Rt",
+ (L4_sub_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memb($Rs) += #$U5",
+ (L4_iadd_memopb_io IntRegs:$Rs, 0, u5Imm:$U5), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memb($Rs) -= #$U5",
+ (L4_isub_memopb_io IntRegs:$Rs, 0, u5Imm:$U5), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memh($Rs) &= $Rt",
+ (L4_and_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memh($Rs) |= $Rt",
+ (L4_or_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memh($Rs) += $Rt",
+ (L4_add_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memh($Rs) -= $Rt",
+ (L4_sub_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memh($Rs) += #$U5",
+ (L4_iadd_memoph_io IntRegs:$Rs, 0, u5Imm:$U5), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memh($Rs) -= #$U5",
+ (L4_isub_memoph_io IntRegs:$Rs, 0, u5Imm:$U5), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memw($Rs) &= $Rt",
+ (L4_and_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memw($Rs) |= $Rt",
+ (L4_or_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memw($Rs) += $Rt",
+ (L4_add_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memw($Rs) -= $Rt",
+ (L4_sub_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memw($Rs) += #$U5",
+ (L4_iadd_memopw_io IntRegs:$Rs, 0, u5Imm:$U5), 0>,
+ Requires<[UseMEMOP]>;
+
+def : InstAlias<"memw($Rs) -= #$U5",
+ (L4_isub_memopw_io IntRegs:$Rs, 0, u5Imm:$U5), 0>,
+ Requires<[UseMEMOP]>;
+
+//
+// Alias of: if ($Pv.new) memX($Rs) = $Rt
+// to: if (p3.new) memX(r17 + #0) = $Rt
+def : InstAlias<"if ($Pv.new) memb($Rs) = $Rt",
+ (S4_pstorerbtnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pv.new) memh($Rs) = $Rt",
+ (S4_pstorerhtnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pv.new) memh($Rs) = $Rt.h",
+ (S4_pstorerftnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pv.new) memw($Rs) = $Rt",
+ (S4_pstoreritnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pv.new) memd($Rs) = $Rtt",
+ (S4_pstorerdtnew_io
+ PredRegs:$Pv, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
+
+def : InstAlias<"if (!$Pv.new) memb($Rs) = $Rt",
+ (S4_pstorerbfnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pv.new) memh($Rs) = $Rt",
+ (S4_pstorerhfnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pv.new) memh($Rs) = $Rt.h",
+ (S4_pstorerffnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pv.new) memw($Rs) = $Rt",
+ (S4_pstorerifnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pv.new) memd($Rs) = $Rtt",
+ (S4_pstorerdfnew_io
+ PredRegs:$Pv, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
+
+//
+// Alias of: if ($Pt.new) $Rd = memub($Rs) -- And if (!$Pt.new) ...
+// to: if ($Pt.new) $Rd = memub($Rs + #$u6_0)
+def : InstAlias<"if ($Pt.new) $Rd = memub($Rs)",
+ (L2_ploadrubtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt.new) $Rd = memb($Rs)",
+ (L2_ploadrbtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt.new) $Rd = memh($Rs)",
+ (L2_ploadrhtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt.new) $Rd = memuh($Rs)",
+ (L2_ploadruhtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt.new) $Rd = memw($Rs)",
+ (L2_ploadritnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt.new) $Rdd = memd($Rs)",
+ (L2_ploadrdtnew_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt.new) $Rd = memub($Rs)",
+ (L2_ploadrubfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt.new) $Rd = memb($Rs)",
+ (L2_ploadrbfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt.new) $Rd = memh($Rs)",
+ (L2_ploadrhfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt.new) $Rd = memuh($Rs)",
+ (L2_ploadruhfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt.new) $Rd = memw($Rs)",
+ (L2_ploadrifnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt.new) $Rdd = memd($Rs)",
+ (L2_ploadrdfnew_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"dcfetch($Rs)",
+ (Y2_dcfetchbo IntRegs:$Rs, 0), 0>;
+
+// Alias of some insn mappings, others must be handled by the parser
+def : InstAlias<"$Pd=cmp.lt($Rs, $Rt)",
+ (C2_cmpgt PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>;
+def : InstAlias<"$Pd=cmp.ltu($Rs, $Rt)",
+ (C2_cmpgtu PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>;
+
+// Rd=neg(Rs) is aliased to Rd=sub(#0,Rs)
+def : InstAlias<"$Rd = neg($Rs)",
+ (A2_subri IntRegs:$Rd, 0, IntRegs:$Rs), 0>;
+
+def : InstAlias<"m0 = $Rs", (A2_tfrrcr C6, IntRegs:$Rs)>;
+def : InstAlias<"$Rd = m0", (A2_tfrcrr IntRegs:$Rd, C6)>;
+def : InstAlias<"m1 = $Rs", (A2_tfrrcr C7, IntRegs:$Rs)>;
+def : InstAlias<"$Rd = m1", (A2_tfrcrr IntRegs:$Rd, C7)>;
+
+def : InstAlias<"$Pd = $Ps",
+ (C2_or PredRegs:$Pd, PredRegs:$Ps, PredRegs:$Ps), 0>;
+
+def : InstAlias<"$Rdd = vaddb($Rss, $Rtt)",
+ (A2_vaddub DoubleRegs:$Rdd, DoubleRegs:$Rss, DoubleRegs:$Rtt), 1>;
+
+def : InstAlias<"$Rdd = vsubb($Rss,$Rtt)",
+ (A2_vsubub DoubleRegs:$Rdd, DoubleRegs:$Rss, DoubleRegs:$Rtt), 0>;
+
+def : InstAlias<"$Rd = mpyui($Rs,$Rt)",
+ (M2_mpyi IntRegs:$Rd, IntRegs:$Rs, IntRegs:$Rt), 0>;
+
+// Assembler mapped insns: cmp.lt(a,b) -> cmp.gt(b,a)
+def : InstAlias<"$Pd=cmp.lt($Rs, $Rt)",
+ (C2_cmpgt PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>;
+def : InstAlias<"$Pd=cmp.ltu($Rs, $Rt)",
+ (C2_cmpgtu PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>;
+
diff --git a/lib/Target/Hexagon/HexagonInstrEnc.td b/lib/Target/Hexagon/HexagonInstrEnc.td
new file mode 100644
index 000000000000..280832fd167f
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonInstrEnc.td
@@ -0,0 +1,1019 @@
+class Enc_COPROC_VX_3op_v<bits<15> opc> : OpcodeHexagon {
+ bits<5> dst;
+ bits<5> src1;
+ bits<5> src2;
+
+ let Inst{31-16} = { opc{14-4}, src2};
+ let Inst{13-0} = { opc{3}, src1, opc{2-0}, dst};
+}
+
+class V6_vtmpyb_enc : Enc_COPROC_VX_3op_v<0b000110010000000>;
+class V6_vtmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000001>;
+class V6_vdmpyhb_enc : Enc_COPROC_VX_3op_v<0b000110010000010>;
+class V6_vrmpyub_enc : Enc_COPROC_VX_3op_v<0b000110010000011>;
+class V6_vrmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000100>;
+class V6_vdsaduh_enc : Enc_COPROC_VX_3op_v<0b000110010000101>;
+class V6_vdmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000110>;
+class V6_vdmpybus_dv_enc : Enc_COPROC_VX_3op_v<0b000110010000111>;
+class V6_vtmpyb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001000>;
+class V6_vtmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001001>;
+class V6_vtmpyhb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001010>;
+class V6_vdmpyhb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001011>;
+class V6_vrmpyub_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001100>;
+class V6_vrmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001101>;
+class V6_vdmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001110>;
+class V6_vdmpybus_dv_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001111>;
+class V6_vdmpyhsusat_enc : Enc_COPROC_VX_3op_v<0b000110010010000>;
+class V6_vdmpyhsuisat_enc : Enc_COPROC_VX_3op_v<0b000110010010001>;
+class V6_vdmpyhsat_enc : Enc_COPROC_VX_3op_v<0b000110010010010>;
+class V6_vdmpyhisat_enc : Enc_COPROC_VX_3op_v<0b000110010010011>;
+class V6_vdmpyhb_dv_enc : Enc_COPROC_VX_3op_v<0b000110010010100>;
+class V6_vmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010010101>;
+class V6_vmpabus_enc : Enc_COPROC_VX_3op_v<0b000110010010110>;
+class V6_vmpahb_enc : Enc_COPROC_VX_3op_v<0b000110010010111>;
+class V6_vdmpyhsusat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011000>;
+class V6_vdmpyhsuisat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011001>;
+class V6_vdmpyhisat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011010>;
+class V6_vdmpyhsat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011011>;
+class V6_vdmpyhb_dv_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011100>;
+class V6_vmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011101>;
+class V6_vmpabus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011110>;
+class V6_vmpahb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011111>;
+class V6_vmpyh_enc : Enc_COPROC_VX_3op_v<0b000110010100000>;
+class V6_vmpyhss_enc : Enc_COPROC_VX_3op_v<0b000110010100001>;
+class V6_vmpyhsrs_enc : Enc_COPROC_VX_3op_v<0b000110010100010>;
+class V6_vmpyuh_enc : Enc_COPROC_VX_3op_v<0b000110010100011>;
+class V6_vmpyhsat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101000>;
+class V6_vmpyuh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101001>;
+class V6_vmpyiwb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101010>;
+class V6_vmpyiwh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101011>;
+class V6_vmpyihb_enc : Enc_COPROC_VX_3op_v<0b000110010110000>;
+class V6_vror_enc : Enc_COPROC_VX_3op_v<0b000110010110001>;
+class V6_vasrw_enc : Enc_COPROC_VX_3op_v<0b000110010110101>;
+class V6_vasrh_enc : Enc_COPROC_VX_3op_v<0b000110010110110>;
+class V6_vaslw_enc : Enc_COPROC_VX_3op_v<0b000110010110111>;
+class V6_vdsaduh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111000>;
+class V6_vmpyihb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111001>;
+class V6_vaslw_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111010>;
+class V6_vasrw_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111101>;
+class V6_vaslh_enc : Enc_COPROC_VX_3op_v<0b000110011000000>;
+class V6_vlsrw_enc : Enc_COPROC_VX_3op_v<0b000110011000001>;
+class V6_vlsrh_enc : Enc_COPROC_VX_3op_v<0b000110011000010>;
+class V6_vmpyiwh_enc : Enc_COPROC_VX_3op_v<0b000110011000111>;
+class V6_vmpyub_acc_enc : Enc_COPROC_VX_3op_v<0b000110011001000>;
+class V6_vmpyiwb_enc : Enc_COPROC_VX_3op_v<0b000110011010000>;
+class V6_vtmpyhb_enc : Enc_COPROC_VX_3op_v<0b000110011010100>;
+class V6_vmpyub_enc : Enc_COPROC_VX_3op_v<0b000110011100000>;
+class V6_vrmpyubv_enc : Enc_COPROC_VX_3op_v<0b000111000000000>;
+class V6_vrmpybv_enc : Enc_COPROC_VX_3op_v<0b000111000000001>;
+class V6_vrmpybusv_enc : Enc_COPROC_VX_3op_v<0b000111000000010>;
+class V6_vdmpyhvsat_enc : Enc_COPROC_VX_3op_v<0b000111000000011>;
+class V6_vmpybv_enc : Enc_COPROC_VX_3op_v<0b000111000000100>;
+class V6_vmpyubv_enc : Enc_COPROC_VX_3op_v<0b000111000000101>;
+class V6_vmpybusv_enc : Enc_COPROC_VX_3op_v<0b000111000000110>;
+class V6_vmpyhv_enc : Enc_COPROC_VX_3op_v<0b000111000000111>;
+class V6_vrmpyubv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001000>;
+class V6_vrmpybv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001001>;
+class V6_vrmpybusv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001010>;
+class V6_vdmpyhvsat_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001011>;
+class V6_vmpybv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001100>;
+class V6_vmpyubv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001101>;
+class V6_vmpybusv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001110>;
+class V6_vmpyhv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001111>;
+class V6_vmpyuhv_enc : Enc_COPROC_VX_3op_v<0b000111000010000>;
+class V6_vmpyhvsrs_enc : Enc_COPROC_VX_3op_v<0b000111000010001>;
+class V6_vmpyhus_enc : Enc_COPROC_VX_3op_v<0b000111000010010>;
+class V6_vmpabusv_enc : Enc_COPROC_VX_3op_v<0b000111000010011>;
+class V6_vmpyih_enc : Enc_COPROC_VX_3op_v<0b000111000010100>;
+class V6_vand_enc : Enc_COPROC_VX_3op_v<0b000111000010101>;
+class V6_vor_enc : Enc_COPROC_VX_3op_v<0b000111000010110>;
+class V6_vxor_enc : Enc_COPROC_VX_3op_v<0b000111000010111>;
+class V6_vmpyuhv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011000>;
+class V6_vmpyhus_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011001>;
+class V6_vmpyih_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011100>;
+class V6_vmpyiewuh_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011101>;
+class V6_vmpyowh_sacc_enc : Enc_COPROC_VX_3op_v<0b000111000011110>;
+class V6_vmpyowh_rnd_sacc_enc : Enc_COPROC_VX_3op_v<0b000111000011111>;
+class V6_vaddw_enc : Enc_COPROC_VX_3op_v<0b000111000100000>;
+class V6_vaddubsat_enc : Enc_COPROC_VX_3op_v<0b000111000100001>;
+class V6_vadduhsat_enc : Enc_COPROC_VX_3op_v<0b000111000100010>;
+class V6_vaddhsat_enc : Enc_COPROC_VX_3op_v<0b000111000100011>;
+class V6_vaddwsat_enc : Enc_COPROC_VX_3op_v<0b000111000100100>;
+class V6_vsubb_enc : Enc_COPROC_VX_3op_v<0b000111000100101>;
+class V6_vsubh_enc : Enc_COPROC_VX_3op_v<0b000111000100110>;
+class V6_vsubw_enc : Enc_COPROC_VX_3op_v<0b000111000100111>;
+class V6_vmpyiewh_acc_enc : Enc_COPROC_VX_3op_v<0b000111000101000>;
+class V6_vsububsat_enc : Enc_COPROC_VX_3op_v<0b000111000110000>;
+class V6_vsubuhsat_enc : Enc_COPROC_VX_3op_v<0b000111000110001>;
+class V6_vsubhsat_enc : Enc_COPROC_VX_3op_v<0b000111000110010>;
+class V6_vsubwsat_enc : Enc_COPROC_VX_3op_v<0b000111000110011>;
+class V6_vaddb_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110100>;
+class V6_vaddh_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110101>;
+class V6_vaddw_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110110>;
+class V6_vaddubsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110111>;
+class V6_vadduhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000000>;
+class V6_vaddhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000001>;
+class V6_vaddwsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000010>;
+class V6_vsubb_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000011>;
+class V6_vsubh_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000100>;
+class V6_vsubw_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000101>;
+class V6_vsububsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000110>;
+class V6_vsubuhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000111>;
+class V6_vsubhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001010000>;
+class V6_vsubwsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001010001>;
+class V6_vaddubh_enc : Enc_COPROC_VX_3op_v<0b000111001010010>;
+class V6_vadduhw_enc : Enc_COPROC_VX_3op_v<0b000111001010011>;
+class V6_vaddhw_enc : Enc_COPROC_VX_3op_v<0b000111001010100>;
+class V6_vsububh_enc : Enc_COPROC_VX_3op_v<0b000111001010101>;
+class V6_vsubuhw_enc : Enc_COPROC_VX_3op_v<0b000111001010110>;
+class V6_vsubhw_enc : Enc_COPROC_VX_3op_v<0b000111001010111>;
+class V6_vabsdiffub_enc : Enc_COPROC_VX_3op_v<0b000111001100000>;
+class V6_vabsdiffh_enc : Enc_COPROC_VX_3op_v<0b000111001100001>;
+class V6_vabsdiffuh_enc : Enc_COPROC_VX_3op_v<0b000111001100010>;
+class V6_vabsdiffw_enc : Enc_COPROC_VX_3op_v<0b000111001100011>;
+class V6_vavgub_enc : Enc_COPROC_VX_3op_v<0b000111001100100>;
+class V6_vavguh_enc : Enc_COPROC_VX_3op_v<0b000111001100101>;
+class V6_vavgh_enc : Enc_COPROC_VX_3op_v<0b000111001100110>;
+class V6_vavgw_enc : Enc_COPROC_VX_3op_v<0b000111001100111>;
+class V6_vnavgub_enc : Enc_COPROC_VX_3op_v<0b000111001110000>;
+class V6_vnavgh_enc : Enc_COPROC_VX_3op_v<0b000111001110001>;
+class V6_vnavgw_enc : Enc_COPROC_VX_3op_v<0b000111001110010>;
+class V6_vavgubrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110011>;
+class V6_vavguhrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110100>;
+class V6_vavghrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110101>;
+class V6_vavgwrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110110>;
+class V6_vmpabuuv_enc : Enc_COPROC_VX_3op_v<0b000111001110111>;
+class V6_vminub_enc : Enc_COPROC_VX_3op_v<0b000111110000001>;
+class V6_vminuh_enc : Enc_COPROC_VX_3op_v<0b000111110000010>;
+class V6_vminh_enc : Enc_COPROC_VX_3op_v<0b000111110000011>;
+class V6_vminw_enc : Enc_COPROC_VX_3op_v<0b000111110000100>;
+class V6_vmaxub_enc : Enc_COPROC_VX_3op_v<0b000111110000101>;
+class V6_vmaxuh_enc : Enc_COPROC_VX_3op_v<0b000111110000110>;
+class V6_vmaxh_enc : Enc_COPROC_VX_3op_v<0b000111110000111>;
+class V6_vmaxw_enc : Enc_COPROC_VX_3op_v<0b000111110010000>;
+class V6_vdelta_enc : Enc_COPROC_VX_3op_v<0b000111110010001>;
+class V6_vrdelta_enc : Enc_COPROC_VX_3op_v<0b000111110010011>;
+class V6_vdealb4w_enc : Enc_COPROC_VX_3op_v<0b000111110010111>;
+class V6_vmpyowh_rnd_enc : Enc_COPROC_VX_3op_v<0b000111110100000>;
+class V6_vshuffeb_enc : Enc_COPROC_VX_3op_v<0b000111110100001>;
+class V6_vshuffob_enc : Enc_COPROC_VX_3op_v<0b000111110100010>;
+class V6_vshufeh_enc : Enc_COPROC_VX_3op_v<0b000111110100011>;
+class V6_vshufoh_enc : Enc_COPROC_VX_3op_v<0b000111110100100>;
+class V6_vshufoeh_enc : Enc_COPROC_VX_3op_v<0b000111110100101>;
+class V6_vshufoeb_enc : Enc_COPROC_VX_3op_v<0b000111110100110>;
+class V6_vcombine_enc : Enc_COPROC_VX_3op_v<0b000111110100111>;
+class V6_vmpyieoh_enc : Enc_COPROC_VX_3op_v<0b000111110110000>;
+class V6_vsathub_enc : Enc_COPROC_VX_3op_v<0b000111110110010>;
+class V6_vsatwh_enc : Enc_COPROC_VX_3op_v<0b000111110110011>;
+class V6_vroundwh_enc : Enc_COPROC_VX_3op_v<0b000111110110100>;
+class V6_vroundwuh_enc : Enc_COPROC_VX_3op_v<0b000111110110101>;
+class V6_vroundhb_enc : Enc_COPROC_VX_3op_v<0b000111110110110>;
+class V6_vroundhub_enc : Enc_COPROC_VX_3op_v<0b000111110110111>;
+class V6_vasrwv_enc : Enc_COPROC_VX_3op_v<0b000111111010000>;
+class V6_vlsrwv_enc : Enc_COPROC_VX_3op_v<0b000111111010001>;
+class V6_vlsrhv_enc : Enc_COPROC_VX_3op_v<0b000111111010010>;
+class V6_vasrhv_enc : Enc_COPROC_VX_3op_v<0b000111111010011>;
+class V6_vaslwv_enc : Enc_COPROC_VX_3op_v<0b000111111010100>;
+class V6_vaslhv_enc : Enc_COPROC_VX_3op_v<0b000111111010101>;
+class V6_vaddb_enc : Enc_COPROC_VX_3op_v<0b000111111010110>;
+class V6_vaddh_enc : Enc_COPROC_VX_3op_v<0b000111111010111>;
+class V6_vmpyiewuh_enc : Enc_COPROC_VX_3op_v<0b000111111100000>;
+class V6_vmpyiowh_enc : Enc_COPROC_VX_3op_v<0b000111111100001>;
+class V6_vpackeb_enc : Enc_COPROC_VX_3op_v<0b000111111100010>;
+class V6_vpackeh_enc : Enc_COPROC_VX_3op_v<0b000111111100011>;
+class V6_vpackhub_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100101>;
+class V6_vpackhb_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100110>;
+class V6_vpackwuh_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100111>;
+class V6_vpackwh_sat_enc : Enc_COPROC_VX_3op_v<0b000111111110000>;
+class V6_vpackob_enc : Enc_COPROC_VX_3op_v<0b000111111110001>;
+class V6_vpackoh_enc : Enc_COPROC_VX_3op_v<0b000111111110010>;
+class V6_vmpyewuh_enc : Enc_COPROC_VX_3op_v<0b000111111110101>;
+class V6_vmpyowh_enc : Enc_COPROC_VX_3op_v<0b000111111110111>;
+class V6_extractw_enc : Enc_COPROC_VX_3op_v<0b100100100000001>;
+class M6_vabsdiffub_enc : Enc_COPROC_VX_3op_v<0b111010001010000>;
+class M6_vabsdiffb_enc : Enc_COPROC_VX_3op_v<0b111010001110000>;
+
+class Enc_COPROC_VX_cmp<bits<13> opc> : OpcodeHexagon {
+ bits<2> dst;
+ bits<5> src1;
+ bits<5> src2;
+
+ let Inst{31-16} = { 0b00011, opc{12-7}, src2{4-0} };
+ let Inst{13-0} = { opc{6}, src1{4-0}, opc{5-0}, dst{1-0} };
+}
+
+class V6_vandvrt_acc_enc : Enc_COPROC_VX_cmp<0b0010111100000>;
+class V6_vandvrt_enc : Enc_COPROC_VX_cmp<0b0011010010010>;
+class V6_veqb_and_enc : Enc_COPROC_VX_cmp<0b1001001000000>;
+class V6_veqh_and_enc : Enc_COPROC_VX_cmp<0b1001001000001>;
+class V6_veqw_and_enc : Enc_COPROC_VX_cmp<0b1001001000010>;
+class V6_vgtb_and_enc : Enc_COPROC_VX_cmp<0b1001001000100>;
+class V6_vgth_and_enc : Enc_COPROC_VX_cmp<0b1001001000101>;
+class V6_vgtw_and_enc : Enc_COPROC_VX_cmp<0b1001001000110>;
+class V6_vgtub_and_enc : Enc_COPROC_VX_cmp<0b1001001001000>;
+class V6_vgtuh_and_enc : Enc_COPROC_VX_cmp<0b1001001001001>;
+class V6_vgtuw_and_enc : Enc_COPROC_VX_cmp<0b1001001001010>;
+class V6_veqb_or_enc : Enc_COPROC_VX_cmp<0b1001001010000>;
+class V6_veqh_or_enc : Enc_COPROC_VX_cmp<0b1001001010001>;
+class V6_veqw_or_enc : Enc_COPROC_VX_cmp<0b1001001010010>;
+class V6_vgtb_or_enc : Enc_COPROC_VX_cmp<0b1001001010100>;
+class V6_vgth_or_enc : Enc_COPROC_VX_cmp<0b1001001010101>;
+class V6_vgtw_or_enc : Enc_COPROC_VX_cmp<0b1001001010110>;
+class V6_vgtub_or_enc : Enc_COPROC_VX_cmp<0b1001001011000>;
+class V6_vgtuh_or_enc : Enc_COPROC_VX_cmp<0b1001001011001>;
+class V6_vgtuw_or_enc : Enc_COPROC_VX_cmp<0b1001001011010>;
+class V6_veqb_xor_enc : Enc_COPROC_VX_cmp<0b1001001100000>;
+class V6_veqh_xor_enc : Enc_COPROC_VX_cmp<0b1001001100001>;
+class V6_veqw_xor_enc : Enc_COPROC_VX_cmp<0b1001001100010>;
+class V6_vgtb_xor_enc : Enc_COPROC_VX_cmp<0b1001001100100>;
+class V6_vgth_xor_enc : Enc_COPROC_VX_cmp<0b1001001100101>;
+class V6_vgtw_xor_enc : Enc_COPROC_VX_cmp<0b1001001100110>;
+class V6_vgtub_xor_enc : Enc_COPROC_VX_cmp<0b1001001101000>;
+class V6_vgtuh_xor_enc : Enc_COPROC_VX_cmp<0b1001001101001>;
+class V6_vgtuw_xor_enc : Enc_COPROC_VX_cmp<0b1001001101010>;
+class V6_veqb_enc : Enc_COPROC_VX_cmp<0b1111000000000>;
+class V6_veqh_enc : Enc_COPROC_VX_cmp<0b1111000000001>;
+class V6_veqw_enc : Enc_COPROC_VX_cmp<0b1111000000010>;
+class V6_vgtb_enc : Enc_COPROC_VX_cmp<0b1111000000100>;
+class V6_vgth_enc : Enc_COPROC_VX_cmp<0b1111000000101>;
+class V6_vgtw_enc : Enc_COPROC_VX_cmp<0b1111000000110>;
+class V6_vgtub_enc : Enc_COPROC_VX_cmp<0b1111000001000>;
+class V6_vgtuh_enc : Enc_COPROC_VX_cmp<0b1111000001001>;
+class V6_vgtuw_enc : Enc_COPROC_VX_cmp<0b1111000001010>;
+
+class Enc_COPROC_VX_p2op<bits<5> opc> : OpcodeHexagon {
+ bits<2> src1;
+ bits<5> dst;
+ bits<5> src2;
+
+ let Inst{31-16} = { 0b00011110, src1{1-0}, 0b0000, opc{4-3} };
+ let Inst{13-0} = { 1, src2{4-0}, opc{2-0}, dst{4-0} };
+}
+
+class V6_vaddbq_enc : Enc_COPROC_VX_p2op<0b01000>;
+class V6_vaddhq_enc : Enc_COPROC_VX_p2op<0b01001>;
+class V6_vaddwq_enc : Enc_COPROC_VX_p2op<0b01010>;
+class V6_vaddbnq_enc : Enc_COPROC_VX_p2op<0b01011>;
+class V6_vaddhnq_enc : Enc_COPROC_VX_p2op<0b01100>;
+class V6_vaddwnq_enc : Enc_COPROC_VX_p2op<0b01101>;
+class V6_vsubbq_enc : Enc_COPROC_VX_p2op<0b01110>;
+class V6_vsubhq_enc : Enc_COPROC_VX_p2op<0b01111>;
+class V6_vsubwq_enc : Enc_COPROC_VX_p2op<0b10000>;
+class V6_vsubbnq_enc : Enc_COPROC_VX_p2op<0b10001>;
+class V6_vsubhnq_enc : Enc_COPROC_VX_p2op<0b10010>;
+class V6_vsubwnq_enc : Enc_COPROC_VX_p2op<0b10011>;
+
+class Enc_COPROC_VX_2op<bits<6> opc> : OpcodeHexagon {
+ bits<5> dst;
+ bits<5> src1;
+
+ let Inst{31-16} = { 0b00011110000000, opc{5-4} };
+ let Inst{13-0} = { opc{3}, src1{4-0}, opc{2-0}, dst{4-0} };
+}
+
+class V6_vabsh_enc : Enc_COPROC_VX_2op<0b000000>;
+class V6_vabsh_sat_enc : Enc_COPROC_VX_2op<0b000001>;
+class V6_vabsw_enc : Enc_COPROC_VX_2op<0b000010>;
+class V6_vabsw_sat_enc : Enc_COPROC_VX_2op<0b000011>;
+class V6_vnot_enc : Enc_COPROC_VX_2op<0b000100>;
+class V6_vdealh_enc : Enc_COPROC_VX_2op<0b000110>;
+class V6_vdealb_enc : Enc_COPROC_VX_2op<0b000111>;
+class V6_vunpackob_enc : Enc_COPROC_VX_2op<0b001000>;
+class V6_vunpackoh_enc : Enc_COPROC_VX_2op<0b001001>;
+class V6_vunpackub_enc : Enc_COPROC_VX_2op<0b010000>;
+class V6_vunpackuh_enc : Enc_COPROC_VX_2op<0b010001>;
+class V6_vunpackb_enc : Enc_COPROC_VX_2op<0b010010>;
+class V6_vunpackh_enc : Enc_COPROC_VX_2op<0b010011>;
+class V6_vshuffh_enc : Enc_COPROC_VX_2op<0b010111>;
+class V6_vshuffb_enc : Enc_COPROC_VX_2op<0b100000>;
+class V6_vzb_enc : Enc_COPROC_VX_2op<0b100001>;
+class V6_vzh_enc : Enc_COPROC_VX_2op<0b100010>;
+class V6_vsb_enc : Enc_COPROC_VX_2op<0b100011>;
+class V6_vsh_enc : Enc_COPROC_VX_2op<0b100100>;
+class V6_vcl0w_enc : Enc_COPROC_VX_2op<0b100101>;
+class V6_vpopcounth_enc : Enc_COPROC_VX_2op<0b100110>;
+class V6_vcl0h_enc : Enc_COPROC_VX_2op<0b100111>;
+class V6_vnormamtw_enc : Enc_COPROC_VX_2op<0b110100>;
+class V6_vnormamth_enc : Enc_COPROC_VX_2op<0b110101>;
+class V6_vassign_enc : Enc_COPROC_VX_2op<0b111111>;
+
+class Enc_COPROC_VMEM_vL32_b_ai<bits<4> opc> : OpcodeHexagon {
+ bits<5> dst;
+ bits<5> src1;
+ bits<10> src2;
+ bits<4> src2_vector;
+
+ let src2_vector = src2{9-6};
+ let Inst{31-16} = { 0b001010000, opc{3}, 0, src1{4-0} };
+ let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, dst{4-0} };
+}
+
+class V6_vL32b_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0000>;
+class V6_vL32b_cur_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0001>;
+class V6_vL32b_tmp_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0010>;
+class V6_vL32Ub_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0111>;
+class V6_vL32b_nt_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1000>;
+class V6_vL32b_nt_cur_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1001>;
+class V6_vL32b_nt_tmp_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1010>;
+
+class Enc_COPROC_VMEM_vL32_b_ai_128B<bits<4> opc> : OpcodeHexagon {
+ bits<5> dst;
+ bits<5> src1;
+ bits<11> src2;
+ bits<4> src2_vector;
+
+ let src2_vector = src2{10-7};
+ let Inst{31-16} = { 0b001010000, opc{3}, 0, src1{4-0} };
+ let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, dst{4-0} };
+}
+
+class V6_vL32b_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0000>;
+class V6_vL32b_cur_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0001>;
+class V6_vL32b_tmp_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0010>;
+class V6_vL32Ub_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0111>;
+class V6_vL32b_nt_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1000>;
+class V6_vL32b_nt_cur_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1001>;
+class V6_vL32b_nt_tmp_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1010>;
+
+class Enc_COPROC_VMEM_vS32_b_ai_64B<bits<4> opc> : OpcodeHexagon {
+ bits<5> src1;
+ bits<10> src2;
+ bits<4> src2_vector;
+ bits<5> src3;
+
+ let src2_vector = src2{9-6};
+ let Inst{31-16} = { 0b001010000, opc{3}, 1, src1{4-0} };
+ let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, src3{4-0} };
+}
+
+class Enc_COPROC_VMEM_vS32_b_ai_128B<bits<4> opc> : OpcodeHexagon {
+ bits<5> src1;
+ bits<11> src2;
+ bits<4> src2_vector;
+ bits<5> src3;
+
+ let src2_vector = src2{10-7};
+ let Inst{31-16} = { 0b001010000, opc{3}, 1, src1{4-0} };
+ let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, src3{4-0} };
+}
+
+class V6_vS32b_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b0000>;
+class V6_vS32Ub_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b0111>;
+class V6_vS32b_nt_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b1000>;
+
+class V6_vS32b_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b0000>;
+class V6_vS32Ub_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b0111>;
+class V6_vS32b_nt_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b1000>;
+
+class Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<bits<1> opc> : OpcodeHexagon {
+ bits<5> src1;
+ bits<10> src2;
+ bits<4> src2_vector;
+ bits<3> src3;
+
+ let src2_vector = src2{9-6};
+ let Inst{31-16} = { 0b001010000, opc{0}, 1, src1{4-0} };
+ let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, 0b00100, src3{2-0} };
+}
+
+class V6_vS32b_new_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<0>;
+class V6_vS32b_nt_new_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<1>;
+
+class Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<bits<1> opc> : OpcodeHexagon {
+ bits<5> src1;
+ bits<11> src2;
+ bits<4> src2_vector;
+ bits<3> src3;
+
+ let src2_vector = src2{10-7};
+ let Inst{31-16} = { 0b001010000, opc{0}, 1, src1{4-0} };
+ let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, 0b00100, src3{2-0} };
+}
+
+class V6_vS32b_new_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<0>;
+class V6_vS32b_nt_new_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<1>;
+
+class Enc_COPROC_VMEM_vS32_b_pred_ai<bits<5> opc> : OpcodeHexagon {
+ bits<2> src1;
+ bits<5> src2;
+ bits<10> src3;
+ bits<4> src3_vector;
+ bits<5> src4;
+
+ let src3_vector = src3{9-6};
+ let Inst{31-16} = { 0b001010001, opc{4-3}, src2{4-0} };
+ let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} };
+}
+
+class Enc_COPROC_VMEM_vS32_b_pred_ai_128B<bits<5> opc> : OpcodeHexagon {
+ bits<2> src1;
+ bits<5> src2;
+ bits<11> src3;
+ bits<4> src3_vector;
+ bits<5> src4;
+
+ let src3_vector = src3{10-7};
+ let Inst{31-16} = { 0b001010001, opc{4-3}, src2{4-0} };
+ let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} };
+}
+
+class V6_vS32b_qpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b00000>;
+class V6_vS32b_nqpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b00001>;
+class V6_vS32b_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01000>;
+class V6_vS32b_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01001>;
+class V6_vS32Ub_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01110>;
+class V6_vS32Ub_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01111>;
+class V6_vS32b_nt_qpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b10000>;
+class V6_vS32b_nt_nqpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b10001>;
+class V6_vS32b_nt_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b11000>;
+class V6_vS32b_nt_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b11001>;
+
+class V6_vS32b_qpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b00000>;
+class V6_vS32b_nqpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b00001>;
+class V6_vS32b_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01000>;
+class V6_vS32b_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01001>;
+class V6_vS32Ub_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01110>;
+class V6_vS32Ub_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01111>;
+class V6_vS32b_nt_qpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b10000>;
+class V6_vS32b_nt_nqpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b10001>;
+class V6_vS32b_nt_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b11000>;
+class V6_vS32b_nt_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b11001>;
+
+class Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<bits<4> opc> : OpcodeHexagon {
+ bits<2> src1;
+ bits<5> src2;
+ bits<10> src3;
+ bits<4> src3_vector;
+ bits<3> src4;
+
+ let src3_vector = src3{9-6};
+ let Inst{31-16} = { 0b001010001, opc{3}, 1, src2{4-0} };
+ let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} };
+}
+
+class V6_vS32b_new_pred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b0000>;
+class V6_vS32b_new_npred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b0101>;
+class V6_vS32b_nt_new_pred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b1010>;
+class V6_vS32b_nt_new_npred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b1111>;
+
+class Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<bits<4> opc> : OpcodeHexagon {
+ bits<2> src1;
+ bits<5> src2;
+ bits<11> src3;
+ bits<4> src3_vector;
+ bits<3> src4;
+
+ let src3_vector = src3{10-7};
+ let Inst{31-16} = { 0b001010001, opc{3}, 1, src2{4-0} };
+ let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} };
+}
+
+class V6_vS32b_new_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b0000>;
+class V6_vS32b_new_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b0101>;
+class V6_vS32b_nt_new_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b1010>;
+class V6_vS32b_nt_new_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b1111>;
+
+// TODO: Change script to generate dst, src1, src2 instead of
+// dst, dst2, src1.
+class Enc_COPROC_VMEM_vL32_b_pi<bits<4> opc> : OpcodeHexagon {
+ bits<5> dst;
+ bits<5> src1;
+ bits<9> src2;
+ bits<3> src2_vector;
+
+ let src2_vector = src2{8-6};
+ let Inst{31-16} = { 0b001010010, opc{3}, 0, src1{4-0} };
+ let Inst{13-0} = { 0b000, src2_vector{2-0}, opc{2-0}, dst{4-0} };
+}
+
+class V6_vL32b_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0000>;
+class V6_vL32b_cur_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0001>;
+class V6_vL32b_tmp_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0010>;
+class V6_vL32Ub_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0111>;
+class V6_vL32b_nt_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1000>;
+class V6_vL32b_nt_cur_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1001>;
+class V6_vL32b_nt_tmp_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1010>;
+
+class Enc_COPROC_VMEM_vL32_b_pi_128B<bits<4> opc> : OpcodeHexagon {
+ bits<5> dst;
+ bits<5> src1;
+ bits<10> src2;
+ bits<3> src2_vector;
+
+ let src2_vector = src2{9-7};
+ let Inst{31-16} = { 0b001010010, opc{3}, 0, src1{4-0} };
+ let Inst{13-0} = { 0b000, src2_vector{2-0}, opc{2-0}, dst{4-0} };
+}
+
+class V6_vL32b_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0000>;
+class V6_vL32b_cur_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0001>;
+class V6_vL32b_tmp_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0010>;
+class V6_vL32Ub_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0111>;
+class V6_vL32b_nt_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1000>;
+class V6_vL32b_nt_cur_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1001>;
+class V6_vL32b_nt_tmp_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1010>;
+
+
+// TODO: Change script to generate src1, src2 and src3 instead of
+// dst, src1, src2.
+class Enc_COPROC_VMEM_vS32_b_pi<bits<4> opc> : OpcodeHexagon {
+ bits<5> src1;
+ bits<9> src2;
+ bits<3> src2_vector;
+ bits<5> src3;
+
+ let src2_vector = src2{8-6};
+ let Inst{31-16} = { 0b001010010, opc{3}, 1, src1{4-0} };
+ let Inst{10-0} = {src2_vector{2-0}, opc{2-0}, src3{4-0} };
+}
+
+class V6_vS32b_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b0000>;
+class V6_vS32Ub_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b0111>;
+class V6_vS32b_nt_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b1000>;
+
+class Enc_COPROC_VMEM_vS32_b_pi_128B<bits<4> opc> : OpcodeHexagon {
+ bits<5> src1;
+ bits<10> src2;
+ bits<3> src2_vector;
+ bits<5> src3;
+
+ let src2_vector = src2{9-7};
+ let Inst{31-16} = { 0b001010010, opc{3}, 1, src1{4-0} };
+ let Inst{10-0} = {src2_vector{2-0}, opc{2-0}, src3{4-0} };
+}
+
+class V6_vS32b_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b0000>;
+class V6_vS32Ub_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b0111>;
+class V6_vS32b_nt_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b1000>;
+
+// TODO: Change script to generate src1, src2 and src3 instead of
+// dst, src1, src2.
+class Enc_COPROC_VMEM_vS32b_n_ew_pi<bits<1> opc> : OpcodeHexagon {
+ bits<5> src1;
+ bits<9> src2;
+ bits<3> src2_vector;
+ bits<3> src3;
+
+ let src2_vector = src2{8-6};
+ let Inst{31-16} = { 0b001010010, opc{0}, 1, src1{4-0} };
+ let Inst{13-0} = { 0b000, src2_vector{2-0}, 0b00100, src3{2-0} };
+}
+
+class V6_vS32b_new_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi<0>;
+class V6_vS32b_nt_new_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi<1>;
+
+class Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<bits<1> opc> : OpcodeHexagon {
+ bits<5> src1;
+ bits<10> src2;
+ bits<3> src2_vector;
+ bits<3> src3;
+
+ let src2_vector = src2{9-7};
+ let Inst{31-16} = { 0b001010010, opc{0}, 1, src1{4-0} };
+ let Inst{13-0} = { 0b000, src2_vector{2-0}, 0b00100, src3{2-0} };
+}
+
+class V6_vS32b_new_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<0>;
+class V6_vS32b_nt_new_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<1>;
+
+// TODO: Change script to generate src1, src2,src3 and src4 instead of
+// dst, src1, src2, src3.
+class Enc_COPROC_VMEM_vS32_b_pred_pi<bits<5> opc> : OpcodeHexagon {
+ bits<2> src1;
+ bits<5> src2;
+ bits<9> src3;
+ bits<3> src3_vector;
+ bits<5> src4;
+
+ let src3_vector = src3{8-6};
+ let Inst{31-16} = { 0b001010011, opc{4-3}, src2{4-0} };
+ let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} };
+}
+
+class V6_vS32b_qpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b00000>;
+class V6_vS32b_nqpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b00001>;
+class V6_vS32b_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01000>;
+class V6_vS32b_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01001>;
+class V6_vS32Ub_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01110>;
+class V6_vS32Ub_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01111>;
+class V6_vS32b_nt_qpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b10000>;
+class V6_vS32b_nt_nqpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b10001>;
+class V6_vS32b_nt_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b11000>;
+class V6_vS32b_nt_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b11001>;
+
+// TODO: Change script to generate src1, src2,src3 and src4 instead of
+// dst, src1, src2, src3.
+class Enc_COPROC_VMEM_vS32_b_pred_pi_128B<bits<5> opc> : OpcodeHexagon {
+ bits<2> src1;
+ bits<5> src2;
+ bits<10> src3;
+ bits<3> src3_vector;
+ bits<5> src4;
+
+ let src3_vector = src3{9-7};
+ let Inst{31-16} = { 0b001010011, opc{4-3}, src2{4-0} };
+ let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} };
+}
+
+class V6_vS32b_qpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b00000>;
+class V6_vS32b_nqpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b00001>;
+class V6_vS32b_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01000>;
+class V6_vS32b_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01001>;
+class V6_vS32Ub_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01110>;
+class V6_vS32Ub_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01111>;
+class V6_vS32b_nt_qpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b10000>;
+class V6_vS32b_nt_nqpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b10001>;
+class V6_vS32b_nt_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b11000>;
+class V6_vS32b_nt_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b11001>;
+
+class Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<bits<4> opc> : OpcodeHexagon {
+ bits<2> src1;
+ bits<5> src2;
+ bits<9> src3;
+ bits<3> src3_vector;
+ bits<3> src4;
+
+ let src3_vector = src3{8-6};
+ let Inst{31-16} = { 0b001010011, opc{3}, 1, src2{4-0} };
+ let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} };
+}
+
+class V6_vS32b_new_pred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b0000>;
+class V6_vS32b_new_npred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b0101>;
+class V6_vS32b_nt_new_pred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b1010>;
+class V6_vS32b_nt_new_npred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b1111>;
+
+class Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<bits<4> opc> : OpcodeHexagon {
+ bits<2> src1;
+ bits<5> src2;
+ bits<10> src3;
+ bits<3> src3_vector;
+ bits<3> src4;
+
+ let src3_vector = src3{9-7};
+ let Inst{31-16} = { 0b001010011, opc{3}, 1, src2{4-0} };
+ let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} };
+}
+
+class V6_vS32b_new_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b0000>;
+class V6_vS32b_new_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b0101>;
+class V6_vS32b_nt_new_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b1010>;
+class V6_vS32b_nt_new_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b1111>;
+
+class Enc_LD_load_m<bits<13> opc> : OpcodeHexagon {
+ bits<5> dst;
+ bits<5> src1;
+ bits<1> src2;
+
+ let Inst{31-16} = { opc{12}, 0, opc{11-10}, 1, opc{9-4}, src1{4-0} };
+ let Inst{13-0} = { src2{0}, 0b000, opc{3}, 0, opc{2-0}, dst{4-0} };
+}
+
+class V6_vL32b_ppu_enc : Enc_LD_load_m<0b0100110000000>;
+class V6_vL32b_cur_ppu_enc : Enc_LD_load_m<0b0100110000001>;
+class V6_vL32b_tmp_ppu_enc : Enc_LD_load_m<0b0100110000010>;
+class V6_vL32Ub_ppu_enc : Enc_LD_load_m<0b0100110000111>;
+class V6_vL32b_nt_ppu_enc : Enc_LD_load_m<0b0100110100000>;
+class V6_vL32b_nt_cur_ppu_enc : Enc_LD_load_m<0b0100110100001>;
+class V6_vL32b_nt_tmp_ppu_enc : Enc_LD_load_m<0b0100110100010>;
+
+class Enc_COPROC_VMEM_vS32_b_ppu<bits<4> opc> : OpcodeHexagon {
+ bits<5> src1;
+ bits<1> src2;
+ bits<5> src3;
+
+ let Inst{31-16} = { 0b001010110, opc{3}, 1, src1{4-0} };
+ let Inst{13-0} = { src2{0}, 0b00000, opc{2-0}, src3{4-0} };
+}
+
+class V6_vS32b_ppu_enc : Enc_COPROC_VMEM_vS32_b_ppu<0b0000>;
+class V6_vS32Ub_ppu_enc : Enc_COPROC_VMEM_vS32_b_ppu<0b0111>;
+class V6_vS32b_nt_ppu_enc : Enc_COPROC_VMEM_vS32_b_ppu<0b1000>;
+
+class Enc_COPROC_VMEM_vS32b_new_ppu<bits<1> opc> : OpcodeHexagon {
+ bits<5> src1;
+ bits<1> src2;
+ bits<3> src3;
+
+ let Inst{31-16} = { 0b001010110, opc{0}, 1, src1{4-0} };
+ let Inst{13-0} = { src2{0}, 0b0000000100, src3{2-0} };
+}
+
+class V6_vS32b_new_ppu_enc : Enc_COPROC_VMEM_vS32b_new_ppu<0>;
+class V6_vS32b_nt_new_ppu_enc : Enc_COPROC_VMEM_vS32b_new_ppu<1>;
+
+class Enc_COPROC_VMEM_vS32_b_pred_ppu<bits<5> opc> : OpcodeHexagon {
+ bits<2> src1;
+ bits<5> src2;
+ bits<1> src3;
+ bits<5> src4;
+
+ let Inst{31-16} = { 0b001010111, opc{4-3}, src2{4-0} };
+ let Inst{13-0} = { src3{0}, src1{1-0}, 0b000, opc{2-0}, src4{4-0} };
+}
+
+class V6_vS32b_qpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b00000>;
+class V6_vS32b_nqpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b00001>;
+class V6_vS32b_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01000>;
+class V6_vS32b_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01001>;
+class V6_vS32Ub_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01110>;
+class V6_vS32Ub_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01111>;
+class V6_vS32b_nt_qpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b10000>;
+class V6_vS32b_nt_nqpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b10001>;
+class V6_vS32b_nt_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b11000>;
+class V6_vS32b_nt_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b11001>;
+
+class Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<bits<4> opc> : OpcodeHexagon {
+ bits<2> src1;
+ bits<5> src2;
+ bits<1> src3;
+ bits<3> src4;
+
+ let Inst{31-16} = { 0b001010111, opc{3}, 1, src2{4-0} };
+ let Inst{13-0} = { src3{0}, src1{1-0}, 0b00001, opc{2-0}, src4{2-0} };
+}
+
+class V6_vS32b_new_pred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b0000>;
+class V6_vS32b_new_npred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b0101>;
+class V6_vS32b_nt_new_pred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b1010>;
+class V6_vS32b_nt_new_npred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b1111>;
+
+
+class Enc_COPROC_VX_4op_i<bits<5> opc> : OpcodeHexagon {
+ bits<5> dst;
+ bits<5> src1;
+ bits<5> src2;
+ bits<1> src3;
+
+ let Inst{31-16} = { 0b00011001, opc{4-2}, src2{4-0} };
+ let Inst{13-0} = { opc{1}, src1{4-0}, 1, opc{0}, src3{0}, dst{4-0} };
+}
+
+class V6_vrmpybusi_enc : Enc_COPROC_VX_4op_i<0b01000>;
+class V6_vrsadubi_enc : Enc_COPROC_VX_4op_i<0b01001>;
+class V6_vrmpybusi_acc_enc : Enc_COPROC_VX_4op_i<0b01010>;
+class V6_vrsadubi_acc_enc : Enc_COPROC_VX_4op_i<0b01011>;
+class V6_vrmpyubi_acc_enc : Enc_COPROC_VX_4op_i<0b01111>;
+class V6_vrmpyubi_enc : Enc_COPROC_VX_4op_i<0b10101>;
+
+class Enc_COPROC_VX_vandqrt<bits<5> opc> : OpcodeHexagon {
+ bits<5> dst;
+ bits<2> src1;
+ bits<5> src2;
+
+ let Inst{31-16} = { 0b00011001, opc{4-3}, 1, src2{4-0} };
+ let Inst{13-0} = { opc{2}, 0b000, src1{1-0}, opc{1-0}, 1, dst{4-0} };
+}
+
+class V6_vandqrt_acc_enc : Enc_COPROC_VX_vandqrt<0b01101>;
+class V6_vandqrt_enc : Enc_COPROC_VX_vandqrt<0b10010>;
+
+class Enc_COPROC_VX_cards<bits<2> opc> : OpcodeHexagon {
+ bits<5> src1;
+ bits<5> src2;
+ bits<5> src3;
+
+ let Inst{31-16} = { 0b00011001111, src3{4-0} };
+ let Inst{13-0} = { 1, src1{4-0}, 0, opc{1-0}, src2{4-0} };
+}
+
+class V6_vshuff_enc : Enc_COPROC_VX_cards<0b01>;
+class V6_vdeal_enc : Enc_COPROC_VX_cards<0b10>;
+
+
+class Enc_COPROC_VX_v_cmov<bits<1> opc> : OpcodeHexagon {
+ bits<2> src1;
+ bits<5> dst;
+ bits<5> src2;
+
+ let Inst{31-16} = { 0b0001101000, opc{0}, 0b00000 };
+ let Inst{13-0} = { 0, src2{4-0}, 0, src1{1-0}, dst{4-0} };
+}
+
+class V6_vcmov_enc : Enc_COPROC_VX_v_cmov<0>;
+class V6_vncmov_enc : Enc_COPROC_VX_v_cmov<1>;
+
+class Enc_X_p3op<bits<8> opc> : OpcodeHexagon {
+ bits<2> src1;
+ bits<5> dst;
+ bits<5> src2;
+ bits<5> src3;
+
+ let Inst{31-16} = { opc{7-5}, 0b1101, opc{4}, 0, opc{3-2}, src3{4-0} };
+ let Inst{13-0} = { opc{1}, src2{4-0}, opc{0}, src1{1-0}, dst{4-0} };
+}
+
+class V6_vnccombine_enc : Enc_X_p3op<0b00001000>;
+class V6_vccombine_enc : Enc_X_p3op<0b00001100>;
+
+class Enc_COPROC_VX_4op_r<bits<4> opc> : OpcodeHexagon {
+ bits<5> dst;
+ bits<5> src1;
+ bits<5> src2;
+ bits<3> src3;
+
+ let Inst{31-16} = { 0b00011011, src2{4-0}, src3{2-0} };
+ let Inst{13-0} = { opc{3}, src1{4-0}, opc{2-0}, dst{4-0} };
+}
+
+class V6_valignb_enc : Enc_COPROC_VX_4op_r<0b0000>;
+class V6_vlalignb_enc : Enc_COPROC_VX_4op_r<0b0001>;
+class V6_vasrwh_enc : Enc_COPROC_VX_4op_r<0b0010>;
+class V6_vasrwhsat_enc : Enc_COPROC_VX_4op_r<0b0011>;
+class V6_vasrwhrndsat_enc : Enc_COPROC_VX_4op_r<0b0100>;
+class V6_vasrwuhsat_enc : Enc_COPROC_VX_4op_r<0b0101>;
+class V6_vasrhubsat_enc : Enc_COPROC_VX_4op_r<0b0110>;
+class V6_vasrhubrndsat_enc : Enc_COPROC_VX_4op_r<0b0111>;
+class V6_vasrhbrndsat_enc : Enc_COPROC_VX_4op_r<0b1000>;
+class V6_vlutvvb_enc : Enc_COPROC_VX_4op_r<0b1001>;
+class V6_vshuffvdd_enc : Enc_COPROC_VX_4op_r<0b1011>;
+class V6_vdealvdd_enc : Enc_COPROC_VX_4op_r<0b1100>;
+class V6_vlutvvb_oracc_enc : Enc_COPROC_VX_4op_r<0b1101>;
+class V6_vlutvwh_enc : Enc_COPROC_VX_4op_r<0b1110>;
+class V6_vlutvwh_oracc_enc : Enc_COPROC_VX_4op_r<0b1111>;
+
+class Enc_S_3op_valign_i<bits<9> opc> : OpcodeHexagon {
+ bits<5> dst;
+ bits<5> src1;
+ bits<5> src2;
+ bits<3> src3;
+
+ let Inst{31-16} = { opc{8-7}, 0, opc{6-3}, 0b00, opc{2-1}, src2{4-0} };
+ let Inst{13-0} = { opc{0}, src1{4-0}, src3{2-0}, dst{4-0} };
+}
+
+class V6_vlutb_enc : Enc_S_3op_valign_i<0b001100000>;
+class V6_vlutb_dv_enc : Enc_S_3op_valign_i<0b001100010>;
+class V6_vlutb_acc_enc : Enc_S_3op_valign_i<0b001100100>;
+class V6_vlutb_dv_acc_enc : Enc_S_3op_valign_i<0b001100110>;
+class V6_valignbi_enc : Enc_S_3op_valign_i<0b001111011>;
+class V6_vlalignbi_enc : Enc_S_3op_valign_i<0b001111111>;
+class S2_valignib_enc : Enc_S_3op_valign_i<0b110000000>;
+class S2_addasl_rrri_enc : Enc_S_3op_valign_i<0b110010000>;
+
+class Enc_COPROC_VX_3op_q<bits<3> opc> : OpcodeHexagon {
+ bits<2> dst;
+ bits<2> src1;
+ bits<2> src2;
+
+ let Inst{31-16} = { 0b00011110, src2{1-0}, 0b000011 };
+ let Inst{13-0} = { 0b0000, src1{1-0}, 0b000, opc{2-0}, dst{1-0} };
+}
+
+class V6_pred_and_enc : Enc_COPROC_VX_3op_q<0b000>;
+class V6_pred_or_enc : Enc_COPROC_VX_3op_q<0b001>;
+class V6_pred_xor_enc : Enc_COPROC_VX_3op_q<0b011>;
+class V6_pred_or_n_enc : Enc_COPROC_VX_3op_q<0b100>;
+class V6_pred_and_n_enc : Enc_COPROC_VX_3op_q<0b101>;
+
+class V6_pred_not_enc : OpcodeHexagon {
+ bits<2> dst;
+ bits<2> src1;
+
+ let Inst{31-16} = { 0b0001111000000011 };
+ let Inst{13-0} = { 0b0000, src1{1-0}, 0b000010, dst{1-0} };
+}
+
+class Enc_COPROC_VX_4op_q<bits<1> opc> : OpcodeHexagon {
+ bits<5> dst;
+ bits<2> src1;
+ bits<5> src2;
+ bits<5> src3;
+
+ let Inst{31-16} = { 0b000111101, opc{0}, 1, src3{4-0} };
+ let Inst{13-0} = { 1, src2{4-0}, 0, src1{1-0}, dst{4-0} };
+}
+
+class V6_vswap_enc : Enc_COPROC_VX_4op_q<0>;
+class V6_vmux_enc : Enc_COPROC_VX_4op_q<1>;
+
+class Enc_X_2op<bits<16> opc> : OpcodeHexagon {
+ bits<5> dst;
+ bits<5> src1;
+
+ let Inst{31-16} = { opc{15-5}, src1{4-0} };
+ let Inst{13-0} = { opc{4-3}, 0b0000, opc{2-0}, dst{4-0} };
+}
+
+class V6_lvsplatw_enc : Enc_X_2op<0b0001100110100001>;
+class V6_vinsertwr_enc : Enc_X_2op<0b0001100110110001>;
+class S6_vsplatrbp_enc : Enc_X_2op<0b1000010001000100>;
+
+
+class Enc_CR_2op_r<bits<12> opc> : OpcodeHexagon {
+ bits<2> dst;
+ bits<5> src1;
+
+ let Inst{31-16} = { opc{11}, 0, opc{10-7}, 0, opc{6-3}, src1{4-0} };
+ let Inst{13-0} = { opc{2}, 0b000000, opc{1}, 0b000, opc{0}, dst{1-0} };
+}
+
+class V6_pred_scalar2_enc : Enc_CR_2op_r<0b001101101011>;
+class Y5_l2locka_enc : Enc_CR_2op_r<0b110000111100>;
+
+class Enc_S_3op_i6<bits<9> opc> : OpcodeHexagon {
+ bits<5> dst;
+ bits<5> src1;
+ bits<6> src2;
+
+ let Inst{31-16} = { 0b1000, opc{8-6}, 0, opc{5-3}, src1{4-0} };
+ let Inst{13-0} = { src2{5-0}, opc{2-0}, dst{4-0} };
+}
+
+class S6_rol_i_p_enc : Enc_S_3op_i6<0b000000011>;
+class S6_rol_i_p_nac_enc : Enc_S_3op_i6<0b001000011>;
+class S6_rol_i_p_acc_enc : Enc_S_3op_i6<0b001000111>;
+class S6_rol_i_p_and_enc : Enc_S_3op_i6<0b001010011>;
+class S6_rol_i_p_or_enc : Enc_S_3op_i6<0b001010111>;
+class S6_rol_i_p_xacc_enc : Enc_S_3op_i6<0b001100011>;
+
+class Enc_X_3op_r<bits<15> opc> : OpcodeHexagon {
+ bits<5> dst;
+ bits<5> src1;
+ bits<5> src2;
+
+ let Inst{31-16} = { opc{14-4}, src1{4-0} };
+ let Inst{13-0} = { opc{3}, src2{4-0}, opc{2-0}, dst{4-0} };
+}
+
+class S6_rol_i_r_enc : Enc_X_3op_r<0b100011000000011>;
+class S6_rol_i_r_nac_enc : Enc_X_3op_r<0b100011100000011>;
+class S6_rol_i_r_acc_enc : Enc_X_3op_r<0b100011100000111>;
+class S6_rol_i_r_and_enc : Enc_X_3op_r<0b100011100100011>;
+class S6_rol_i_r_or_enc : Enc_X_3op_r<0b100011100100111>;
+class S6_rol_i_r_xacc_enc : Enc_X_3op_r<0b100011101000011>;
+class S6_vtrunehb_ppp_enc : Enc_X_3op_r<0b110000011000011>;
+class S6_vtrunohb_ppp_enc : Enc_X_3op_r<0b110000011000101>;
+
+class Enc_no_operands<bits<25> opc> : OpcodeHexagon {
+
+ let Inst{31-16} = { opc{24-10}, 0 };
+ let Inst{13-0} = { opc{9-7}, 0b000, opc{6-0}, 0 };
+}
+
+class Y5_l2gunlock_enc : Enc_no_operands<0b1010100000100000010000000>;
+class Y5_l2gclean_enc : Enc_no_operands<0b1010100000100000100000000>;
+class Y5_l2gcleaninv_enc : Enc_no_operands<0b1010100000100000110000000>;
+class V6_vhist_enc : Enc_no_operands<0b0001111000000001001000000>;
+
+class Enc_J_jumpr<bits<13> opc> : OpcodeHexagon {
+ bits<5> src1;
+
+ let Inst{31-16} = { opc{12-6}, 0, opc{5-3}, src1{4-0} };
+ let Inst{13-0} = { 0b00, opc{2}, 0b0000, opc{1-0}, 0b00000 };
+}
+
+class Y5_l2unlocka_enc : Enc_J_jumpr<0b1010011011000>;
+class Y2_l2cleaninvidx_enc : Enc_J_jumpr<0b1010100011000>;
+
+class Enc_ST_l2gclean_pa<bits<2> opc> : OpcodeHexagon {
+ bits<5> src1;
+
+ let Inst{31-16} = { 0b101001101, opc{1-0}, 0b00000 };
+ let Inst{13-0} = { 0, src1{4-0}, 0b00000000 };
+}
+
+class Y6_l2gcleanpa_enc : Enc_ST_l2gclean_pa<0b01>;
+class Y6_l2gcleaninvpa_enc : Enc_ST_l2gclean_pa<0b10>;
+
+class A5_ACS_enc : OpcodeHexagon {
+ bits<5> dst1;
+ bits<2> dst2;
+ bits<5> src1;
+ bits<5> src2;
+
+ let Inst{31-16} = { 0b11101010101, src1{4-0} };
+ let Inst{13-0} = { 0, src2{4-0}, 0, dst2{1-0}, dst1{4-0} };
+}
+
+class Enc_X_4op_r<bits<8> opc> : OpcodeHexagon {
+ bits<5> dst;
+ bits<5> src1;
+ bits<5> src2;
+ bits<2> src3;
+
+ let Inst{31-16} = { 0b11, opc{7}, 0, opc{6-5}, 1, opc{4-1}, src1{4-0} };
+ let Inst{13-0} = { 0, src2{4-0}, opc{0}, src3{1-0}, dst{4-0} };
+}
+
+class S2_vsplicerb_enc : Enc_X_4op_r<0b00001000>;
+class S2_cabacencbin_enc : Enc_X_4op_r<0b00001010>;
+class F2_sffma_sc_enc : Enc_X_4op_r<0b11110111>;
+
+class V6_vhistq_enc : OpcodeHexagon {
+ bits<2> src1;
+
+ let Inst{31-16} = { 0b00011110, src1{1-0}, 0b000010 };
+ let Inst{13-0} = { 0b10000010000000 };
+}
+
+// TODO: Change script to generate dst1 instead of dst.
+class A6_vminub_RdP_enc : OpcodeHexagon {
+ bits<5> dst1;
+ bits<2> dst2;
+ bits<5> src1;
+ bits<5> src2;
+
+ let Inst{31-16} = { 0b11101010111, src2{4-0} };
+ let Inst{13-0} = { 0, src1{4-0}, 0, dst2{1-0}, dst1{4-0} };
+}
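For orientation only: the encoding classes above splice fixed opcode bits and operand fields into Inst{31-16} and Inst{13-0}. Below is a minimal C++ sketch of the Enc_COPROC_VX_4op_i layout; encodeVX4opI is a hypothetical helper written from the field widths shown in that class, not code from this patch.

#include <cassert>
#include <cstdint>

// Inst{31-16} = { 0b00011001, opc{4-2}, src2{4-0} }
// Inst{13-0}  = { opc{1}, src1{4-0}, 1, opc{0}, src3{0}, dst{4-0} }
// Bits 15-14 are left as zero here; the real encoder owns them.
static uint32_t encodeVX4opI(unsigned opc, unsigned dst, unsigned src1,
                             unsigned src2, unsigned src3) {
  assert(opc < 32 && dst < 32 && src1 < 32 && src2 < 32 && src3 < 2);
  uint32_t hi = (0x19u << 8) | (((opc >> 2) & 0x7) << 5) | (src2 & 0x1F);
  uint32_t lo = (((opc >> 1) & 0x1) << 13) | ((src1 & 0x1F) << 8) | (1u << 7) |
                ((opc & 0x1) << 6) | ((src3 & 0x1) << 5) | (dst & 0x1F);
  return (hi << 16) | lo;
}

For example, V6_vrmpybusi_enc corresponds to opc = 0b01000, with the remaining fields supplied by the instruction's operands.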
diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td
index 44bab292f32c..3c5ec1701dc2 100644
--- a/lib/Target/Hexagon/HexagonInstrFormats.td
+++ b/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -34,6 +34,8 @@ class SubTarget<bits<6> value> {
def HasAnySubT : SubTarget<0x3f>; // 111111
def HasV5SubT : SubTarget<0x3e>; // 111110
+def HasV55SubT : SubTarget<0x3c>; // 111100
+def HasV60SubT : SubTarget<0x38>; // 111000
// Addressing modes for load/store instructions
class AddrModeType<bits<3> value> {
@@ -57,6 +59,8 @@ def ByteAccess : MemAccessSize<1>;// Byte access instruction (memb).
def HalfWordAccess : MemAccessSize<2>;// Half word access instruction (memh).
def WordAccess : MemAccessSize<3>;// Word access instruction (memw).
def DoubleWordAccess : MemAccessSize<4>;// Double word access instruction (memd)
+def Vector64Access : MemAccessSize<7>;// Vector access instruction (memv)
+def Vector128Access : MemAccessSize<8>;// Vector access instruction (memv)
//===----------------------------------------------------------------------===//
@@ -167,14 +171,23 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
bits<1> isFP = 0;
let TSFlags {48} = isFP; // Floating-point.
+ bits<1> hasNewValue2 = 0;
+ let TSFlags{50} = hasNewValue2; // Second New-value producer insn.
+ bits<3> opNewValue2 = 0;
+ let TSFlags{53-51} = opNewValue2; // Second New-value produced operand.
+
+ bits<1> isAccumulator = 0;
+ let TSFlags{54} = isAccumulator;
+
// Fields used for relation models.
+ bit isNonTemporal = 0;
+ string isNT = ""; // set to "true" for non-temporal vector stores.
string BaseOpcode = "";
string CextOpcode = "";
string PredSense = "";
string PNewValue = "";
string NValueST = ""; // Set to "true" for new-value stores.
string InputType = ""; // Input is "imm" or "reg" type.
- string isMEMri = "false"; // Set to "true" for load/store with MEMri operand.
string isFloat = "false"; // Set to "true" for the floating-point load/store.
string isBrTaken = !if(isTaken, "true", "false"); // Set to "true"/"false" for jump instructions
@@ -182,6 +195,7 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
"");
let PNewValue = !if(isPredicatedNew, "new", "");
let NValueST = !if(isNVStore, "true", "false");
+ let isNT = !if(isNonTemporal, "true", "false");
// *** Must match MCTargetDesc/HexagonBaseInfo.h ***
}
@@ -217,6 +231,11 @@ class LD0Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "", InstrItinClass itin=LD_tc_ld_SLOT0>
: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>, OpcodeHexagon;
+let mayLoad = 1 in
+class LD1Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin=LD_tc_ld_SLOT0>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>;
+
// ST Instruction Class in V2/V3 can take SLOT0 only.
// ST Instruction Class in V4 can take SLOT0 & SLOT1.
// Definition of the instruction class CHANGED from V2/V3 to V4.
@@ -234,6 +253,12 @@ class ST0Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "", InstrItinClass itin = ST_tc_ld_SLOT0>
: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>, OpcodeHexagon;
+// Same as ST0Inst but doesn't derive from OpcodeHexagon.
+let mayStore = 1 in
+class ST1Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = ST_tc_st_SLOT0>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>;
+
// ST Instruction Class in V2/V3 can take SLOT0 only.
// ST Instruction Class in V4 can take SLOT0 & SLOT1.
// Definition of the instruction class CHANGED from V2/V3 to V4.
@@ -277,6 +302,11 @@ class MInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>,
OpcodeHexagon;
+// Same as above but doesn't derive from OpcodeHexagon
+class MInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = M_tc_3x_SLOT23>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>;
+
// M Instruction Class in V2/V3.
// XTYPE Instruction Class in V4.
// Definition of the instruction class NOT CHANGED.
@@ -294,6 +324,10 @@ class SInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>,
OpcodeHexagon;
+class SInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = S_2op_tc_1_SLOT23>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>;
+
// S Instruction Class in V2/V3.
// XTYPE Instruction Class in V4.
// Definition of the instruction class NOT CHANGED.
@@ -402,3 +436,13 @@ include "HexagonInstrFormatsV4.td"
//===----------------------------------------------------------------------===//
// V4 Instruction Format Definitions +
//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// V60 Instruction Format Definitions +
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrFormatsV60.td"
+
+//===----------------------------------------------------------------------===//
+// V60 Instruction Format Definitions +
+//===----------------------------------------------------------------------===//
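A note on the TSFlags bits added above (hasNewValue2 at bit 50, opNewValue2 at bits 53-51, isAccumulator at bit 54): per the "*** Must match MCTargetDesc/HexagonBaseInfo.h ***" comment, they are read back via shift/mask accessors on the C++ side. The following is an illustrative sketch using the positions from the .td; the enumerator and function names are assumptions, not the actual HexagonBaseInfo.h contents.

#include <cstdint>

// Bit positions mirror the .td: TSFlags{50} = hasNewValue2,
// TSFlags{53-51} = opNewValue2, TSFlags{54} = isAccumulator.
enum : unsigned {
  HasNewValue2Pos = 50, HasNewValue2Mask = 0x1,
  OpNewValue2Pos  = 51, OpNewValue2Mask  = 0x7,
  AccumulatorPos  = 54, AccumulatorMask  = 0x1
};

static bool hasNewValue2(uint64_t TSFlags) {
  return (TSFlags >> HasNewValue2Pos) & HasNewValue2Mask;
}
static unsigned opNewValue2(uint64_t TSFlags) {
  return (TSFlags >> OpNewValue2Pos) & OpNewValue2Mask;
}
static bool isAccumulator(uint64_t TSFlags) {
  return (TSFlags >> AccumulatorPos) & AccumulatorMask;
}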
diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV4.td b/lib/Target/Hexagon/HexagonInstrFormatsV4.td
index db83ef6bc474..2d1dea526eed 100644
--- a/lib/Target/Hexagon/HexagonInstrFormatsV4.td
+++ b/lib/Target/Hexagon/HexagonInstrFormatsV4.td
@@ -21,8 +21,6 @@ def TypeMEMOP : IType<9>;
def TypeNV : IType<10>;
def TypeDUPLEX : IType<11>;
def TypeCOMPOUND : IType<12>;
-def TypeAG_VX : IType<28>;
-def TypeAG_VM : IType<29>;
def TypePREFIX : IType<30>;
// Duplex Instruction Class Declaration
diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV60.td b/lib/Target/Hexagon/HexagonInstrFormatsV60.td
new file mode 100644
index 000000000000..f3d43dec733e
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonInstrFormatsV60.td
@@ -0,0 +1,238 @@
+//==- HexagonInstrFormatsV60.td - Hexagon Instruction Formats -*- tablegen -==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V60 instruction classes in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//----------------------------------------------------------------------------//
+// Hexagon Instruction Flags +
+//
+// *** Must match BaseInfo.h ***
+//----------------------------------------------------------------------------//
+
+def TypeCVI_VA : IType<13>;
+def TypeCVI_VA_DV : IType<14>;
+def TypeCVI_VX : IType<15>;
+def TypeCVI_VX_DV : IType<16>;
+def TypeCVI_VP : IType<17>;
+def TypeCVI_VP_VS : IType<18>;
+def TypeCVI_VS : IType<19>;
+def TypeCVI_VINLANESAT : IType<20>;
+def TypeCVI_VM_LD : IType<21>;
+def TypeCVI_VM_TMP_LD : IType<22>;
+def TypeCVI_VM_CUR_LD : IType<23>;
+def TypeCVI_VM_VP_LDU : IType<24>;
+def TypeCVI_VM_ST : IType<25>;
+def TypeCVI_VM_NEW_ST : IType<26>;
+def TypeCVI_VM_STU : IType<27>;
+def TypeCVI_HIST : IType<28>;
+//----------------------------------------------------------------------------//
+// Instruction Classes Definitions +
+//----------------------------------------------------------------------------//
+
+let validSubTargets = HasV60SubT in
+{
+class CVI_VA_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VA>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VA_DV_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VA_DV>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA_DV>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_Resource_long<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VX_LONG>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_Resource_late<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VX_LATE>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX>,
+ Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VX>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_DV_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VX_DV>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_DV_Slot2_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VX_DV_SLOT2>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_DV_Resource_long<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VX_DV_LONG>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VP_Resource_long<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VP_LONG>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VP_VS_Resource_early<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VP_VS_EARLY>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP_VS>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VP_VS_Resource_long<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VP_VS_LONG>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP_VS>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VP_VS_Resource_long_early<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VP_VS_LONG_EARLY>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP_VS>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VS_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VS>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VS>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VINLANESAT_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VINLANESAT>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VINLANESAT>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VS_Resource_long<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VS>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VS>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_LD_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_LD>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_LD>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_LD_Resource_long<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_LD>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_LD>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_TMP_LD_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_TMP_LD>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_TMP_LD>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_TMP_LD_Resource_long<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_TMP_LD>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_TMP_LD>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_CUR_LD_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_CUR_LD>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_CUR_LD>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_VP_LDU_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_VP_LDU>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_VP_LDU>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_VP_LDU_Resource_long<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_VP_LDU>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_VP_LDU>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_ST_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_ST>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_ST>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_ST_Resource_long<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_ST>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_ST>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_NEW_ST_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_NEW_ST>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_NEW_ST>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_NEW_ST_Resource_long<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_NEW_ST>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_NEW_ST>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_STU_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_STU>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_STU>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_STU_Resource_long<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VM_STU>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_STU>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_HIST_Resource<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_HIST>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_HIST>,
+ OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+}
+
+let validSubTargets = HasV60SubT in
+{
+class CVI_VA_Resource1<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VA>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA>,
+ Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_DV_Resource1<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_VX_DV>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>,
+ Requires<[HasV60T, UseHVX]>;
+
+class CVI_HIST_Resource1<dag outs, dag ins, string asmstr,
+ list<dag> pattern = [], string cstr = "",
+ InstrItinClass itin = CVI_HIST>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_HIST>,
+ Requires<[HasV60T, UseHVX]>;
+}
+
+
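The TypeCVI_* IType values above (13 through 28) form the instruction-type field consumed by the MC layer, and the comment requires them to stay in sync with BaseInfo.h. A sketch of the matching constants, where only the numeric values are taken from the .td and the enum name is an assumption:

// Values must track the IType defs above; 13..28 cover the CVI classes.
namespace HexagonII {
enum CVIInstrType : unsigned {
  TypeCVI_VA = 13,        TypeCVI_VA_DV = 14,      TypeCVI_VX = 15,
  TypeCVI_VX_DV = 16,     TypeCVI_VP = 17,         TypeCVI_VP_VS = 18,
  TypeCVI_VS = 19,        TypeCVI_VINLANESAT = 20, TypeCVI_VM_LD = 21,
  TypeCVI_VM_TMP_LD = 22, TypeCVI_VM_CUR_LD = 23,  TypeCVI_VM_VP_LDU = 24,
  TypeCVI_VM_ST = 25,     TypeCVI_VM_NEW_ST = 26,  TypeCVI_VM_STU = 27,
  TypeCVI_HIST = 28
};
} // namespace HexagonII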
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 3cb082349b41..eb3590cb1076 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -23,9 +23,11 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <cctype>
using namespace llvm;
@@ -36,9 +38,41 @@ using namespace llvm;
#include "HexagonGenInstrInfo.inc"
#include "HexagonGenDFAPacketizer.inc"
+using namespace llvm;
+
+cl::opt<bool> ScheduleInlineAsm("hexagon-sched-inline-asm", cl::Hidden,
+ cl::init(false), cl::desc("Do not consider inline-asm a scheduling/"
+ "packetization boundary."));
+
+static cl::opt<bool> EnableBranchPrediction("hexagon-enable-branch-prediction",
+ cl::Hidden, cl::init(true), cl::desc("Enable branch prediction"));
+
+static cl::opt<bool> DisableNVSchedule("disable-hexagon-nv-schedule",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Disable schedule adjustment for new value stores."));
+
+static cl::opt<bool> EnableTimingClassLatency(
+ "enable-timing-class-latency", cl::Hidden, cl::init(false),
+ cl::desc("Enable timing class latency"));
+
+static cl::opt<bool> EnableALUForwarding(
+ "enable-alu-forwarding", cl::Hidden, cl::init(true),
+ cl::desc("Enable vec alu forwarding"));
+
+static cl::opt<bool> EnableACCForwarding(
+ "enable-acc-forwarding", cl::Hidden, cl::init(true),
+ cl::desc("Enable vec acc forwarding"));
+
+static cl::opt<bool> BranchRelaxAsmLarge("branch-relax-asm-large",
+ cl::init(true), cl::Hidden, cl::ZeroOrMore, cl::desc("branch relax asm"));
+
///
/// Constants for Hexagon instructions.
///
+const int Hexagon_MEMV_OFFSET_MAX_128B = 2047; // #s7
+const int Hexagon_MEMV_OFFSET_MIN_128B = -2048; // #s7
+const int Hexagon_MEMV_OFFSET_MAX = 1023; // #s6
+const int Hexagon_MEMV_OFFSET_MIN = -1024; // #s6
const int Hexagon_MEMW_OFFSET_MAX = 4095;
const int Hexagon_MEMW_OFFSET_MIN = -4096;
const int Hexagon_MEMD_OFFSET_MAX = 8191;
@@ -57,71 +91,49 @@ const int Hexagon_MEMH_AUTOINC_MAX = 14;
const int Hexagon_MEMH_AUTOINC_MIN = -16;
const int Hexagon_MEMB_AUTOINC_MAX = 7;
const int Hexagon_MEMB_AUTOINC_MIN = -8;
+const int Hexagon_MEMV_AUTOINC_MAX = 192;
+const int Hexagon_MEMV_AUTOINC_MIN = -256;
+const int Hexagon_MEMV_AUTOINC_MAX_128B = 384;
+const int Hexagon_MEMV_AUTOINC_MIN_128B = -512;
// Pin the vtable to this file.
void HexagonInstrInfo::anchor() {}
HexagonInstrInfo::HexagonInstrInfo(HexagonSubtarget &ST)
: HexagonGenInstrInfo(Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP),
- RI(), Subtarget(ST) {}
+ RI() {}
-/// isLoadFromStackSlot - If the specified machine instruction is a direct
-/// load from a stack slot, return the virtual or physical register number of
-/// the destination along with the FrameIndex of the loaded stack slot. If
-/// not, return 0. This predicate must return 0 if the instruction has
-/// any side effects other than loading from the stack slot.
-unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
- int &FrameIndex) const {
+static bool isIntRegForSubInst(unsigned Reg) {
+ return (Reg >= Hexagon::R0 && Reg <= Hexagon::R7) ||
+ (Reg >= Hexagon::R16 && Reg <= Hexagon::R23);
+}
- switch (MI->getOpcode()) {
- default: break;
- case Hexagon::L2_loadri_io:
- case Hexagon::L2_loadrd_io:
- case Hexagon::L2_loadrh_io:
- case Hexagon::L2_loadrb_io:
- case Hexagon::L2_loadrub_io:
- if (MI->getOperand(2).isFI() &&
- MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) {
- FrameIndex = MI->getOperand(2).getIndex();
- return MI->getOperand(0).getReg();
- }
- break;
- }
- return 0;
+
+static bool isDblRegForSubInst(unsigned Reg, const HexagonRegisterInfo &HRI) {
+ return isIntRegForSubInst(HRI.getSubReg(Reg, Hexagon::subreg_loreg)) &&
+ isIntRegForSubInst(HRI.getSubReg(Reg, Hexagon::subreg_hireg));
}
-/// isStoreToStackSlot - If the specified machine instruction is a direct
-/// store to a stack slot, return the virtual or physical register number of
-/// the source reg along with the FrameIndex of the loaded stack slot. If
-/// not, return 0. This predicate must return 0 if the instruction has
-/// any side effects other than storing to the stack slot.
-unsigned HexagonInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
- int &FrameIndex) const {
- switch (MI->getOpcode()) {
- default: break;
- case Hexagon::S2_storeri_io:
- case Hexagon::S2_storerd_io:
- case Hexagon::S2_storerh_io:
- case Hexagon::S2_storerb_io:
- if (MI->getOperand(2).isFI() &&
- MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) {
- FrameIndex = MI->getOperand(0).getIndex();
- return MI->getOperand(2).getReg();
- }
- break;
+/// Calculate number of instructions excluding the debug instructions.
+static unsigned nonDbgMICount(MachineBasicBlock::const_instr_iterator MIB,
+ MachineBasicBlock::const_instr_iterator MIE) {
+ unsigned Count = 0;
+ for (; MIB != MIE; ++MIB) {
+ if (!MIB->isDebugValue())
+ ++Count;
}
- return 0;
+ return Count;
}
-// Find the hardware loop instruction used to set-up the specified loop.
-// On Hexagon, we have two instructions used to set-up the hardware loop
-// (LOOP0, LOOP1) with corresponding endloop (ENDLOOP0, ENDLOOP1) instructions
-// to indicate the end of a loop.
-static MachineInstr *
-findLoopInstr(MachineBasicBlock *BB, int EndLoopOp,
- SmallPtrSet<MachineBasicBlock *, 8> &Visited) {
+
+/// Find the hardware loop instruction used to set-up the specified loop.
+/// On Hexagon, we have two instructions used to set-up the hardware loop
+/// (LOOP0, LOOP1) with corresponding endloop (ENDLOOP0, ENDLOOP1) instructions
+/// to indicate the end of a loop.
+static MachineInstr *findLoopInstr(MachineBasicBlock *BB, int EndLoopOp,
+ SmallPtrSet<MachineBasicBlock *, 8> &Visited) {
int LOOPi;
int LOOPr;
if (EndLoopOp == Hexagon::ENDLOOP0) {
@@ -157,100 +169,108 @@ findLoopInstr(MachineBasicBlock *BB, int EndLoopOp,
return 0;
}
-unsigned HexagonInstrInfo::InsertBranch(
- MachineBasicBlock &MBB,MachineBasicBlock *TBB, MachineBasicBlock *FBB,
- ArrayRef<MachineOperand> Cond, DebugLoc DL) const {
- Opcode_t BOpc = Hexagon::J2_jump;
- Opcode_t BccOpc = Hexagon::J2_jumpt;
+/// Gather register def/uses from MI.
+/// This treats possible (predicated) defs as actually happening ones
+/// (conservatively).
+static inline void parseOperands(const MachineInstr *MI,
+ SmallVector<unsigned, 4> &Defs, SmallVector<unsigned, 8> &Uses) {
+ Defs.clear();
+ Uses.clear();
- assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
- // Check if ReverseBranchCondition has asked to reverse this branch
- // If we want to reverse the branch an odd number of times, we want
- // J2_jumpf.
- if (!Cond.empty() && Cond[0].isImm())
- BccOpc = Cond[0].getImm();
+ if (!MO.isReg())
+ continue;
- if (!FBB) {
- if (Cond.empty()) {
- // Due to a bug in TailMerging/CFG Optimization, we need to add a
- // special case handling of a predicated jump followed by an
- // unconditional jump. If not, Tail Merging and CFG Optimization go
- // into an infinite loop.
- MachineBasicBlock *NewTBB, *NewFBB;
- SmallVector<MachineOperand, 4> Cond;
- MachineInstr *Term = MBB.getFirstTerminator();
- if (Term != MBB.end() && isPredicated(Term) &&
- !AnalyzeBranch(MBB, NewTBB, NewFBB, Cond, false)) {
- MachineBasicBlock *NextBB =
- std::next(MachineFunction::iterator(&MBB));
- if (NewTBB == NextBB) {
- ReverseBranchCondition(Cond);
- RemoveBranch(MBB);
- return InsertBranch(MBB, TBB, nullptr, Cond, DL);
- }
- }
- BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB);
- } else if (isEndLoopN(Cond[0].getImm())) {
- int EndLoopOp = Cond[0].getImm();
- assert(Cond[1].isMBB());
- // Since we're adding an ENDLOOP, there better be a LOOP instruction.
- // Check for it, and change the BB target if needed.
- SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs;
- MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs);
- assert(Loop != 0 && "Inserting an ENDLOOP without a LOOP");
- Loop->getOperand(0).setMBB(TBB);
- // Add the ENDLOOP after the finding the LOOP0.
- BuildMI(&MBB, DL, get(EndLoopOp)).addMBB(TBB);
- } else if (isNewValueJump(Cond[0].getImm())) {
- assert((Cond.size() == 3) && "Only supporting rr/ri version of nvjump");
- // New value jump
- // (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset)
- // (ins IntRegs:$src1, u5Imm:$src2, brtarget:$offset)
- unsigned Flags1 = getUndefRegState(Cond[1].isUndef());
- DEBUG(dbgs() << "\nInserting NVJump for BB#" << MBB.getNumber(););
- if (Cond[2].isReg()) {
- unsigned Flags2 = getUndefRegState(Cond[2].isUndef());
- BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[1].getReg(), Flags1).
- addReg(Cond[2].getReg(), Flags2).addMBB(TBB);
- } else if(Cond[2].isImm()) {
- BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[1].getReg(), Flags1).
- addImm(Cond[2].getImm()).addMBB(TBB);
- } else
- llvm_unreachable("Invalid condition for branching");
- } else {
- assert((Cond.size() == 2) && "Malformed cond vector");
- const MachineOperand &RO = Cond[1];
- unsigned Flags = getUndefRegState(RO.isUndef());
- BuildMI(&MBB, DL, get(BccOpc)).addReg(RO.getReg(), Flags).addMBB(TBB);
- }
- return 1;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+
+ if (MO.isUse())
+ Uses.push_back(MO.getReg());
+
+ if (MO.isDef())
+ Defs.push_back(MO.getReg());
}
- assert((!Cond.empty()) &&
- "Cond. cannot be empty when multiple branchings are required");
- assert((!isNewValueJump(Cond[0].getImm())) &&
- "NV-jump cannot be inserted with another branch");
- // Special case for hardware loops. The condition is a basic block.
- if (isEndLoopN(Cond[0].getImm())) {
- int EndLoopOp = Cond[0].getImm();
- assert(Cond[1].isMBB());
- // Since we're adding an ENDLOOP, there better be a LOOP instruction.
- // Check for it, and change the BB target if needed.
- SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs;
- MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs);
- assert(Loop != 0 && "Inserting an ENDLOOP without a LOOP");
- Loop->getOperand(0).setMBB(TBB);
- // Add the ENDLOOP after the finding the LOOP0.
- BuildMI(&MBB, DL, get(EndLoopOp)).addMBB(TBB);
- } else {
- const MachineOperand &RO = Cond[1];
- unsigned Flags = getUndefRegState(RO.isUndef());
- BuildMI(&MBB, DL, get(BccOpc)).addReg(RO.getReg(), Flags).addMBB(TBB);
+}
+
+
+// Position dependent, so check twice for swap.
+static bool isDuplexPairMatch(unsigned Ga, unsigned Gb) {
+ switch (Ga) {
+ case HexagonII::HSIG_None:
+ default:
+ return false;
+ case HexagonII::HSIG_L1:
+ return (Gb == HexagonII::HSIG_L1 || Gb == HexagonII::HSIG_A);
+ case HexagonII::HSIG_L2:
+ return (Gb == HexagonII::HSIG_L1 || Gb == HexagonII::HSIG_L2 ||
+ Gb == HexagonII::HSIG_A);
+ case HexagonII::HSIG_S1:
+ return (Gb == HexagonII::HSIG_L1 || Gb == HexagonII::HSIG_L2 ||
+ Gb == HexagonII::HSIG_S1 || Gb == HexagonII::HSIG_A);
+ case HexagonII::HSIG_S2:
+ return (Gb == HexagonII::HSIG_L1 || Gb == HexagonII::HSIG_L2 ||
+ Gb == HexagonII::HSIG_S1 || Gb == HexagonII::HSIG_S2 ||
+ Gb == HexagonII::HSIG_A);
+ case HexagonII::HSIG_A:
+ return (Gb == HexagonII::HSIG_A);
+ case HexagonII::HSIG_Compound:
+ return (Gb == HexagonII::HSIG_Compound);
}
- BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB);
+ return false;
+}
- return 2;
+
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot. If
+/// not, return 0. This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case Hexagon::L2_loadri_io:
+ case Hexagon::L2_loadrd_io:
+ case Hexagon::L2_loadrh_io:
+ case Hexagon::L2_loadrb_io:
+ case Hexagon::L2_loadrub_io:
+ if (MI->getOperand(2).isFI() &&
+ MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) {
+ FrameIndex = MI->getOperand(2).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ }
+ return 0;
+}
+
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the loaded stack slot. If
+/// not, return 0. This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned HexagonInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case Hexagon::S2_storeri_io:
+ case Hexagon::S2_storerd_io:
+ case Hexagon::S2_storerh_io:
+ case Hexagon::S2_storerb_io:
+ if (MI->getOperand(2).isFI() &&
+ MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) {
+ FrameIndex = MI->getOperand(0).getIndex();
+ return MI->getOperand(2).getReg();
+ }
+ break;
+ }
+ return 0;
}
@@ -269,9 +289,6 @@ unsigned HexagonInstrInfo::InsertBranch(
/// Cond[0] = Hexagon::CMPEQri_f_Jumpnv_t_V4 -- specific opcode
/// Cond[1] = R
/// Cond[2] = Imm
-/// @note Related function is \fn findInstrPredicate which fills in
-/// Cond. vector when a predicated instruction is passed to it.
-/// We follow same protocol in that case too.
///
bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock *&TBB,
@@ -314,7 +331,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
return false;
--I;
}
-
+
bool JumpToBlock = I->getOpcode() == Hexagon::J2_jump &&
I->getOperand(0).isMBB();
// Delete the J2_jump if it's equivalent to a fall-through.
@@ -327,17 +344,17 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
return false;
--I;
}
- if (!isUnpredicatedTerminator(I))
+ if (!isUnpredicatedTerminator(&*I))
return false;
// Get the last instruction in the block.
- MachineInstr *LastInst = I;
+ MachineInstr *LastInst = &*I;
MachineInstr *SecondLastInst = nullptr;
// Find one more terminator if present.
- do {
- if (&*I != LastInst && !I->isBundle() && isUnpredicatedTerminator(I)) {
+ for (;;) {
+ if (&*I != LastInst && !I->isBundle() && isUnpredicatedTerminator(&*I)) {
if (!SecondLastInst)
- SecondLastInst = I;
+ SecondLastInst = &*I;
else
// This is a third branch.
return true;
@@ -345,7 +362,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
if (I == MBB.instr_begin())
break;
--I;
- } while(I);
+ }
int LastOpcode = LastInst->getOpcode();
int SecLastOpcode = SecondLastInst ? SecondLastInst->getOpcode() : 0;
@@ -418,7 +435,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
// executed, so remove it.
if (SecLastOpcode == Hexagon::J2_jump && LastOpcode == Hexagon::J2_jump) {
TBB = SecondLastInst->getOperand(0).getMBB();
- I = LastInst;
+ I = LastInst->getIterator();
if (AllowModify)
I->eraseFromParent();
return false;
@@ -438,6 +455,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
return true;
}
+
unsigned HexagonInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
DEBUG(dbgs() << "\nRemoving branches out of BB#" << MBB.getNumber());
MachineBasicBlock::iterator I = MBB.end();
@@ -458,100 +476,127 @@ unsigned HexagonInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
return Count;
}
-/// \brief For a comparison instruction, return the source registers in
-/// \p SrcReg and \p SrcReg2 if having two register operands, and the value it
-/// compares against in CmpValue. Return true if the comparison instruction
-/// can be analyzed.
-bool HexagonInstrInfo::analyzeCompare(const MachineInstr *MI,
- unsigned &SrcReg, unsigned &SrcReg2,
- int &Mask, int &Value) const {
- unsigned Opc = MI->getOpcode();
- // Set mask and the first source register.
- switch (Opc) {
- case Hexagon::C2_cmpeq:
- case Hexagon::C2_cmpeqp:
- case Hexagon::C2_cmpgt:
- case Hexagon::C2_cmpgtp:
- case Hexagon::C2_cmpgtu:
- case Hexagon::C2_cmpgtup:
- case Hexagon::C4_cmpneq:
- case Hexagon::C4_cmplte:
- case Hexagon::C4_cmplteu:
- case Hexagon::C2_cmpeqi:
- case Hexagon::C2_cmpgti:
- case Hexagon::C2_cmpgtui:
- case Hexagon::C4_cmpneqi:
- case Hexagon::C4_cmplteui:
- case Hexagon::C4_cmpltei:
- SrcReg = MI->getOperand(1).getReg();
- Mask = ~0;
- break;
- case Hexagon::A4_cmpbeq:
- case Hexagon::A4_cmpbgt:
- case Hexagon::A4_cmpbgtu:
- case Hexagon::A4_cmpbeqi:
- case Hexagon::A4_cmpbgti:
- case Hexagon::A4_cmpbgtui:
- SrcReg = MI->getOperand(1).getReg();
- Mask = 0xFF;
- break;
- case Hexagon::A4_cmpheq:
- case Hexagon::A4_cmphgt:
- case Hexagon::A4_cmphgtu:
- case Hexagon::A4_cmpheqi:
- case Hexagon::A4_cmphgti:
- case Hexagon::A4_cmphgtui:
- SrcReg = MI->getOperand(1).getReg();
- Mask = 0xFFFF;
- break;
- }
+unsigned HexagonInstrInfo::InsertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB, MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond, DebugLoc DL) const {
+ unsigned BOpc = Hexagon::J2_jump;
+ unsigned BccOpc = Hexagon::J2_jumpt;
+ assert(validateBranchCond(Cond) && "Invalid branching condition");
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
- // Set the value/second source register.
- switch (Opc) {
- case Hexagon::C2_cmpeq:
- case Hexagon::C2_cmpeqp:
- case Hexagon::C2_cmpgt:
- case Hexagon::C2_cmpgtp:
- case Hexagon::C2_cmpgtu:
- case Hexagon::C2_cmpgtup:
- case Hexagon::A4_cmpbeq:
- case Hexagon::A4_cmpbgt:
- case Hexagon::A4_cmpbgtu:
- case Hexagon::A4_cmpheq:
- case Hexagon::A4_cmphgt:
- case Hexagon::A4_cmphgtu:
- case Hexagon::C4_cmpneq:
- case Hexagon::C4_cmplte:
- case Hexagon::C4_cmplteu:
- SrcReg2 = MI->getOperand(2).getReg();
- return true;
+ // Check if ReverseBranchCondition has asked to reverse this branch
+ // If we want to reverse the branch an odd number of times, we want
+ // J2_jumpf.
+ if (!Cond.empty() && Cond[0].isImm())
+ BccOpc = Cond[0].getImm();
- case Hexagon::C2_cmpeqi:
- case Hexagon::C2_cmpgtui:
- case Hexagon::C2_cmpgti:
- case Hexagon::C4_cmpneqi:
- case Hexagon::C4_cmplteui:
- case Hexagon::C4_cmpltei:
- case Hexagon::A4_cmpbeqi:
- case Hexagon::A4_cmpbgti:
- case Hexagon::A4_cmpbgtui:
- case Hexagon::A4_cmpheqi:
- case Hexagon::A4_cmphgti:
- case Hexagon::A4_cmphgtui:
- SrcReg2 = 0;
- Value = MI->getOperand(2).getImm();
- return true;
+ if (!FBB) {
+ if (Cond.empty()) {
+ // Due to a bug in TailMerging/CFG Optimization, we need to add a
+ // special case handling of a predicated jump followed by an
+ // unconditional jump. If not, Tail Merging and CFG Optimization go
+ // into an infinite loop.
+ MachineBasicBlock *NewTBB, *NewFBB;
+ SmallVector<MachineOperand, 4> Cond;
+ MachineInstr *Term = MBB.getFirstTerminator();
+ if (Term != MBB.end() && isPredicated(Term) &&
+ !AnalyzeBranch(MBB, NewTBB, NewFBB, Cond, false)) {
+ MachineBasicBlock *NextBB = &*++MBB.getIterator();
+ if (NewTBB == NextBB) {
+ ReverseBranchCondition(Cond);
+ RemoveBranch(MBB);
+ return InsertBranch(MBB, TBB, nullptr, Cond, DL);
+ }
+ }
+ BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB);
+ } else if (isEndLoopN(Cond[0].getImm())) {
+ int EndLoopOp = Cond[0].getImm();
+ assert(Cond[1].isMBB());
+ // Since we're adding an ENDLOOP, there better be a LOOP instruction.
+ // Check for it, and change the BB target if needed.
+ SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs;
+ MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs);
+ assert(Loop != 0 && "Inserting an ENDLOOP without a LOOP");
+ Loop->getOperand(0).setMBB(TBB);
+ // Add the ENDLOOP after finding the LOOP0.
+ BuildMI(&MBB, DL, get(EndLoopOp)).addMBB(TBB);
+ } else if (isNewValueJump(Cond[0].getImm())) {
+ assert((Cond.size() == 3) && "Only supporting rr/ri version of nvjump");
+ // New value jump
+ // (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset)
+ // (ins IntRegs:$src1, u5Imm:$src2, brtarget:$offset)
+ unsigned Flags1 = getUndefRegState(Cond[1].isUndef());
+ DEBUG(dbgs() << "\nInserting NVJump for BB#" << MBB.getNumber(););
+ if (Cond[2].isReg()) {
+ unsigned Flags2 = getUndefRegState(Cond[2].isUndef());
+ BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[1].getReg(), Flags1).
+ addReg(Cond[2].getReg(), Flags2).addMBB(TBB);
+ } else if(Cond[2].isImm()) {
+ BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[1].getReg(), Flags1).
+ addImm(Cond[2].getImm()).addMBB(TBB);
+ } else
+ llvm_unreachable("Invalid condition for branching");
+ } else {
+ assert((Cond.size() == 2) && "Malformed cond vector");
+ const MachineOperand &RO = Cond[1];
+ unsigned Flags = getUndefRegState(RO.isUndef());
+ BuildMI(&MBB, DL, get(BccOpc)).addReg(RO.getReg(), Flags).addMBB(TBB);
+ }
+ return 1;
}
+ assert((!Cond.empty()) &&
+ "Cond. cannot be empty when multiple branchings are required");
+ assert((!isNewValueJump(Cond[0].getImm())) &&
+ "NV-jump cannot be inserted with another branch");
+ // Special case for hardware loops. The condition is a basic block.
+ if (isEndLoopN(Cond[0].getImm())) {
+ int EndLoopOp = Cond[0].getImm();
+ assert(Cond[1].isMBB());
+ // Since we're adding an ENDLOOP, there better be a LOOP instruction.
+ // Check for it, and change the BB target if needed.
+ SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs;
+ MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs);
+ assert(Loop != 0 && "Inserting an ENDLOOP without a LOOP");
+ Loop->getOperand(0).setMBB(TBB);
+ // Add the ENDLOOP after finding the LOOP0.
+ BuildMI(&MBB, DL, get(EndLoopOp)).addMBB(TBB);
+ } else {
+ const MachineOperand &RO = Cond[1];
+ unsigned Flags = getUndefRegState(RO.isUndef());
+ BuildMI(&MBB, DL, get(BccOpc)).addReg(RO.getReg(), Flags).addMBB(TBB);
+ }
+ BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB);
- return false;
+ return 2;
+}
+
+
+bool HexagonInstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
+ unsigned NumCycles, unsigned ExtraPredCycles,
+ BranchProbability Probability) const {
+ return nonDbgBBSize(&MBB) <= 3;
+}
+
+
+bool HexagonInstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
+ unsigned NumTCycles, unsigned ExtraTCycles, MachineBasicBlock &FMBB,
+ unsigned NumFCycles, unsigned ExtraFCycles, BranchProbability Probability)
+ const {
+ return nonDbgBBSize(&TMBB) <= 3 && nonDbgBBSize(&FMBB) <= 3;
+}
+
+
+bool HexagonInstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
+ unsigned NumInstrs, BranchProbability Probability) const {
+ return NumInstrs <= 4;
}
void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const {
+ MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
+ auto &HRI = getRegisterInfo();
if (Hexagon::IntRegsRegClass.contains(SrcReg, DestReg)) {
BuildMI(MBB, I, DL, get(Hexagon::A2_tfr), DestReg).addReg(SrcReg);
return;
@@ -599,28 +644,74 @@ void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
addReg(SrcReg, getKillRegState(KillSrc));
return;
}
+ if (Hexagon::PredRegsRegClass.contains(SrcReg) &&
+ Hexagon::IntRegsRegClass.contains(DestReg)) {
+ BuildMI(MBB, I, DL, get(Hexagon::C2_tfrpr), DestReg).
+ addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+ if (Hexagon::VectorRegsRegClass.contains(SrcReg, DestReg)) {
+ BuildMI(MBB, I, DL, get(Hexagon::V6_vassign), DestReg).
+ addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+ if (Hexagon::VecDblRegsRegClass.contains(SrcReg, DestReg)) {
+ BuildMI(MBB, I, DL, get(Hexagon::V6_vcombine), DestReg).
+ addReg(HRI.getSubReg(SrcReg, Hexagon::subreg_hireg),
+ getKillRegState(KillSrc)).
+ addReg(HRI.getSubReg(SrcReg, Hexagon::subreg_loreg),
+ getKillRegState(KillSrc));
+ return;
+ }
+ if (Hexagon::VecPredRegsRegClass.contains(SrcReg, DestReg)) {
+ BuildMI(MBB, I, DL, get(Hexagon::V6_pred_and), DestReg).
+ addReg(SrcReg).
+ addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+ if (Hexagon::VecPredRegsRegClass.contains(SrcReg) &&
+ Hexagon::VectorRegsRegClass.contains(DestReg)) {
+ llvm_unreachable("Unimplemented pred to vec");
+ return;
+ }
+ if (Hexagon::VecPredRegsRegClass.contains(DestReg) &&
+ Hexagon::VectorRegsRegClass.contains(SrcReg)) {
+ llvm_unreachable("Unimplemented vec to pred");
+ return;
+ }
+ if (Hexagon::VecPredRegs128BRegClass.contains(SrcReg, DestReg)) {
+ BuildMI(MBB, I, DL, get(Hexagon::V6_pred_and),
+ HRI.getSubReg(DestReg, Hexagon::subreg_hireg)).
+ addReg(HRI.getSubReg(SrcReg, Hexagon::subreg_hireg),
+ getKillRegState(KillSrc));
+ BuildMI(MBB, I, DL, get(Hexagon::V6_pred_and),
+ HRI.getSubReg(DestReg, Hexagon::subreg_loreg)).
+ addReg(HRI.getSubReg(SrcReg, Hexagon::subreg_loreg),
+ getKillRegState(KillSrc));
+ return;
+ }
+#ifndef NDEBUG
+ // Show the invalid registers to ease debugging.
+ dbgs() << "Invalid registers for copy in BB#" << MBB.getNumber()
+ << ": " << PrintReg(DestReg, &HRI)
+ << " = " << PrintReg(SrcReg, &HRI) << '\n';
+#endif
llvm_unreachable("Unimplemented");
}
-void HexagonInstrInfo::
-storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned SrcReg, bool isKill, int FI,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const {
-
+void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, unsigned SrcReg, bool isKill, int FI,
+ const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const {
DebugLoc DL = MBB.findDebugLoc(I);
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = *MF.getFrameInfo();
unsigned Align = MFI.getObjectAlignment(FI);
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(
- MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)),
- MachineMemOperand::MOStore,
- MFI.getObjectSize(FI),
- Align);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
+ MFI.getObjectSize(FI), Align);
if (Hexagon::IntRegsRegClass.hasSubClassEq(RC)) {
BuildMI(MBB, I, DL, get(Hexagon::S2_storeri_io))
@@ -640,33 +731,17 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
}
-void HexagonInstrInfo::storeRegToAddr(
- MachineFunction &MF, unsigned SrcReg,
- bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const
-{
- llvm_unreachable("Unimplemented");
-}
-
-
-void HexagonInstrInfo::
-loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned DestReg, int FI,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const {
+void HexagonInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, unsigned DestReg, int FI,
+ const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const {
DebugLoc DL = MBB.findDebugLoc(I);
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = *MF.getFrameInfo();
unsigned Align = MFI.getObjectAlignment(FI);
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(
- MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)),
- MachineMemOperand::MOLoad,
- MFI.getObjectSize(FI),
- Align);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
+ MFI.getObjectSize(FI), Align);
if (RC == &Hexagon::IntRegsRegClass) {
BuildMI(MBB, I, DL, get(Hexagon::L2_loadri_io), DestReg)
.addFrameIndex(FI).addImm(0).addMemOperand(MMO);
@@ -682,27 +757,136 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
}
-void HexagonInstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const {
- llvm_unreachable("Unimplemented");
-}
-bool
-HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
- const HexagonRegisterInfo &TRI = getRegisterInfo();
+/// expandPostRAPseudo - This function is called for all pseudo instructions
+/// that remain after register allocation. Many pseudo instructions are
+/// created to help register allocation. This is the place to convert them
+/// into real instructions. The target can edit MI in place, or it can insert
+/// new instructions and erase MI. The function should return true if
+/// anything was changed.
+bool HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI)
+ const {
+ const HexagonRegisterInfo &HRI = getRegisterInfo();
MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
unsigned Opc = MI->getOpcode();
+ const unsigned VecOffset = 1;
+ bool Is128B = false;
switch (Opc) {
case Hexagon::ALIGNA:
BuildMI(MBB, MI, DL, get(Hexagon::A2_andir), MI->getOperand(0).getReg())
- .addReg(TRI.getFrameRegister())
+ .addReg(HRI.getFrameRegister())
.addImm(-MI->getOperand(1).getImm());
MBB.erase(MI);
return true;
+ case Hexagon::HEXAGON_V6_vassignp_128B:
+ case Hexagon::HEXAGON_V6_vassignp: {
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ unsigned DstReg = MI->getOperand(0).getReg();
+ if (SrcReg != DstReg)
+ copyPhysReg(MBB, MI, DL, DstReg, SrcReg, MI->getOperand(1).isKill());
+ MBB.erase(MI);
+ return true;
+ }
+ case Hexagon::HEXAGON_V6_lo_128B:
+ case Hexagon::HEXAGON_V6_lo: {
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::subreg_loreg);
+ copyPhysReg(MBB, MI, DL, DstReg, SrcSubLo, MI->getOperand(1).isKill());
+ MBB.erase(MI);
+ MRI.clearKillFlags(SrcSubLo);
+ return true;
+ }
+ case Hexagon::HEXAGON_V6_hi_128B:
+ case Hexagon::HEXAGON_V6_hi: {
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::subreg_hireg);
+ copyPhysReg(MBB, MI, DL, DstReg, SrcSubHi, MI->getOperand(1).isKill());
+ MBB.erase(MI);
+ MRI.clearKillFlags(SrcSubHi);
+ return true;
+ }
+ case Hexagon::STrivv_indexed_128B:
+ Is128B = true;
+ case Hexagon::STrivv_indexed: {
+ unsigned SrcReg = MI->getOperand(2).getReg();
+ unsigned SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::subreg_hireg);
+ unsigned SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::subreg_loreg);
+ unsigned NewOpcd = Is128B ? Hexagon::V6_vS32b_ai_128B
+ : Hexagon::V6_vS32b_ai;
+ unsigned Offset = Is128B ? VecOffset << 7 : VecOffset << 6;
+ MachineInstr *MI1New = BuildMI(MBB, MI, DL, get(NewOpcd))
+ .addOperand(MI->getOperand(0))
+ .addImm(MI->getOperand(1).getImm())
+ .addReg(SrcSubLo)
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ MI1New->getOperand(0).setIsKill(false);
+ BuildMI(MBB, MI, DL, get(NewOpcd))
+ .addOperand(MI->getOperand(0))
+ // The Vectors are indexed in multiples of vector size.
+ .addImm(MI->getOperand(1).getImm()+Offset)
+ .addReg(SrcSubHi)
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ MBB.erase(MI);
+ return true;
+ }
+ case Hexagon::LDrivv_pseudo_V6_128B:
+ case Hexagon::LDrivv_indexed_128B:
+ Is128B = true;
+ case Hexagon::LDrivv_pseudo_V6:
+ case Hexagon::LDrivv_indexed: {
+ unsigned NewOpcd = Is128B ? Hexagon::V6_vL32b_ai_128B
+ : Hexagon::V6_vL32b_ai;
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned Offset = Is128B ? VecOffset << 7 : VecOffset << 6;
+ MachineInstr *MI1New =
+ BuildMI(MBB, MI, DL, get(NewOpcd),
+ HRI.getSubReg(DstReg, Hexagon::subreg_loreg))
+ .addOperand(MI->getOperand(1))
+ .addImm(MI->getOperand(2).getImm());
+ MI1New->getOperand(1).setIsKill(false);
+ BuildMI(MBB, MI, DL, get(NewOpcd),
+ HRI.getSubReg(DstReg, Hexagon::subreg_hireg))
+ .addOperand(MI->getOperand(1))
+ // The Vectors are indexed in multiples of vector size.
+ .addImm(MI->getOperand(2).getImm() + Offset)
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ MBB.erase(MI);
+ return true;
+ }
+ case Hexagon::LDriv_pseudo_V6_128B:
+ Is128B = true;
+ case Hexagon::LDriv_pseudo_V6: {
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned NewOpc = Is128B ? Hexagon::V6_vL32b_ai_128B
+ : Hexagon::V6_vL32b_ai;
+ int32_t Off = MI->getOperand(2).getImm();
+ int32_t Idx = Off;
+ BuildMI(MBB, MI, DL, get(NewOpc), DstReg)
+ .addOperand(MI->getOperand(1))
+ .addImm(Idx)
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ MBB.erase(MI);
+ return true;
+ }
+ case Hexagon::STriv_pseudo_V6_128B:
+ Is128B = true;
+ case Hexagon::STriv_pseudo_V6: {
+ unsigned NewOpc = Is128B ? Hexagon::V6_vS32b_ai_128B
+ : Hexagon::V6_vS32b_ai;
+ int32_t Off = MI->getOperand(1).getImm();
+ int32_t Idx = Is128B ? (Off >> 7) : (Off >> 6);
+ BuildMI(MBB, MI, DL, get(NewOpc))
+ .addOperand(MI->getOperand(0))
+ .addImm(Idx)
+ .addOperand(MI->getOperand(2))
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ MBB.erase(MI);
+ return true;
+ }
case Hexagon::TFR_PdTrue: {
unsigned Reg = MI->getOperand(0).getReg();
BuildMI(MBB, MI, DL, get(Hexagon::C2_orn), Reg)
@@ -724,15 +908,15 @@ HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
unsigned DstReg = MI->getOperand(0).getReg();
unsigned Src1Reg = MI->getOperand(1).getReg();
unsigned Src2Reg = MI->getOperand(2).getReg();
- unsigned Src1SubHi = TRI.getSubReg(Src1Reg, Hexagon::subreg_hireg);
- unsigned Src1SubLo = TRI.getSubReg(Src1Reg, Hexagon::subreg_loreg);
- unsigned Src2SubHi = TRI.getSubReg(Src2Reg, Hexagon::subreg_hireg);
- unsigned Src2SubLo = TRI.getSubReg(Src2Reg, Hexagon::subreg_loreg);
+ unsigned Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::subreg_hireg);
+ unsigned Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::subreg_loreg);
+ unsigned Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::subreg_hireg);
+ unsigned Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::subreg_loreg);
BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_mpyi),
- TRI.getSubReg(DstReg, Hexagon::subreg_hireg)).addReg(Src1SubHi)
+ HRI.getSubReg(DstReg, Hexagon::subreg_hireg)).addReg(Src1SubHi)
.addReg(Src2SubHi);
BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_mpyi),
- TRI.getSubReg(DstReg, Hexagon::subreg_loreg)).addReg(Src1SubLo)
+ HRI.getSubReg(DstReg, Hexagon::subreg_loreg)).addReg(Src1SubLo)
.addReg(Src2SubLo);
MBB.erase(MI);
MRI.clearKillFlags(Src1SubHi);
@@ -747,17 +931,17 @@ HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
unsigned Src1Reg = MI->getOperand(1).getReg();
unsigned Src2Reg = MI->getOperand(2).getReg();
unsigned Src3Reg = MI->getOperand(3).getReg();
- unsigned Src1SubHi = TRI.getSubReg(Src1Reg, Hexagon::subreg_hireg);
- unsigned Src1SubLo = TRI.getSubReg(Src1Reg, Hexagon::subreg_loreg);
- unsigned Src2SubHi = TRI.getSubReg(Src2Reg, Hexagon::subreg_hireg);
- unsigned Src2SubLo = TRI.getSubReg(Src2Reg, Hexagon::subreg_loreg);
- unsigned Src3SubHi = TRI.getSubReg(Src3Reg, Hexagon::subreg_hireg);
- unsigned Src3SubLo = TRI.getSubReg(Src3Reg, Hexagon::subreg_loreg);
+ unsigned Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::subreg_hireg);
+ unsigned Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::subreg_loreg);
+ unsigned Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::subreg_hireg);
+ unsigned Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::subreg_loreg);
+ unsigned Src3SubHi = HRI.getSubReg(Src3Reg, Hexagon::subreg_hireg);
+ unsigned Src3SubLo = HRI.getSubReg(Src3Reg, Hexagon::subreg_loreg);
BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_maci),
- TRI.getSubReg(DstReg, Hexagon::subreg_hireg)).addReg(Src1SubHi)
+ HRI.getSubReg(DstReg, Hexagon::subreg_hireg)).addReg(Src1SubHi)
.addReg(Src2SubHi).addReg(Src3SubHi);
BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_maci),
- TRI.getSubReg(DstReg, Hexagon::subreg_loreg)).addReg(Src1SubLo)
+ HRI.getSubReg(DstReg, Hexagon::subreg_loreg)).addReg(Src1SubLo)
.addReg(Src2SubLo).addReg(Src3SubLo);
MBB.erase(MI);
MRI.clearKillFlags(Src1SubHi);
@@ -768,104 +952,168 @@ HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
MRI.clearKillFlags(Src3SubLo);
return true;
}
+ case Hexagon::MUX64_rr: {
+ const MachineOperand &Op0 = MI->getOperand(0);
+ const MachineOperand &Op1 = MI->getOperand(1);
+ const MachineOperand &Op2 = MI->getOperand(2);
+ const MachineOperand &Op3 = MI->getOperand(3);
+ unsigned Rd = Op0.getReg();
+ unsigned Pu = Op1.getReg();
+ unsigned Rs = Op2.getReg();
+ unsigned Rt = Op3.getReg();
+ DebugLoc DL = MI->getDebugLoc();
+ unsigned K1 = getKillRegState(Op1.isKill());
+ unsigned K2 = getKillRegState(Op2.isKill());
+ unsigned K3 = getKillRegState(Op3.isKill());
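+    // For example, "Rd = MUX64_rr Pu, Rs, Rt" becomes "if (Pu) Rd = Rs"
+    // (A2_tfrpt) followed by "if (!Pu) Rd = Rt" (A2_tfrpf); either copy is
+    // omitted when Rd is already equal to the corresponding source.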
+ if (Rd != Rs)
+ BuildMI(MBB, MI, DL, get(Hexagon::A2_tfrpt), Rd)
+ .addReg(Pu, (Rd == Rt) ? K1 : 0)
+ .addReg(Rs, K2);
+ if (Rd != Rt)
+ BuildMI(MBB, MI, DL, get(Hexagon::A2_tfrpf), Rd)
+ .addReg(Pu, K1)
+ .addReg(Rt, K3);
+ MBB.erase(MI);
+ return true;
+ }
case Hexagon::TCRETURNi:
MI->setDesc(get(Hexagon::J2_jump));
return true;
case Hexagon::TCRETURNr:
MI->setDesc(get(Hexagon::J2_jumpr));
return true;
+ case Hexagon::TFRI_f:
+ case Hexagon::TFRI_cPt_f:
+ case Hexagon::TFRI_cNotPt_f: {
+ unsigned Opx = (Opc == Hexagon::TFRI_f) ? 1 : 2;
+ APFloat FVal = MI->getOperand(Opx).getFPImm()->getValueAPF();
+ APInt IVal = FVal.bitcastToAPInt();
+ MI->RemoveOperand(Opx);
+ unsigned NewOpc = (Opc == Hexagon::TFRI_f) ? Hexagon::A2_tfrsi :
+ (Opc == Hexagon::TFRI_cPt_f) ? Hexagon::C2_cmoveit :
+ Hexagon::C2_cmoveif;
+ MI->setDesc(get(NewOpc));
+ MI->addOperand(MachineOperand::CreateImm(IVal.getZExtValue()));
+ return true;
+ }
}
return false;
}
-MachineInstr *HexagonInstrInfo::foldMemoryOperandImpl(
- MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt, int FI) const {
- // Hexagon_TODO: Implement.
- return nullptr;
+
+// We indicate that we want to reverse the branch by
+// inserting the reversed branching opcode.
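+// For example, a conditional jump on a true predicate (J2_jumpt) is
+// replaced with its false-predicate form (J2_jumpf), and vice versa.
+// ENDLOOP branches cannot be reversed, so we return true (failure) for them.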
+bool HexagonInstrInfo::ReverseBranchCondition(
+ SmallVectorImpl<MachineOperand> &Cond) const {
+ if (Cond.empty())
+ return true;
+ assert(Cond[0].isImm() && "First entry in the cond vector not imm-val");
+ unsigned opcode = Cond[0].getImm();
+ assert(get(opcode).isBranch() && "Should be a branching condition.");
+ if (isEndLoopN(opcode))
+ return true;
+ unsigned NewOpcode = getInvertedPredicatedOpcode(opcode);
+ Cond[0].setImm(NewOpcode);
+ return false;
}
-unsigned HexagonInstrInfo::createVR(MachineFunction* MF, MVT VT) const {
- MachineRegisterInfo &RegInfo = MF->getRegInfo();
- const TargetRegisterClass *TRC;
- if (VT == MVT::i1) {
- TRC = &Hexagon::PredRegsRegClass;
- } else if (VT == MVT::i32 || VT == MVT::f32) {
- TRC = &Hexagon::IntRegsRegClass;
- } else if (VT == MVT::i64 || VT == MVT::f64) {
- TRC = &Hexagon::DoubleRegsRegClass;
- } else {
- llvm_unreachable("Cannot handle this register class");
- }
+void HexagonInstrInfo::insertNoop(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const {
+ DebugLoc DL;
+ BuildMI(MBB, MI, DL, get(Hexagon::A2_nop));
+}
- unsigned NewReg = RegInfo.createVirtualRegister(TRC);
- return NewReg;
+
+// Returns true if an instruction is predicated irrespective of the predicate
+// sense. For example, all of the following will return true.
+// if (p0) R1 = add(R2, R3)
+// if (!p0) R1 = add(R2, R3)
+// if (p0.new) R1 = add(R2, R3)
+// if (!p0.new) R1 = add(R2, R3)
+// Note: New-value stores are not included here as in the current
+// implementation, we don't need to check their predicate sense.
+bool HexagonInstrInfo::isPredicated(const MachineInstr *MI) const {
+ const uint64_t F = MI->getDesc().TSFlags;
+ return (F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask;
}
-bool HexagonInstrInfo::isExtendable(const MachineInstr *MI) const {
- const MCInstrDesc &MID = MI->getDesc();
- const uint64_t F = MID.TSFlags;
- if ((F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask)
- return true;
- // TODO: This is largely obsolete now. Will need to be removed
- // in consecutive patches.
- switch(MI->getOpcode()) {
- // TFR_FI Remains a special case.
- case Hexagon::TFR_FI:
- return true;
- default:
- return false;
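+// Predicate MI in place using the condition in Cond. For example, given
+// Cond = {<opcode of a predicated branch>, %P0}, an "R2 = A2_add R3, R4" is
+// typically rewritten into its conditional form "if (P0) R2 = add(R3, R4)"
+// (A2_paddt), or the negated variant (A2_paddf) when the condition carries
+// a "not".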
+bool HexagonInstrInfo::PredicateInstruction(MachineInstr *MI,
+ ArrayRef<MachineOperand> Cond) const {
+ if (Cond.empty() || isNewValueJump(Cond[0].getImm()) ||
+ isEndLoopN(Cond[0].getImm())) {
+ DEBUG(dbgs() << "\nCannot predicate:"; MI->dump(););
+ return false;
}
- return false;
-}
+ int Opc = MI->getOpcode();
+ assert (isPredicable(MI) && "Expected predicable instruction");
+ bool invertJump = predOpcodeHasNot(Cond);
-// This returns true in two cases:
-// - The OP code itself indicates that this is an extended instruction.
-// - One of MOs has been marked with HMOTF_ConstExtended flag.
-bool HexagonInstrInfo::isExtended(const MachineInstr *MI) const {
- // First check if this is permanently extended op code.
- const uint64_t F = MI->getDesc().TSFlags;
- if ((F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask)
- return true;
- // Use MO operand flags to determine if one of MI's operands
- // has HMOTF_ConstExtended flag set.
- for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
- E = MI->operands_end(); I != E; ++I) {
- if (I->getTargetFlags() && HexagonII::HMOTF_ConstExtended)
- return true;
+ // We have to predicate MI "in place", i.e. after this function returns,
+  // MI will need to be transformed into a predicated form. To avoid
+  // complicated manipulations with the operands (handling tied operands,
+  // etc.), build a new temporary instruction, then overwrite MI with it.
+
+ MachineBasicBlock &B = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ unsigned PredOpc = getCondOpcode(Opc, invertJump);
+ MachineInstrBuilder T = BuildMI(B, MI, DL, get(PredOpc));
+ unsigned NOp = 0, NumOps = MI->getNumOperands();
+ while (NOp < NumOps) {
+ MachineOperand &Op = MI->getOperand(NOp);
+ if (!Op.isReg() || !Op.isDef() || Op.isImplicit())
+ break;
+ T.addOperand(Op);
+ NOp++;
}
- return false;
-}
-bool HexagonInstrInfo::isBranch (const MachineInstr *MI) const {
- return MI->getDesc().isBranch();
-}
+ unsigned PredReg, PredRegPos, PredRegFlags;
+ bool GotPredReg = getPredReg(Cond, PredReg, PredRegPos, PredRegFlags);
+ (void)GotPredReg;
+ assert(GotPredReg);
+ T.addReg(PredReg, PredRegFlags);
+ while (NOp < NumOps)
+ T.addOperand(MI->getOperand(NOp++));
-bool HexagonInstrInfo::isNewValueInst(const MachineInstr *MI) const {
- if (isNewValueJump(MI))
- return true;
+ MI->setDesc(get(PredOpc));
+ while (unsigned n = MI->getNumOperands())
+ MI->RemoveOperand(n-1);
+ for (unsigned i = 0, n = T->getNumOperands(); i < n; ++i)
+ MI->addOperand(T->getOperand(i));
- if (isNewValueStore(MI))
- return true;
+ MachineBasicBlock::instr_iterator TI = T->getIterator();
+ B.erase(TI);
- return false;
+ MachineRegisterInfo &MRI = B.getParent()->getRegInfo();
+ MRI.clearKillFlags(PredReg);
+ return true;
}
-bool HexagonInstrInfo::isNewValue(const MachineInstr* MI) const {
- const uint64_t F = MI->getDesc().TSFlags;
- return ((F >> HexagonII::NewValuePos) & HexagonII::NewValueMask);
-}
-bool HexagonInstrInfo::isNewValue(Opcode_t Opcode) const {
- const uint64_t F = get(Opcode).TSFlags;
- return ((F >> HexagonII::NewValuePos) & HexagonII::NewValueMask);
+bool HexagonInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
+ ArrayRef<MachineOperand> Pred2) const {
+ // TODO: Fix this
+ return false;
}
-bool HexagonInstrInfo::isSaveCalleeSavedRegsCall(const MachineInstr *MI) const {
- return MI->getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4;
+
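+// Returns true (and records the operand) when MI defines a predicate
+// register (P0-P3), e.g. "p0 = cmp.eq(r1, r2)" defines P0.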
+bool HexagonInstrInfo::DefinesPredicate(MachineInstr *MI,
+ std::vector<MachineOperand> &Pred) const {
+ auto &HRI = getRegisterInfo();
+ for (unsigned oper = 0; oper < MI->getNumOperands(); ++oper) {
+ MachineOperand MO = MI->getOperand(oper);
+ if (MO.isReg() && MO.isDef()) {
+ const TargetRegisterClass* RC = HRI.getMinimalPhysRegClass(MO.getReg());
+ if (RC == &Hexagon::PredRegsRegClass) {
+ Pred.push_back(MO);
+ return true;
+ }
+ }
+ }
+ return false;
}
bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const {
@@ -875,10 +1123,21 @@ bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const {
return false;
const int Opc = MI->getOpcode();
+ int NumOperands = MI->getNumOperands();
+
+  // Keep a flag for up to 4 operands of the instruction, to indicate if
+ // that operand has been constant extended.
+ bool OpCExtended[4];
+ if (NumOperands > 4)
+ NumOperands = 4;
+
+ for (int i = 0; i < NumOperands; i++)
+ OpCExtended[i] = (isOperandExtended(MI, i) && isConstExtended(MI));
switch(Opc) {
case Hexagon::A2_tfrsi:
- return (isOperandExtended(MI, 1) && isConstExtended(MI)) || isInt<12>(MI->getOperand(1).getImm());
+ return (isOperandExtended(MI, 1) && isConstExtended(MI)) ||
+ isInt<12>(MI->getOperand(1).getImm());
case Hexagon::S2_storerd_io:
return isShiftedUInt<6,3>(MI->getOperand(1).getImm());
@@ -926,8 +1185,8 @@ bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const {
case Hexagon::S4_storeirb_io:
case Hexagon::S4_storeirh_io:
case Hexagon::S4_storeiri_io:
- return (isUInt<6>(MI->getOperand(1).getImm()) &&
- isInt<6>(MI->getOperand(2).getImm()));
+ return (OpCExtended[1] || isUInt<6>(MI->getOperand(1).getImm())) &&
+ (OpCExtended[2] || isInt<6>(MI->getOperand(2).getImm()));
case Hexagon::A2_addi:
return isInt<8>(MI->getOperand(2).getImm());
@@ -944,269 +1203,1117 @@ bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const {
return true;
}
-// This function performs the following inversiones:
-//
-// cPt ---> cNotPt
-// cNotPt ---> cPt
-//
-unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const {
- int InvPredOpcode;
- InvPredOpcode = isPredicatedTrue(Opc) ? Hexagon::getFalsePredOpcode(Opc)
- : Hexagon::getTruePredOpcode(Opc);
- if (InvPredOpcode >= 0) // Valid instruction with the inverted predicate.
- return InvPredOpcode;
- switch(Opc) {
- default: llvm_unreachable("Unexpected predicated instruction");
- case Hexagon::C2_ccombinewt:
- return Hexagon::C2_ccombinewf;
+bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
+ const MachineBasicBlock *MBB, const MachineFunction &MF) const {
+  // Debug info is never a scheduling boundary. It's necessary to be explicit,
+  // otherwise the instruction following a dbg_value could be treated as a
+  // scheduling hazard, which is wrong. The boundary should be the actual
+  // instruction preceding the dbg_value instruction(s), just like it is
+  // when debug info is not present.
+ if (MI->isDebugValue())
+ return false;
+
+ // Throwing call is a boundary.
+ if (MI->isCall()) {
+ // If any of the block's successors is a landing pad, this could be a
+ // throwing call.
+ for (auto I : MBB->successors())
+ if (I->isEHPad())
+ return true;
+ }
+
+  // Don't mess around with no-return calls.
+ if (MI->getOpcode() == Hexagon::CALLv3nr)
+ return true;
+
+ // Terminators and labels can't be scheduled around.
+ if (MI->getDesc().isTerminator() || MI->isPosition())
+ return true;
+
+ if (MI->isInlineAsm() && !ScheduleInlineAsm)
+ return true;
+
+ return false;
+}
+
+
+/// Measure the specified inline asm to determine an approximation of its
+/// length.
+/// Comments (which run till the next SeparatorString or newline) do not
+/// count as an instruction.
+/// Any other non-whitespace text is considered an instruction, with
+/// multiple instructions separated by SeparatorString or newlines.
+/// Variable-length instructions are not handled here; this function
+/// may be overloaded in the target code to do that.
+/// Hexagon counts the number of ##'s and adjusts for that many
+/// constant extenders.
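+/// For example, with a 4-byte maximum instruction length,
+///   "r0 = ##0x12345678\n\tr1 = add(r1, r0)"
+/// counts as two instructions (8 bytes) plus one "##" extender (4 bytes),
+/// giving a length of 12.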
+unsigned HexagonInstrInfo::getInlineAsmLength(const char *Str,
+ const MCAsmInfo &MAI) const {
+ StringRef AStr(Str);
+ // Count the number of instructions in the asm.
+ bool atInsnStart = true;
+ unsigned Length = 0;
+ for (; *Str; ++Str) {
+ if (*Str == '\n' || strncmp(Str, MAI.getSeparatorString(),
+ strlen(MAI.getSeparatorString())) == 0)
+ atInsnStart = true;
+ if (atInsnStart && !std::isspace(static_cast<unsigned char>(*Str))) {
+ Length += MAI.getMaxInstLength();
+ atInsnStart = false;
+ }
+ if (atInsnStart && strncmp(Str, MAI.getCommentString(),
+ strlen(MAI.getCommentString())) == 0)
+ atInsnStart = false;
+ }
+
+ // Add to size number of constant extenders seen * 4.
+ StringRef Occ("##");
+ Length += AStr.count(Occ)*4;
+ return Length;
+}
+
+
+ScheduleHazardRecognizer*
+HexagonInstrInfo::CreateTargetPostRAHazardRecognizer(
+ const InstrItineraryData *II, const ScheduleDAG *DAG) const {
+ return TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG);
+}
+
+
+/// \brief For a comparison instruction, return the source registers in
+/// \p SrcReg and \p SrcReg2 if it has two register operands, and the value
+/// it compares against in \p Value. Return true if the comparison
+/// instruction can be analyzed.
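+/// For example, "p0 = cmp.gtu(r1, #10)" (C2_cmpgtui) yields SrcReg = R1,
+/// SrcReg2 = 0, Value = 10 and Mask = ~0, while a byte compare such as
+/// A4_cmpbgtu sets Mask = 0xFF and returns both source registers.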
+bool HexagonInstrInfo::analyzeCompare(const MachineInstr *MI,
+ unsigned &SrcReg, unsigned &SrcReg2, int &Mask, int &Value) const {
+ unsigned Opc = MI->getOpcode();
+
+ // Set mask and the first source register.
+ switch (Opc) {
+ case Hexagon::C2_cmpeq:
+ case Hexagon::C2_cmpeqp:
+ case Hexagon::C2_cmpgt:
+ case Hexagon::C2_cmpgtp:
+ case Hexagon::C2_cmpgtu:
+ case Hexagon::C2_cmpgtup:
+ case Hexagon::C4_cmpneq:
+ case Hexagon::C4_cmplte:
+ case Hexagon::C4_cmplteu:
+ case Hexagon::C2_cmpeqi:
+ case Hexagon::C2_cmpgti:
+ case Hexagon::C2_cmpgtui:
+ case Hexagon::C4_cmpneqi:
+ case Hexagon::C4_cmplteui:
+ case Hexagon::C4_cmpltei:
+ SrcReg = MI->getOperand(1).getReg();
+ Mask = ~0;
+ break;
+ case Hexagon::A4_cmpbeq:
+ case Hexagon::A4_cmpbgt:
+ case Hexagon::A4_cmpbgtu:
+ case Hexagon::A4_cmpbeqi:
+ case Hexagon::A4_cmpbgti:
+ case Hexagon::A4_cmpbgtui:
+ SrcReg = MI->getOperand(1).getReg();
+ Mask = 0xFF;
+ break;
+ case Hexagon::A4_cmpheq:
+ case Hexagon::A4_cmphgt:
+ case Hexagon::A4_cmphgtu:
+ case Hexagon::A4_cmpheqi:
+ case Hexagon::A4_cmphgti:
+ case Hexagon::A4_cmphgtui:
+ SrcReg = MI->getOperand(1).getReg();
+ Mask = 0xFFFF;
+ break;
+ }
+
+ // Set the value/second source register.
+ switch (Opc) {
+ case Hexagon::C2_cmpeq:
+ case Hexagon::C2_cmpeqp:
+ case Hexagon::C2_cmpgt:
+ case Hexagon::C2_cmpgtp:
+ case Hexagon::C2_cmpgtu:
+ case Hexagon::C2_cmpgtup:
+ case Hexagon::A4_cmpbeq:
+ case Hexagon::A4_cmpbgt:
+ case Hexagon::A4_cmpbgtu:
+ case Hexagon::A4_cmpheq:
+ case Hexagon::A4_cmphgt:
+ case Hexagon::A4_cmphgtu:
+ case Hexagon::C4_cmpneq:
+ case Hexagon::C4_cmplte:
+ case Hexagon::C4_cmplteu:
+ SrcReg2 = MI->getOperand(2).getReg();
+ return true;
+
+ case Hexagon::C2_cmpeqi:
+ case Hexagon::C2_cmpgtui:
+ case Hexagon::C2_cmpgti:
+ case Hexagon::C4_cmpneqi:
+ case Hexagon::C4_cmplteui:
+ case Hexagon::C4_cmpltei:
+ case Hexagon::A4_cmpbeqi:
+ case Hexagon::A4_cmpbgti:
+ case Hexagon::A4_cmpbgtui:
+ case Hexagon::A4_cmpheqi:
+ case Hexagon::A4_cmphgti:
+ case Hexagon::A4_cmphgtui:
+ SrcReg2 = 0;
+ Value = MI->getOperand(2).getImm();
+ return true;
+ }
+
+ return false;
+}
+
+
+unsigned HexagonInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+ const MachineInstr *MI, unsigned *PredCost) const {
+ return getInstrTimingClassLatency(ItinData, MI);
+}
+
+
+DFAPacketizer *HexagonInstrInfo::CreateTargetScheduleState(
+ const TargetSubtargetInfo &STI) const {
+ const InstrItineraryData *II = STI.getInstrItineraryData();
+ return static_cast<const HexagonSubtarget&>(STI).createDFAPacketizer(II);
+}
+
+
+// Inspired by this pair:
+// %R13<def> = L2_loadri_io %R29, 136; mem:LD4[FixedStack0]
+// S2_storeri_io %R29, 132, %R1<kill>; flags: mem:ST4[FixedStack1]
+// Currently AA considers the addresses in these instructions to be aliasing.
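+// With a common base register and known offsets and sizes, the two accesses
+// above are disjoint: the 4-byte store at offset 132 ends before the 4-byte
+// load at offset 136 begins (136 - 132 >= 4), so this returns true for such
+// a pair.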
+bool HexagonInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
+ MachineInstr *MIb, AliasAnalysis *AA) const {
+ int OffsetA = 0, OffsetB = 0;
+ unsigned SizeA = 0, SizeB = 0;
+
+ if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects() ||
+      MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
+ return false;
+
+  // Instructions that are pure loads (not load-and-store operations like
+  // memops) are not dependent.
+ if (MIa->mayLoad() && !isMemOp(MIa) && MIb->mayLoad() && !isMemOp(MIb))
+ return true;
+
+ // Get base, offset, and access size in MIa.
+ unsigned BaseRegA = getBaseAndOffset(MIa, OffsetA, SizeA);
+ if (!BaseRegA || !SizeA)
+ return false;
+
+ // Get base, offset, and access size in MIb.
+ unsigned BaseRegB = getBaseAndOffset(MIb, OffsetB, SizeB);
+ if (!BaseRegB || !SizeB)
+ return false;
+
+ if (BaseRegA != BaseRegB)
+ return false;
+
+ // This is a mem access with the same base register and known offsets from it.
+ // Reason about it.
+ if (OffsetA > OffsetB) {
+ uint64_t offDiff = (uint64_t)((int64_t)OffsetA - (int64_t)OffsetB);
+ return (SizeB <= offDiff);
+ } else if (OffsetA < OffsetB) {
+ uint64_t offDiff = (uint64_t)((int64_t)OffsetB - (int64_t)OffsetA);
+ return (SizeA <= offDiff);
+ }
+
+ return false;
+}
+
+
+unsigned HexagonInstrInfo::createVR(MachineFunction* MF, MVT VT) const {
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const TargetRegisterClass *TRC;
+ if (VT == MVT::i1) {
+ TRC = &Hexagon::PredRegsRegClass;
+ } else if (VT == MVT::i32 || VT == MVT::f32) {
+ TRC = &Hexagon::IntRegsRegClass;
+ } else if (VT == MVT::i64 || VT == MVT::f64) {
+ TRC = &Hexagon::DoubleRegsRegClass;
+ } else {
+ llvm_unreachable("Cannot handle this register class");
+ }
+
+ unsigned NewReg = MRI.createVirtualRegister(TRC);
+ return NewReg;
+}
+
+
+bool HexagonInstrInfo::isAbsoluteSet(const MachineInstr* MI) const {
+ return (getAddrMode(MI) == HexagonII::AbsoluteSet);
+}
+
+
+bool HexagonInstrInfo::isAccumulator(const MachineInstr *MI) const {
+ const uint64_t F = MI->getDesc().TSFlags;
+ return((F >> HexagonII::AccumulatorPos) & HexagonII::AccumulatorMask);
+}
+
+
+bool HexagonInstrInfo::isComplex(const MachineInstr *MI) const {
+ const MachineFunction *MF = MI->getParent()->getParent();
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+
+ if (!(isTC1(MI))
+ && !(QII->isTC2Early(MI))
+ && !(MI->getDesc().mayLoad())
+ && !(MI->getDesc().mayStore())
+ && (MI->getDesc().getOpcode() != Hexagon::S2_allocframe)
+ && (MI->getDesc().getOpcode() != Hexagon::L2_deallocframe)
+ && !(QII->isMemOp(MI))
+ && !(MI->isBranch())
+ && !(MI->isReturn())
+ && !MI->isCall())
+ return true;
+
+ return false;
+}
+
+
+// Return true if the instruction is a compound branch instruction.
+bool HexagonInstrInfo::isCompoundBranchInstr(const MachineInstr *MI) const {
+ return (getType(MI) == HexagonII::TypeCOMPOUND && MI->isBranch());
+}
+
+
+bool HexagonInstrInfo::isCondInst(const MachineInstr *MI) const {
+ return (MI->isBranch() && isPredicated(MI)) ||
+ isConditionalTransfer(MI) ||
+ isConditionalALU32(MI) ||
+ isConditionalLoad(MI) ||
+ // Predicated stores which don't have a .new on any operands.
+ (MI->mayStore() && isPredicated(MI) && !isNewValueStore(MI) &&
+ !isPredicatedNew(MI));
+}
+
+
+bool HexagonInstrInfo::isConditionalALU32(const MachineInstr* MI) const {
+ switch (MI->getOpcode()) {
+ case Hexagon::A2_paddf:
+ case Hexagon::A2_paddfnew:
+ case Hexagon::A2_paddif:
+ case Hexagon::A2_paddifnew:
+ case Hexagon::A2_paddit:
+ case Hexagon::A2_padditnew:
+ case Hexagon::A2_paddt:
+ case Hexagon::A2_paddtnew:
+ case Hexagon::A2_pandf:
+ case Hexagon::A2_pandfnew:
+ case Hexagon::A2_pandt:
+ case Hexagon::A2_pandtnew:
+ case Hexagon::A2_porf:
+ case Hexagon::A2_porfnew:
+ case Hexagon::A2_port:
+ case Hexagon::A2_portnew:
+ case Hexagon::A2_psubf:
+ case Hexagon::A2_psubfnew:
+ case Hexagon::A2_psubt:
+ case Hexagon::A2_psubtnew:
+ case Hexagon::A2_pxorf:
+ case Hexagon::A2_pxorfnew:
+ case Hexagon::A2_pxort:
+ case Hexagon::A2_pxortnew:
+ case Hexagon::A4_paslhf:
+ case Hexagon::A4_paslhfnew:
+ case Hexagon::A4_paslht:
+ case Hexagon::A4_paslhtnew:
+ case Hexagon::A4_pasrhf:
+ case Hexagon::A4_pasrhfnew:
+ case Hexagon::A4_pasrht:
+ case Hexagon::A4_pasrhtnew:
+ case Hexagon::A4_psxtbf:
+ case Hexagon::A4_psxtbfnew:
+ case Hexagon::A4_psxtbt:
+ case Hexagon::A4_psxtbtnew:
+ case Hexagon::A4_psxthf:
+ case Hexagon::A4_psxthfnew:
+ case Hexagon::A4_psxtht:
+ case Hexagon::A4_psxthtnew:
+ case Hexagon::A4_pzxtbf:
+ case Hexagon::A4_pzxtbfnew:
+ case Hexagon::A4_pzxtbt:
+ case Hexagon::A4_pzxtbtnew:
+ case Hexagon::A4_pzxthf:
+ case Hexagon::A4_pzxthfnew:
+ case Hexagon::A4_pzxtht:
+ case Hexagon::A4_pzxthtnew:
case Hexagon::C2_ccombinewf:
- return Hexagon::C2_ccombinewt;
+ case Hexagon::C2_ccombinewt:
+ return true;
+ }
+ return false;
+}
+
+
+// FIXME - Function name and its functionality don't match.
+// It should be renamed to hasPredNewOpcode().
+bool HexagonInstrInfo::isConditionalLoad(const MachineInstr* MI) const {
+ if (!MI->getDesc().mayLoad() || !isPredicated(MI))
+ return false;
+
+ int PNewOpcode = Hexagon::getPredNewOpcode(MI->getOpcode());
+ // Instruction with valid predicated-new opcode can be promoted to .new.
+ return PNewOpcode >= 0;
+}
+
- // Dealloc_return.
- case Hexagon::L4_return_t:
- return Hexagon::L4_return_f;
- case Hexagon::L4_return_f:
- return Hexagon::L4_return_t;
+// Returns true if an instruction is a conditional store.
+//
+// Note: It doesn't include conditional new-value stores as they can't be
+// converted to .new predicate.
+bool HexagonInstrInfo::isConditionalStore(const MachineInstr* MI) const {
+ switch (MI->getOpcode()) {
+ default: return false;
+ case Hexagon::S4_storeirbt_io:
+ case Hexagon::S4_storeirbf_io:
+ case Hexagon::S4_pstorerbt_rr:
+ case Hexagon::S4_pstorerbf_rr:
+ case Hexagon::S2_pstorerbt_io:
+ case Hexagon::S2_pstorerbf_io:
+ case Hexagon::S2_pstorerbt_pi:
+ case Hexagon::S2_pstorerbf_pi:
+ case Hexagon::S2_pstorerdt_io:
+ case Hexagon::S2_pstorerdf_io:
+ case Hexagon::S4_pstorerdt_rr:
+ case Hexagon::S4_pstorerdf_rr:
+ case Hexagon::S2_pstorerdt_pi:
+ case Hexagon::S2_pstorerdf_pi:
+ case Hexagon::S2_pstorerht_io:
+ case Hexagon::S2_pstorerhf_io:
+ case Hexagon::S4_storeirht_io:
+ case Hexagon::S4_storeirhf_io:
+ case Hexagon::S4_pstorerht_rr:
+ case Hexagon::S4_pstorerhf_rr:
+ case Hexagon::S2_pstorerht_pi:
+ case Hexagon::S2_pstorerhf_pi:
+ case Hexagon::S2_pstorerit_io:
+ case Hexagon::S2_pstorerif_io:
+ case Hexagon::S4_storeirit_io:
+ case Hexagon::S4_storeirif_io:
+ case Hexagon::S4_pstorerit_rr:
+ case Hexagon::S4_pstorerif_rr:
+ case Hexagon::S2_pstorerit_pi:
+ case Hexagon::S2_pstorerif_pi:
+
+ // V4 global address store before promoting to dot new.
+ case Hexagon::S4_pstorerdt_abs:
+ case Hexagon::S4_pstorerdf_abs:
+ case Hexagon::S4_pstorerbt_abs:
+ case Hexagon::S4_pstorerbf_abs:
+ case Hexagon::S4_pstorerht_abs:
+ case Hexagon::S4_pstorerhf_abs:
+ case Hexagon::S4_pstorerit_abs:
+ case Hexagon::S4_pstorerif_abs:
+ return true;
+
+  // Predicated new-value stores (i.e. if (p0) memw(..)=r0.new) are excluded
+  // from the "Conditional Store" list because a predicated new-value store
+  // would NOT be promoted to a double dot-new store.
+  // This function returns true for those stores that are predicated but not
+  // yet promoted to predicate dot-new instructions.
}
}
-// New Value Store instructions.
-bool HexagonInstrInfo::isNewValueStore(const MachineInstr *MI) const {
+
+bool HexagonInstrInfo::isConditionalTransfer(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ case Hexagon::A2_tfrt:
+ case Hexagon::A2_tfrf:
+ case Hexagon::C2_cmoveit:
+ case Hexagon::C2_cmoveif:
+ case Hexagon::A2_tfrtnew:
+ case Hexagon::A2_tfrfnew:
+ case Hexagon::C2_cmovenewit:
+ case Hexagon::C2_cmovenewif:
+ case Hexagon::A2_tfrpt:
+ case Hexagon::A2_tfrpf:
+ return true;
+
+ default:
+ return false;
+ }
+ return false;
+}
+
+
+// TODO: In order to have isExtendable for fpimm/f32Ext, we need to handle
+// isFPImm and later getFPImm as well.
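+// For example, an A2_tfrsi whose extendable immediate does not fit the
+// operand's native range (getMinValue/getMaxValue), or whose operand is a
+// global address, block address, jump-table or constant-pool index, is
+// treated as constant-extended.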
+bool HexagonInstrInfo::isConstExtended(const MachineInstr *MI) const {
const uint64_t F = MI->getDesc().TSFlags;
+ unsigned isExtended = (F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask;
+ if (isExtended) // Instruction must be extended.
+ return true;
+
+ unsigned isExtendable =
+ (F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask;
+ if (!isExtendable)
+ return false;
+
+ if (MI->isCall())
+ return false;
+
+ short ExtOpNum = getCExtOpNum(MI);
+ const MachineOperand &MO = MI->getOperand(ExtOpNum);
+ // Use MO operand flags to determine if MO
+ // has the HMOTF_ConstExtended flag set.
+  if (MO.getTargetFlags() & HexagonII::HMOTF_ConstExtended)
+ return true;
+ // If this is a Machine BB address we are talking about, and it is
+ // not marked as extended, say so.
+ if (MO.isMBB())
+ return false;
+
+ // We could be using an instruction with an extendable immediate and shoehorn
+ // a global address into it. If it is a global address it will be constant
+ // extended. We do this for COMBINE.
+ // We currently only handle isGlobal() because it is the only kind of
+ // object we are going to end up with here for now.
+ // In the future we probably should add isSymbol(), etc.
+ if (MO.isGlobal() || MO.isSymbol() || MO.isBlockAddress() ||
+ MO.isJTI() || MO.isCPI())
+ return true;
- return ((F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask);
+  // If the extendable operand is not of 'Immediate' type, the instruction
+  // should have the 'isExtended' flag set.
+ assert(MO.isImm() && "Extendable operand must be Immediate type");
+
+ int MinValue = getMinValue(MI);
+ int MaxValue = getMaxValue(MI);
+ int ImmValue = MO.getImm();
+
+ return (ImmValue < MinValue || ImmValue > MaxValue);
}
-bool HexagonInstrInfo::isNewValueStore(unsigned Opcode) const {
+
+bool HexagonInstrInfo::isDeallocRet(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ case Hexagon::L4_return :
+ case Hexagon::L4_return_t :
+ case Hexagon::L4_return_f :
+ case Hexagon::L4_return_tnew_pnt :
+ case Hexagon::L4_return_fnew_pnt :
+ case Hexagon::L4_return_tnew_pt :
+ case Hexagon::L4_return_fnew_pt :
+ return true;
+ }
+ return false;
+}
+
+
+// Return true when ConsMI uses a register defined by ProdMI.
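+// Sub-registers of a double register count as well: e.g. a producer that
+// defines D1 feeds a consumer that reads R2 or R3.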
+bool HexagonInstrInfo::isDependent(const MachineInstr *ProdMI,
+ const MachineInstr *ConsMI) const {
+ const MCInstrDesc &ProdMCID = ProdMI->getDesc();
+ if (!ProdMCID.getNumDefs())
+ return false;
+
+ auto &HRI = getRegisterInfo();
+
+ SmallVector<unsigned, 4> DefsA;
+ SmallVector<unsigned, 4> DefsB;
+ SmallVector<unsigned, 8> UsesA;
+ SmallVector<unsigned, 8> UsesB;
+
+ parseOperands(ProdMI, DefsA, UsesA);
+ parseOperands(ConsMI, DefsB, UsesB);
+
+ for (auto &RegA : DefsA)
+ for (auto &RegB : UsesB) {
+ // True data dependency.
+ if (RegA == RegB)
+ return true;
+
+ if (Hexagon::DoubleRegsRegClass.contains(RegA))
+ for (MCSubRegIterator SubRegs(RegA, &HRI); SubRegs.isValid(); ++SubRegs)
+ if (RegB == *SubRegs)
+ return true;
+
+ if (Hexagon::DoubleRegsRegClass.contains(RegB))
+ for (MCSubRegIterator SubRegs(RegB, &HRI); SubRegs.isValid(); ++SubRegs)
+ if (RegA == *SubRegs)
+ return true;
+ }
+
+ return false;
+}
+
+
+// Returns true if the instruction is already a .cur.
+bool HexagonInstrInfo::isDotCurInst(const MachineInstr* MI) const {
+ switch (MI->getOpcode()) {
+ case Hexagon::V6_vL32b_cur_pi:
+ case Hexagon::V6_vL32b_cur_ai:
+ case Hexagon::V6_vL32b_cur_pi_128B:
+ case Hexagon::V6_vL32b_cur_ai_128B:
+ return true;
+ }
+ return false;
+}
+
+
+// Returns true if the instruction is a dot-new insn, whether it is a
+// predicated dot-new or a register (new-value) dot-new.
+bool HexagonInstrInfo::isDotNewInst(const MachineInstr* MI) const {
+ if (isNewValueInst(MI) ||
+ (isPredicated(MI) && isPredicatedNew(MI)))
+ return true;
+
+ return false;
+}
+
+
+/// Symmetrical. See if these two instructions are fit for duplex pair.
+bool HexagonInstrInfo::isDuplexPair(const MachineInstr *MIa,
+ const MachineInstr *MIb) const {
+ HexagonII::SubInstructionGroup MIaG = getDuplexCandidateGroup(MIa);
+ HexagonII::SubInstructionGroup MIbG = getDuplexCandidateGroup(MIb);
+ return (isDuplexPairMatch(MIaG, MIbG) || isDuplexPairMatch(MIbG, MIaG));
+}
+
+
+bool HexagonInstrInfo::isEarlySourceInstr(const MachineInstr *MI) const {
+ if (!MI)
+ return false;
+
+ if (MI->mayLoad() || MI->mayStore() || MI->isCompare())
+ return true;
+
+ // Multiply
+ unsigned SchedClass = MI->getDesc().getSchedClass();
+ if (SchedClass == Hexagon::Sched::M_tc_3or4x_SLOT23)
+ return true;
+ return false;
+}
+
+
+bool HexagonInstrInfo::isEndLoopN(unsigned Opcode) const {
+ return (Opcode == Hexagon::ENDLOOP0 ||
+ Opcode == Hexagon::ENDLOOP1);
+}
+
+
+bool HexagonInstrInfo::isExpr(unsigned OpType) const {
+ switch(OpType) {
+ case MachineOperand::MO_MachineBasicBlock:
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ExternalSymbol:
+ case MachineOperand::MO_JumpTableIndex:
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_BlockAddress:
+ return true;
+ default:
+ return false;
+ }
+}
+
+
+bool HexagonInstrInfo::isExtendable(const MachineInstr *MI) const {
+ const MCInstrDesc &MID = MI->getDesc();
+ const uint64_t F = MID.TSFlags;
+ if ((F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask)
+ return true;
+
+  // TODO: This is largely obsolete now. Will need to be removed
+  // in subsequent patches.
+ switch(MI->getOpcode()) {
+ // TFR_FI Remains a special case.
+ case Hexagon::TFR_FI:
+ return true;
+ default:
+ return false;
+ }
+ return false;
+}
+
+
+// This returns true in two cases:
+// - The OP code itself indicates that this is an extended instruction.
+// - One of MOs has been marked with HMOTF_ConstExtended flag.
+bool HexagonInstrInfo::isExtended(const MachineInstr *MI) const {
+ // First check if this is permanently extended op code.
+ const uint64_t F = MI->getDesc().TSFlags;
+ if ((F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask)
+ return true;
+ // Use MO operand flags to determine if one of MI's operands
+ // has HMOTF_ConstExtended flag set.
+ for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
+ E = MI->operands_end(); I != E; ++I) {
+    if (I->getTargetFlags() & HexagonII::HMOTF_ConstExtended)
+ return true;
+ }
+ return false;
+}
+
+
+bool HexagonInstrInfo::isFloat(const MachineInstr *MI) const {
+ unsigned Opcode = MI->getOpcode();
const uint64_t F = get(Opcode).TSFlags;
+ return (F >> HexagonII::FPPos) & HexagonII::FPMask;
+}
- return ((F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask);
+
+// No V60 HVX VMEM with A_INDIRECT.
+bool HexagonInstrInfo::isHVXMemWithAIndirect(const MachineInstr *I,
+ const MachineInstr *J) const {
+ if (!isV60VectorInstruction(I))
+ return false;
+ if (!I->mayLoad() && !I->mayStore())
+ return false;
+ return J->isIndirectBranch() || isIndirectCall(J) || isIndirectL4Return(J);
}
-int HexagonInstrInfo::getCondOpcode(int Opc, bool invertPredicate) const {
- enum Hexagon::PredSense inPredSense;
- inPredSense = invertPredicate ? Hexagon::PredSense_false :
- Hexagon::PredSense_true;
- int CondOpcode = Hexagon::getPredOpcode(Opc, inPredSense);
- if (CondOpcode >= 0) // Valid Conditional opcode/instruction
- return CondOpcode;
- // This switch case will be removed once all the instructions have been
- // modified to use relation maps.
- switch(Opc) {
- case Hexagon::TFRI_f:
- return !invertPredicate ? Hexagon::TFRI_cPt_f :
- Hexagon::TFRI_cNotPt_f;
- case Hexagon::A2_combinew:
- return !invertPredicate ? Hexagon::C2_ccombinewt :
- Hexagon::C2_ccombinewf;
+bool HexagonInstrInfo::isIndirectCall(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ case Hexagon::J2_callr :
+ case Hexagon::J2_callrf :
+ case Hexagon::J2_callrt :
+ return true;
+ }
+ return false;
+}
- // DEALLOC_RETURN.
- case Hexagon::L4_return:
- return !invertPredicate ? Hexagon::L4_return_t:
- Hexagon::L4_return_f;
+
+bool HexagonInstrInfo::isIndirectL4Return(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ case Hexagon::L4_return :
+ case Hexagon::L4_return_t :
+ case Hexagon::L4_return_f :
+ case Hexagon::L4_return_fnew_pnt :
+ case Hexagon::L4_return_fnew_pt :
+ case Hexagon::L4_return_tnew_pnt :
+ case Hexagon::L4_return_tnew_pt :
+ return true;
}
- llvm_unreachable("Unexpected predicable instruction");
+ return false;
}
-bool HexagonInstrInfo::
-PredicateInstruction(MachineInstr *MI,
- ArrayRef<MachineOperand> Cond) const {
- if (Cond.empty() || isEndLoopN(Cond[0].getImm())) {
- DEBUG(dbgs() << "\nCannot predicate:"; MI->dump(););
+bool HexagonInstrInfo::isJumpR(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ case Hexagon::J2_jumpr :
+ case Hexagon::J2_jumprt :
+ case Hexagon::J2_jumprf :
+ case Hexagon::J2_jumprtnewpt :
+ case Hexagon::J2_jumprfnewpt :
+ case Hexagon::J2_jumprtnew :
+ case Hexagon::J2_jumprfnew :
+ return true;
+ }
+ return false;
+}
+
+
+// Return true if a given MI can accommodate the given offset.
+// Use an abs estimate as opposed to the exact number.
+// TODO: This will need to be changed to use MC level
+// definition of instruction extendable field size.
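+// For example, an unconditional J2_jump reaches any offset representable in
+// a signed 24-bit field, a conditional J2_jumpt/J2_jumpf is limited to a
+// signed 17-bit field, and a new-value jump to a signed 11-bit field.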
+bool HexagonInstrInfo::isJumpWithinBranchRange(const MachineInstr *MI,
+ unsigned offset) const {
+  // This selection of jump instructions matches what AnalyzeBranch can
+  // parse, plus NVJ (new-value jumps).
+ if (isNewValueJump(MI)) // r9:2
+ return isInt<11>(offset);
+
+ switch (MI->getOpcode()) {
+ // Still missing Jump to address condition on register value.
+ default:
return false;
+ case Hexagon::J2_jump: // bits<24> dst; // r22:2
+ case Hexagon::J2_call:
+ case Hexagon::CALLv3nr:
+ return isInt<24>(offset);
+ case Hexagon::J2_jumpt: //bits<17> dst; // r15:2
+ case Hexagon::J2_jumpf:
+ case Hexagon::J2_jumptnew:
+ case Hexagon::J2_jumptnewpt:
+ case Hexagon::J2_jumpfnew:
+ case Hexagon::J2_jumpfnewpt:
+ case Hexagon::J2_callt:
+ case Hexagon::J2_callf:
+ return isInt<17>(offset);
+ case Hexagon::J2_loop0i:
+ case Hexagon::J2_loop0iext:
+ case Hexagon::J2_loop0r:
+ case Hexagon::J2_loop0rext:
+ case Hexagon::J2_loop1i:
+ case Hexagon::J2_loop1iext:
+ case Hexagon::J2_loop1r:
+ case Hexagon::J2_loop1rext:
+ return isInt<9>(offset);
+ // TODO: Add all the compound branches here. Can we do this in Relation model?
+ case Hexagon::J4_cmpeqi_tp0_jump_nt:
+ case Hexagon::J4_cmpeqi_tp1_jump_nt:
+ return isInt<11>(offset);
}
- int Opc = MI->getOpcode();
- assert (isPredicable(MI) && "Expected predicable instruction");
- bool invertJump = predOpcodeHasNot(Cond);
+}
- // We have to predicate MI "in place", i.e. after this function returns,
- // MI will need to be transformed into a predicated form. To avoid com-
- // plicated manipulations with the operands (handling tied operands,
- // etc.), build a new temporary instruction, then overwrite MI with it.
- MachineBasicBlock &B = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
- unsigned PredOpc = getCondOpcode(Opc, invertJump);
- MachineInstrBuilder T = BuildMI(B, MI, DL, get(PredOpc));
- unsigned NOp = 0, NumOps = MI->getNumOperands();
- while (NOp < NumOps) {
- MachineOperand &Op = MI->getOperand(NOp);
- if (!Op.isReg() || !Op.isDef() || Op.isImplicit())
- break;
- T.addOperand(Op);
- NOp++;
+bool HexagonInstrInfo::isLateInstrFeedsEarlyInstr(const MachineInstr *LRMI,
+ const MachineInstr *ESMI) const {
+ if (!LRMI || !ESMI)
+ return false;
+
+ bool isLate = isLateResultInstr(LRMI);
+ bool isEarly = isEarlySourceInstr(ESMI);
+
+ DEBUG(dbgs() << "V60" << (isLate ? "-LR " : " -- "));
+ DEBUG(LRMI->dump());
+ DEBUG(dbgs() << "V60" << (isEarly ? "-ES " : " -- "));
+ DEBUG(ESMI->dump());
+
+ if (isLate && isEarly) {
+ DEBUG(dbgs() << "++Is Late Result feeding Early Source\n");
+ return true;
}
- unsigned PredReg, PredRegPos, PredRegFlags;
- bool GotPredReg = getPredReg(Cond, PredReg, PredRegPos, PredRegFlags);
- (void)GotPredReg;
- assert(GotPredReg);
- T.addReg(PredReg, PredRegFlags);
- while (NOp < NumOps)
- T.addOperand(MI->getOperand(NOp++));
+ return false;
+}
- MI->setDesc(get(PredOpc));
- while (unsigned n = MI->getNumOperands())
- MI->RemoveOperand(n-1);
- for (unsigned i = 0, n = T->getNumOperands(); i < n; ++i)
- MI->addOperand(T->getOperand(i));
- MachineBasicBlock::instr_iterator TI = &*T;
- B.erase(TI);
+bool HexagonInstrInfo::isLateResultInstr(const MachineInstr *MI) const {
+ if (!MI)
+ return false;
- MachineRegisterInfo &MRI = B.getParent()->getRegInfo();
- MRI.clearKillFlags(PredReg);
+ switch (MI->getOpcode()) {
+ case TargetOpcode::EXTRACT_SUBREG:
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::SUBREG_TO_REG:
+ case TargetOpcode::REG_SEQUENCE:
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::COPY:
+ case TargetOpcode::INLINEASM:
+ case TargetOpcode::PHI:
+ return false;
+ default:
+ break;
+ }
+ unsigned SchedClass = MI->getDesc().getSchedClass();
+
+ switch (SchedClass) {
+ case Hexagon::Sched::ALU32_2op_tc_1_SLOT0123:
+ case Hexagon::Sched::ALU32_3op_tc_1_SLOT0123:
+ case Hexagon::Sched::ALU32_ADDI_tc_1_SLOT0123:
+ case Hexagon::Sched::ALU64_tc_1_SLOT23:
+ case Hexagon::Sched::EXTENDER_tc_1_SLOT0123:
+ case Hexagon::Sched::S_2op_tc_1_SLOT23:
+ case Hexagon::Sched::S_3op_tc_1_SLOT23:
+ case Hexagon::Sched::V2LDST_tc_ld_SLOT01:
+ case Hexagon::Sched::V2LDST_tc_st_SLOT0:
+ case Hexagon::Sched::V2LDST_tc_st_SLOT01:
+ case Hexagon::Sched::V4LDST_tc_ld_SLOT01:
+ case Hexagon::Sched::V4LDST_tc_st_SLOT0:
+ case Hexagon::Sched::V4LDST_tc_st_SLOT01:
+ return false;
+ }
return true;
}
-bool
-HexagonInstrInfo::
-isProfitableToIfCvt(MachineBasicBlock &MBB,
- unsigned NumCycles,
- unsigned ExtraPredCycles,
- const BranchProbability &Probability) const {
- return true;
+bool HexagonInstrInfo::isLateSourceInstr(const MachineInstr *MI) const {
+ if (!MI)
+ return false;
+
+  // Instructions with iclass A_CVI_VX and attribute A_CVI_LATE use a multiply
+ // resource, but all operands can be received late like an ALU instruction.
+ return MI->getDesc().getSchedClass() == Hexagon::Sched::CVI_VX_LATE;
}
-bool
-HexagonInstrInfo::
-isProfitableToIfCvt(MachineBasicBlock &TMBB,
- unsigned NumTCycles,
- unsigned ExtraTCycles,
- MachineBasicBlock &FMBB,
- unsigned NumFCycles,
- unsigned ExtraFCycles,
- const BranchProbability &Probability) const {
- return true;
+bool HexagonInstrInfo::isLoopN(const MachineInstr *MI) const {
+ unsigned Opcode = MI->getOpcode();
+ return Opcode == Hexagon::J2_loop0i ||
+ Opcode == Hexagon::J2_loop0r ||
+ Opcode == Hexagon::J2_loop0iext ||
+ Opcode == Hexagon::J2_loop0rext ||
+ Opcode == Hexagon::J2_loop1i ||
+ Opcode == Hexagon::J2_loop1r ||
+ Opcode == Hexagon::J2_loop1iext ||
+ Opcode == Hexagon::J2_loop1rext;
+}
+
+
+bool HexagonInstrInfo::isMemOp(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ default: return false;
+ case Hexagon::L4_iadd_memopw_io :
+ case Hexagon::L4_isub_memopw_io :
+ case Hexagon::L4_add_memopw_io :
+ case Hexagon::L4_sub_memopw_io :
+ case Hexagon::L4_and_memopw_io :
+ case Hexagon::L4_or_memopw_io :
+ case Hexagon::L4_iadd_memoph_io :
+ case Hexagon::L4_isub_memoph_io :
+ case Hexagon::L4_add_memoph_io :
+ case Hexagon::L4_sub_memoph_io :
+ case Hexagon::L4_and_memoph_io :
+ case Hexagon::L4_or_memoph_io :
+ case Hexagon::L4_iadd_memopb_io :
+ case Hexagon::L4_isub_memopb_io :
+ case Hexagon::L4_add_memopb_io :
+ case Hexagon::L4_sub_memopb_io :
+ case Hexagon::L4_and_memopb_io :
+ case Hexagon::L4_or_memopb_io :
+ case Hexagon::L4_ior_memopb_io:
+ case Hexagon::L4_ior_memoph_io:
+ case Hexagon::L4_ior_memopw_io:
+ case Hexagon::L4_iand_memopb_io:
+ case Hexagon::L4_iand_memoph_io:
+ case Hexagon::L4_iand_memopw_io:
+ return true;
+ }
+ return false;
}
-// Returns true if an instruction is predicated irrespective of the predicate
-// sense. For example, all of the following will return true.
-// if (p0) R1 = add(R2, R3)
-// if (!p0) R1 = add(R2, R3)
-// if (p0.new) R1 = add(R2, R3)
-// if (!p0.new) R1 = add(R2, R3)
-bool HexagonInstrInfo::isPredicated(const MachineInstr *MI) const {
- const uint64_t F = MI->getDesc().TSFlags;
- return ((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask);
+bool HexagonInstrInfo::isNewValue(const MachineInstr* MI) const {
+ const uint64_t F = MI->getDesc().TSFlags;
+ return (F >> HexagonII::NewValuePos) & HexagonII::NewValueMask;
}
-bool HexagonInstrInfo::isPredicated(unsigned Opcode) const {
+
+bool HexagonInstrInfo::isNewValue(unsigned Opcode) const {
const uint64_t F = get(Opcode).TSFlags;
+ return (F >> HexagonII::NewValuePos) & HexagonII::NewValueMask;
+}
+
- return ((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask);
+bool HexagonInstrInfo::isNewValueInst(const MachineInstr *MI) const {
+ return isNewValueJump(MI) || isNewValueStore(MI);
}
-bool HexagonInstrInfo::isPredicatedTrue(const MachineInstr *MI) const {
- const uint64_t F = MI->getDesc().TSFlags;
- assert(isPredicated(MI));
- return (!((F >> HexagonII::PredicatedFalsePos) &
- HexagonII::PredicatedFalseMask));
+bool HexagonInstrInfo::isNewValueJump(const MachineInstr *MI) const {
+ return isNewValue(MI) && MI->isBranch();
}
-bool HexagonInstrInfo::isPredicatedTrue(unsigned Opcode) const {
+
+bool HexagonInstrInfo::isNewValueJump(unsigned Opcode) const {
+ return isNewValue(Opcode) && get(Opcode).isBranch() && isPredicated(Opcode);
+}
+
+
+bool HexagonInstrInfo::isNewValueStore(const MachineInstr *MI) const {
+ const uint64_t F = MI->getDesc().TSFlags;
+ return (F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask;
+}
+
+
+bool HexagonInstrInfo::isNewValueStore(unsigned Opcode) const {
const uint64_t F = get(Opcode).TSFlags;
+ return (F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask;
+}
- // Make sure that the instruction is predicated.
- assert((F>> HexagonII::PredicatedPos) & HexagonII::PredicatedMask);
- return (!((F >> HexagonII::PredicatedFalsePos) &
- HexagonII::PredicatedFalseMask));
+
+// Returns true if a particular operand is extendable for an instruction.
+bool HexagonInstrInfo::isOperandExtended(const MachineInstr *MI,
+ unsigned OperandNum) const {
+ const uint64_t F = MI->getDesc().TSFlags;
+ return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask)
+ == OperandNum;
}
+
+bool HexagonInstrInfo::isPostIncrement(const MachineInstr* MI) const {
+ return getAddrMode(MI) == HexagonII::PostInc;
+}
+
+
bool HexagonInstrInfo::isPredicatedNew(const MachineInstr *MI) const {
const uint64_t F = MI->getDesc().TSFlags;
-
assert(isPredicated(MI));
- return ((F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask);
+ return (F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask;
}
+
bool HexagonInstrInfo::isPredicatedNew(unsigned Opcode) const {
const uint64_t F = get(Opcode).TSFlags;
-
assert(isPredicated(Opcode));
- return ((F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask);
+ return (F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask;
}
-// Returns true, if a ST insn can be promoted to a new-value store.
-bool HexagonInstrInfo::mayBeNewStore(const MachineInstr *MI) const {
+
+bool HexagonInstrInfo::isPredicatedTrue(const MachineInstr *MI) const {
const uint64_t F = MI->getDesc().TSFlags;
+ return !((F >> HexagonII::PredicatedFalsePos) &
+ HexagonII::PredicatedFalseMask);
+}
+
- return ((F >> HexagonII::mayNVStorePos) &
- HexagonII::mayNVStoreMask);
+bool HexagonInstrInfo::isPredicatedTrue(unsigned Opcode) const {
+ const uint64_t F = get(Opcode).TSFlags;
+ // Make sure that the instruction is predicated.
+  assert((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask);
+ return !((F >> HexagonII::PredicatedFalsePos) &
+ HexagonII::PredicatedFalseMask);
}
-bool
-HexagonInstrInfo::DefinesPredicate(MachineInstr *MI,
- std::vector<MachineOperand> &Pred) const {
- for (unsigned oper = 0; oper < MI->getNumOperands(); ++oper) {
- MachineOperand MO = MI->getOperand(oper);
- if (MO.isReg() && MO.isDef()) {
- const TargetRegisterClass* RC = RI.getMinimalPhysRegClass(MO.getReg());
- if (RC == &Hexagon::PredRegsRegClass) {
- Pred.push_back(MO);
- return true;
- }
- }
+
+bool HexagonInstrInfo::isPredicated(unsigned Opcode) const {
+ const uint64_t F = get(Opcode).TSFlags;
+ return (F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask;
+}
+
+
+bool HexagonInstrInfo::isPredicateLate(unsigned Opcode) const {
+ const uint64_t F = get(Opcode).TSFlags;
+ return ~(F >> HexagonII::PredicateLatePos) & HexagonII::PredicateLateMask;
+}
+
+
+bool HexagonInstrInfo::isPredictedTaken(unsigned Opcode) const {
+ const uint64_t F = get(Opcode).TSFlags;
+ assert(get(Opcode).isBranch() &&
+ (isPredicatedNew(Opcode) || isNewValue(Opcode)));
+ return (F >> HexagonII::TakenPos) & HexagonII::TakenMask;
+}
+
+
+bool HexagonInstrInfo::isSaveCalleeSavedRegsCall(const MachineInstr *MI) const {
+ return MI->getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4 ||
+ MI->getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4_EXT;
+}
+
+
+bool HexagonInstrInfo::isSolo(const MachineInstr* MI) const {
+ const uint64_t F = MI->getDesc().TSFlags;
+ return (F >> HexagonII::SoloPos) & HexagonII::SoloMask;
+}
+
+
+bool HexagonInstrInfo::isSpillPredRegOp(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ case Hexagon::STriw_pred :
+ case Hexagon::LDriw_pred :
+ return true;
+ default:
+ return false;
}
- return false;
}
-bool
-HexagonInstrInfo::
-SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
- ArrayRef<MachineOperand> Pred2) const {
- // TODO: Fix this
- return false;
+// Returns true when SU has a timing class TC1.
+bool HexagonInstrInfo::isTC1(const MachineInstr *MI) const {
+ unsigned SchedClass = MI->getDesc().getSchedClass();
+ switch (SchedClass) {
+ case Hexagon::Sched::ALU32_2op_tc_1_SLOT0123:
+ case Hexagon::Sched::ALU32_3op_tc_1_SLOT0123:
+ case Hexagon::Sched::ALU32_ADDI_tc_1_SLOT0123:
+ case Hexagon::Sched::ALU64_tc_1_SLOT23:
+ case Hexagon::Sched::EXTENDER_tc_1_SLOT0123:
+ //case Hexagon::Sched::M_tc_1_SLOT23:
+ case Hexagon::Sched::S_2op_tc_1_SLOT23:
+ case Hexagon::Sched::S_3op_tc_1_SLOT23:
+ return true;
+
+ default:
+ return false;
+ }
}
-//
-// We indicate that we want to reverse the branch by
-// inserting the reversed branching opcode.
-//
-bool HexagonInstrInfo::ReverseBranchCondition(
- SmallVectorImpl<MachineOperand> &Cond) const {
- if (Cond.empty())
+bool HexagonInstrInfo::isTC2(const MachineInstr *MI) const {
+ unsigned SchedClass = MI->getDesc().getSchedClass();
+ switch (SchedClass) {
+ case Hexagon::Sched::ALU32_3op_tc_2_SLOT0123:
+ case Hexagon::Sched::ALU64_tc_2_SLOT23:
+ case Hexagon::Sched::CR_tc_2_SLOT3:
+ case Hexagon::Sched::M_tc_2_SLOT23:
+ case Hexagon::Sched::S_2op_tc_2_SLOT23:
+ case Hexagon::Sched::S_3op_tc_2_SLOT23:
return true;
- assert(Cond[0].isImm() && "First entry in the cond vector not imm-val");
- Opcode_t opcode = Cond[0].getImm();
- //unsigned temp;
- assert(get(opcode).isBranch() && "Should be a branching condition.");
- if (isEndLoopN(opcode))
+
+ default:
+ return false;
+ }
+}
+
+
+bool HexagonInstrInfo::isTC2Early(const MachineInstr *MI) const {
+ unsigned SchedClass = MI->getDesc().getSchedClass();
+ switch (SchedClass) {
+ case Hexagon::Sched::ALU32_2op_tc_2early_SLOT0123:
+ case Hexagon::Sched::ALU32_3op_tc_2early_SLOT0123:
+ case Hexagon::Sched::ALU64_tc_2early_SLOT23:
+ case Hexagon::Sched::CR_tc_2early_SLOT23:
+ case Hexagon::Sched::CR_tc_2early_SLOT3:
+ case Hexagon::Sched::J_tc_2early_SLOT0123:
+ case Hexagon::Sched::J_tc_2early_SLOT2:
+ case Hexagon::Sched::J_tc_2early_SLOT23:
+ case Hexagon::Sched::S_2op_tc_2early_SLOT23:
+ case Hexagon::Sched::S_3op_tc_2early_SLOT23:
return true;
- Opcode_t NewOpcode = getInvertedPredicatedOpcode(opcode);
- Cond[0].setImm(NewOpcode);
- return false;
+
+ default:
+ return false;
+ }
+}
+
+
+bool HexagonInstrInfo::isTC4x(const MachineInstr *MI) const {
+ if (!MI)
+ return false;
+
+ unsigned SchedClass = MI->getDesc().getSchedClass();
+ return SchedClass == Hexagon::Sched::M_tc_3or4x_SLOT23;
}
-bool HexagonInstrInfo::
-isProfitableToDupForIfCvt(MachineBasicBlock &MBB,unsigned NumInstrs,
- const BranchProbability &Probability) const {
- return (NumInstrs <= 4);
+bool HexagonInstrInfo::isV60VectorInstruction(const MachineInstr *MI) const {
+ if (!MI)
+ return false;
+
+ const uint64_t V = getType(MI);
+ return HexagonII::TypeCVI_FIRST <= V && V <= HexagonII::TypeCVI_LAST;
}
-bool HexagonInstrInfo::isDeallocRet(const MachineInstr *MI) const {
- switch (MI->getOpcode()) {
- default: return false;
- case Hexagon::L4_return:
- case Hexagon::L4_return_t:
- case Hexagon::L4_return_f:
- case Hexagon::L4_return_tnew_pnt:
- case Hexagon::L4_return_fnew_pnt:
- case Hexagon::L4_return_tnew_pt:
- case Hexagon::L4_return_fnew_pt:
- return true;
+
+// Check if the Offset is a valid auto-inc imm by Load/Store Type.
+//
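+// For example, a 32-bit (MVT::i32) auto-increment must be a multiple of 4
+// within [Hexagon_MEMW_AUTOINC_MIN, Hexagon_MEMW_AUTOINC_MAX], and a 64-byte
+// HVX vector access must use a multiple of 64 within the MEMV auto-inc range.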
+bool HexagonInstrInfo::isValidAutoIncImm(const EVT VT, const int Offset) const {
+ if (VT == MVT::v16i32 || VT == MVT::v8i64 ||
+ VT == MVT::v32i16 || VT == MVT::v64i8) {
+ return (Offset >= Hexagon_MEMV_AUTOINC_MIN &&
+ Offset <= Hexagon_MEMV_AUTOINC_MAX &&
+ (Offset & 0x3f) == 0);
+ }
+ // 128B
+ if (VT == MVT::v32i32 || VT == MVT::v16i64 ||
+ VT == MVT::v64i16 || VT == MVT::v128i8) {
+ return (Offset >= Hexagon_MEMV_AUTOINC_MIN_128B &&
+ Offset <= Hexagon_MEMV_AUTOINC_MAX_128B &&
+ (Offset & 0x7f) == 0);
+ }
+ if (VT == MVT::i64) {
+ return (Offset >= Hexagon_MEMD_AUTOINC_MIN &&
+ Offset <= Hexagon_MEMD_AUTOINC_MAX &&
+ (Offset & 0x7) == 0);
+ }
+ if (VT == MVT::i32) {
+ return (Offset >= Hexagon_MEMW_AUTOINC_MIN &&
+ Offset <= Hexagon_MEMW_AUTOINC_MAX &&
+ (Offset & 0x3) == 0);
+ }
+ if (VT == MVT::i16) {
+ return (Offset >= Hexagon_MEMH_AUTOINC_MIN &&
+ Offset <= Hexagon_MEMH_AUTOINC_MAX &&
+ (Offset & 0x1) == 0);
+ }
+ if (VT == MVT::i8) {
+ return (Offset >= Hexagon_MEMB_AUTOINC_MIN &&
+ Offset <= Hexagon_MEMB_AUTOINC_MAX);
}
+ llvm_unreachable("Not an auto-inc opc!");
}
@@ -1222,6 +2329,40 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
// misaligns with respect to load size.
switch (Opcode) {
+ case Hexagon::STriq_pred_V6:
+ case Hexagon::STriq_pred_vec_V6:
+ case Hexagon::STriv_pseudo_V6:
+ case Hexagon::STrivv_pseudo_V6:
+ case Hexagon::LDriq_pred_V6:
+ case Hexagon::LDriq_pred_vec_V6:
+ case Hexagon::LDriv_pseudo_V6:
+ case Hexagon::LDrivv_pseudo_V6:
+ case Hexagon::LDrivv_indexed:
+ case Hexagon::STrivv_indexed:
+ case Hexagon::V6_vL32b_ai:
+ case Hexagon::V6_vS32b_ai:
+ case Hexagon::V6_vL32Ub_ai:
+ case Hexagon::V6_vS32Ub_ai:
+ return (Offset >= Hexagon_MEMV_OFFSET_MIN) &&
+ (Offset <= Hexagon_MEMV_OFFSET_MAX);
+
+ case Hexagon::STriq_pred_V6_128B:
+ case Hexagon::STriq_pred_vec_V6_128B:
+ case Hexagon::STriv_pseudo_V6_128B:
+ case Hexagon::STrivv_pseudo_V6_128B:
+ case Hexagon::LDriq_pred_V6_128B:
+ case Hexagon::LDriq_pred_vec_V6_128B:
+ case Hexagon::LDriv_pseudo_V6_128B:
+ case Hexagon::LDrivv_pseudo_V6_128B:
+ case Hexagon::LDrivv_indexed_128B:
+ case Hexagon::STrivv_indexed_128B:
+ case Hexagon::V6_vL32b_ai_128B:
+ case Hexagon::V6_vS32b_ai_128B:
+ case Hexagon::V6_vL32Ub_ai_128B:
+ case Hexagon::V6_vS32Ub_ai_128B:
+ return (Offset >= Hexagon_MEMV_OFFSET_MIN_128B) &&
+ (Offset <= Hexagon_MEMV_OFFSET_MAX_128B);
+
case Hexagon::J2_loop0i:
case Hexagon::J2_loop1i:
return isUInt<10>(Offset);
@@ -1248,8 +2389,8 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
(Offset <= Hexagon_MEMH_OFFSET_MAX);
case Hexagon::L2_loadrb_io:
- case Hexagon::S2_storerb_io:
case Hexagon::L2_loadrub_io:
+ case Hexagon::S2_storerb_io:
return (Offset >= Hexagon_MEMB_OFFSET_MIN) &&
(Offset <= Hexagon_MEMB_OFFSET_MAX);
@@ -1257,28 +2398,28 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
return (Offset >= Hexagon_ADDI_OFFSET_MIN) &&
(Offset <= Hexagon_ADDI_OFFSET_MAX);
- case Hexagon::L4_iadd_memopw_io:
- case Hexagon::L4_isub_memopw_io:
- case Hexagon::L4_add_memopw_io:
- case Hexagon::L4_sub_memopw_io:
- case Hexagon::L4_and_memopw_io:
- case Hexagon::L4_or_memopw_io:
+ case Hexagon::L4_iadd_memopw_io :
+ case Hexagon::L4_isub_memopw_io :
+ case Hexagon::L4_add_memopw_io :
+ case Hexagon::L4_sub_memopw_io :
+ case Hexagon::L4_and_memopw_io :
+ case Hexagon::L4_or_memopw_io :
return (0 <= Offset && Offset <= 255);
- case Hexagon::L4_iadd_memoph_io:
- case Hexagon::L4_isub_memoph_io:
- case Hexagon::L4_add_memoph_io:
- case Hexagon::L4_sub_memoph_io:
- case Hexagon::L4_and_memoph_io:
- case Hexagon::L4_or_memoph_io:
+ case Hexagon::L4_iadd_memoph_io :
+ case Hexagon::L4_isub_memoph_io :
+ case Hexagon::L4_add_memoph_io :
+ case Hexagon::L4_sub_memoph_io :
+ case Hexagon::L4_and_memoph_io :
+ case Hexagon::L4_or_memoph_io :
return (0 <= Offset && Offset <= 127);
- case Hexagon::L4_iadd_memopb_io:
- case Hexagon::L4_isub_memopb_io:
- case Hexagon::L4_add_memopb_io:
- case Hexagon::L4_sub_memopb_io:
- case Hexagon::L4_and_memopb_io:
- case Hexagon::L4_or_memopb_io:
+ case Hexagon::L4_iadd_memopb_io :
+ case Hexagon::L4_isub_memopb_io :
+ case Hexagon::L4_add_memopb_io :
+ case Hexagon::L4_sub_memopb_io :
+ case Hexagon::L4_and_memopb_io :
+ case Hexagon::L4_or_memopb_io :
return (0 <= Offset && Offset <= 63);
// LDri_pred and STriw_pred are pseudo operations, so it has to take offset of
@@ -1291,223 +2432,556 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
case Hexagon::TFR_FIA:
case Hexagon::INLINEASM:
return true;
- }
+
+ case Hexagon::L2_ploadrbt_io:
+ case Hexagon::L2_ploadrbf_io:
+ case Hexagon::L2_ploadrubt_io:
+ case Hexagon::L2_ploadrubf_io:
+ case Hexagon::S2_pstorerbt_io:
+ case Hexagon::S2_pstorerbf_io:
+ case Hexagon::S4_storeirb_io:
+ case Hexagon::S4_storeirbt_io:
+ case Hexagon::S4_storeirbf_io:
+ return isUInt<6>(Offset);
+
+ case Hexagon::L2_ploadrht_io:
+ case Hexagon::L2_ploadrhf_io:
+ case Hexagon::L2_ploadruht_io:
+ case Hexagon::L2_ploadruhf_io:
+ case Hexagon::S2_pstorerht_io:
+ case Hexagon::S2_pstorerhf_io:
+ case Hexagon::S4_storeirh_io:
+ case Hexagon::S4_storeirht_io:
+ case Hexagon::S4_storeirhf_io:
+ return isShiftedUInt<6,1>(Offset);
+
+ case Hexagon::L2_ploadrit_io:
+ case Hexagon::L2_ploadrif_io:
+ case Hexagon::S2_pstorerit_io:
+ case Hexagon::S2_pstorerif_io:
+ case Hexagon::S4_storeiri_io:
+ case Hexagon::S4_storeirit_io:
+ case Hexagon::S4_storeirif_io:
+ return isShiftedUInt<6,2>(Offset);
+
+ case Hexagon::L2_ploadrdt_io:
+ case Hexagon::L2_ploadrdf_io:
+ case Hexagon::S2_pstorerdt_io:
+ case Hexagon::S2_pstorerdf_io:
+ return isShiftedUInt<6,3>(Offset);
+ } // switch
llvm_unreachable("No offset range is defined for this opcode. "
"Please define it in the above switch statement!");
}
-//
-// Check if the Offset is a valid auto-inc imm by Load/Store Type.
-//
-bool HexagonInstrInfo::
-isValidAutoIncImm(const EVT VT, const int Offset) const {
+bool HexagonInstrInfo::isVecAcc(const MachineInstr *MI) const {
+ return MI && isV60VectorInstruction(MI) && isAccumulator(MI);
+}
- if (VT == MVT::i64) {
- return (Offset >= Hexagon_MEMD_AUTOINC_MIN &&
- Offset <= Hexagon_MEMD_AUTOINC_MAX &&
- (Offset & 0x7) == 0);
- }
- if (VT == MVT::i32) {
- return (Offset >= Hexagon_MEMW_AUTOINC_MIN &&
- Offset <= Hexagon_MEMW_AUTOINC_MAX &&
- (Offset & 0x3) == 0);
- }
- if (VT == MVT::i16) {
- return (Offset >= Hexagon_MEMH_AUTOINC_MIN &&
- Offset <= Hexagon_MEMH_AUTOINC_MAX &&
- (Offset & 0x1) == 0);
- }
- if (VT == MVT::i8) {
- return (Offset >= Hexagon_MEMB_AUTOINC_MIN &&
- Offset <= Hexagon_MEMB_AUTOINC_MAX);
+
+bool HexagonInstrInfo::isVecALU(const MachineInstr *MI) const {
+ if (!MI)
+ return false;
+ const uint64_t F = get(MI->getOpcode()).TSFlags;
+ const uint64_t V = ((F >> HexagonII::TypePos) & HexagonII::TypeMask);
+ return
+ V == HexagonII::TypeCVI_VA ||
+ V == HexagonII::TypeCVI_VA_DV;
+}
+
+
+bool HexagonInstrInfo::isVecUsableNextPacket(const MachineInstr *ProdMI,
+ const MachineInstr *ConsMI) const {
+ if (EnableACCForwarding && isVecAcc(ProdMI) && isVecAcc(ConsMI))
+ return true;
+
+ if (EnableALUForwarding && (isVecALU(ConsMI) || isLateSourceInstr(ConsMI)))
+ return true;
+
+ if (mayBeNewStore(ConsMI))
+ return true;
+
+ return false;
+}
+
+
+/// \brief Can these instructions execute at the same time in a bundle.
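+/// For example, an add that defines R2 followed by a store of R2 that may be
+/// promoted to a new-value store can be bundled, since the stored value is
+/// produced in the same packet.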
+bool HexagonInstrInfo::canExecuteInBundle(const MachineInstr *First,
+ const MachineInstr *Second) const {
+ if (DisableNVSchedule)
+ return false;
+ if (mayBeNewStore(Second)) {
+ // Make sure the definition of the first instruction is the value being
+ // stored.
+ const MachineOperand &Stored =
+ Second->getOperand(Second->getNumOperands() - 1);
+ if (!Stored.isReg())
+ return false;
+ for (unsigned i = 0, e = First->getNumOperands(); i < e; ++i) {
+ const MachineOperand &Op = First->getOperand(i);
+ if (Op.isReg() && Op.isDef() && Op.getReg() == Stored.getReg())
+ return true;
+ }
}
- llvm_unreachable("Not an auto-inc opc!");
+ return false;
+}
+
+
+bool HexagonInstrInfo::hasEHLabel(const MachineBasicBlock *B) const {
+ for (auto &I : *B)
+ if (I.isEHLabel())
+ return true;
+ return false;
}
-bool HexagonInstrInfo::
-isMemOp(const MachineInstr *MI) const {
-// return MI->getDesc().mayLoad() && MI->getDesc().mayStore();
-
- switch (MI->getOpcode())
- {
- default: return false;
- case Hexagon::L4_iadd_memopw_io:
- case Hexagon::L4_isub_memopw_io:
- case Hexagon::L4_add_memopw_io:
- case Hexagon::L4_sub_memopw_io:
- case Hexagon::L4_and_memopw_io:
- case Hexagon::L4_or_memopw_io:
- case Hexagon::L4_iadd_memoph_io:
- case Hexagon::L4_isub_memoph_io:
- case Hexagon::L4_add_memoph_io:
- case Hexagon::L4_sub_memoph_io:
- case Hexagon::L4_and_memoph_io:
- case Hexagon::L4_or_memoph_io:
- case Hexagon::L4_iadd_memopb_io:
- case Hexagon::L4_isub_memopb_io:
- case Hexagon::L4_add_memopb_io:
- case Hexagon::L4_sub_memopb_io:
- case Hexagon::L4_and_memopb_io:
- case Hexagon::L4_or_memopb_io:
- case Hexagon::L4_ior_memopb_io:
- case Hexagon::L4_ior_memoph_io:
- case Hexagon::L4_ior_memopw_io:
- case Hexagon::L4_iand_memopb_io:
- case Hexagon::L4_iand_memoph_io:
- case Hexagon::L4_iand_memopw_io:
+// Returns true if an instruction can be converted into a non-extended
+// equivalent instruction.
+bool HexagonInstrInfo::hasNonExtEquivalent(const MachineInstr *MI) const {
+ short NonExtOpcode;
+ // Check if the instruction has a register form that uses register in place
+ // of the extended operand, if so return that as the non-extended form.
+ if (Hexagon::getRegForm(MI->getOpcode()) >= 0)
+ return true;
+
+ if (MI->getDesc().mayLoad() || MI->getDesc().mayStore()) {
+ // Check addressing mode and retrieve non-ext equivalent instruction.
+
+ switch (getAddrMode(MI)) {
+ case HexagonII::Absolute :
+ // Load/store with absolute addressing mode can be converted into
+ // base+offset mode.
+ NonExtOpcode = Hexagon::getBaseWithImmOffset(MI->getOpcode());
+ break;
+ case HexagonII::BaseImmOffset :
+ // Load/store with base+offset addressing mode can be converted into
+ // base+register offset addressing mode. However, the left shift operand
+ // should be set to 0.
+ NonExtOpcode = Hexagon::getBaseWithRegOffset(MI->getOpcode());
+ break;
+ case HexagonII::BaseLongOffset:
+ NonExtOpcode = Hexagon::getRegShlForm(MI->getOpcode());
+ break;
+ default:
+ return false;
+ }
+ if (NonExtOpcode < 0)
+ return false;
return true;
}
return false;
}
-bool HexagonInstrInfo::
-isSpillPredRegOp(const MachineInstr *MI) const {
- switch (MI->getOpcode()) {
- default: return false;
- case Hexagon::STriw_pred :
- case Hexagon::LDriw_pred :
+bool HexagonInstrInfo::hasPseudoInstrPair(const MachineInstr *MI) const {
+ return Hexagon::getRealHWInstr(MI->getOpcode(),
+ Hexagon::InstrType_Pseudo) >= 0;
+}
+
+
+bool HexagonInstrInfo::hasUncondBranch(const MachineBasicBlock *B)
+ const {
+ MachineBasicBlock::const_iterator I = B->getFirstTerminator(), E = B->end();
+ while (I != E) {
+ if (I->isBarrier())
return true;
+ ++I;
}
+ return false;
}
-bool HexagonInstrInfo::isNewValueJumpCandidate(const MachineInstr *MI) const {
- switch (MI->getOpcode()) {
- default: return false;
- case Hexagon::C2_cmpeq:
- case Hexagon::C2_cmpeqi:
- case Hexagon::C2_cmpgt:
- case Hexagon::C2_cmpgti:
- case Hexagon::C2_cmpgtu:
- case Hexagon::C2_cmpgtui:
+
+// Returns true if a load instruction can be promoted to a .cur load.
+bool HexagonInstrInfo::mayBeCurLoad(const MachineInstr *MI) const {
+ auto &HST = MI->getParent()->getParent()->getSubtarget<HexagonSubtarget>();
+ const uint64_t F = MI->getDesc().TSFlags;
+ return ((F >> HexagonII::mayCVLoadPos) & HexagonII::mayCVLoadMask) &&
+ HST.hasV60TOps();
+}
+
+
+// Returns true if a store instruction can be promoted to a new-value store.
+bool HexagonInstrInfo::mayBeNewStore(const MachineInstr *MI) const {
+ const uint64_t F = MI->getDesc().TSFlags;
+ return (F >> HexagonII::mayNVStorePos) & HexagonII::mayNVStoreMask;
+}
+
+
+bool HexagonInstrInfo::producesStall(const MachineInstr *ProdMI,
+ const MachineInstr *ConsMI) const {
+ // There is no stall when ProdMI is not a V60 vector.
+ if (!isV60VectorInstruction(ProdMI))
+ return false;
+
+ // There is no stall when ProdMI and ConsMI are not dependent.
+ if (!isDependent(ProdMI, ConsMI))
+ return false;
+
+ // When Forward Scheduling is enabled, there is no stall if ProdMI and ConsMI
+ // are scheduled in consecutive packets.
+ if (isVecUsableNextPacket(ProdMI, ConsMI))
+ return false;
+
+ return true;
+}
+
+
+bool HexagonInstrInfo::producesStall(const MachineInstr *MI,
+ MachineBasicBlock::const_instr_iterator BII) const {
+ // There is no stall when MI is not a V60 vector instruction.
+ if (!isV60VectorInstruction(MI))
+ return false;
+
+ MachineBasicBlock::const_instr_iterator MII = BII;
+ MachineBasicBlock::const_instr_iterator MIE = MII->getParent()->instr_end();
+
+ if (!(*MII).isBundle()) {
+ const MachineInstr *J = &*MII;
+ if (!isV60VectorInstruction(J))
+ return false;
+ else if (isVecUsableNextPacket(J, MI))
+ return false;
+ return true;
+ }
+
+ for (++MII; MII != MIE && MII->isInsideBundle(); ++MII) {
+ const MachineInstr *J = &*MII;
+ if (producesStall(J, MI))
return true;
}
+ return false;
+}
+
+
+bool HexagonInstrInfo::predCanBeUsedAsDotNew(const MachineInstr *MI,
+ unsigned PredReg) const {
+ for (unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) {
+ const MachineOperand &MO = MI->getOperand(opNum);
+ if (MO.isReg() && MO.isDef() && MO.isImplicit() && (MO.getReg() == PredReg))
+ return false; // Predicate register must be explicitly defined.
+ }
+
+ // The Hexagon Programmer's Reference says that decbin, memw_locked, and
+ // memd_locked cannot be used as .new either, but we don't seem to have
+ // these instructions defined.
+ return MI->getOpcode() != Hexagon::A4_tlbmatch;
+}
+
+
+bool HexagonInstrInfo::PredOpcodeHasJMP_c(unsigned Opcode) const {
+ return (Opcode == Hexagon::J2_jumpt) ||
+ (Opcode == Hexagon::J2_jumpf) ||
+ (Opcode == Hexagon::J2_jumptnew) ||
+ (Opcode == Hexagon::J2_jumpfnew) ||
+ (Opcode == Hexagon::J2_jumptnewpt) ||
+ (Opcode == Hexagon::J2_jumpfnewpt);
+}
+
+
+bool HexagonInstrInfo::predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const {
+ if (Cond.empty() || !isPredicated(Cond[0].getImm()))
+ return false;
+ return !isPredicatedTrue(Cond[0].getImm());
+}
+
+
+unsigned HexagonInstrInfo::getAddrMode(const MachineInstr* MI) const {
+ const uint64_t F = MI->getDesc().TSFlags;
+ return (F >> HexagonII::AddrModePos) & HexagonII::AddrModeMask;
+}
+
+
+// Returns the base register in a memory access (load/store). The offset is
+// returned in Offset and the access size is returned in AccessSize.
+unsigned HexagonInstrInfo::getBaseAndOffset(const MachineInstr *MI,
+ int &Offset, unsigned &AccessSize) const {
+ // Return if it is not a base+offset type instruction or a MemOp.
+ if (getAddrMode(MI) != HexagonII::BaseImmOffset &&
+ getAddrMode(MI) != HexagonII::BaseLongOffset &&
+ !isMemOp(MI) && !isPostIncrement(MI))
+ return 0;
+
+ // Since it is a memory access instruction, getMemAccessSize() should never
+ // return 0.
+ assert (getMemAccessSize(MI) &&
+ "BaseImmOffset or BaseLongOffset or MemOp without accessSize");
+
+ // Return values of getMemAccessSize() are:
+ // 0 - Checked in the assert above.
+ // 1, 2, 3, 4 & 7, 8 - The statement below is correct for all of these.
+ // MemAccessSize is represented as 1+log2(N), where N is the access size in bytes.
+ AccessSize = (1U << (getMemAccessSize(MI) - 1));
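+ // For example, a word access (MemAccessSize == 3) yields an AccessSize of
+ // 4 bytes, and a 64-byte HVX vector access has MemAccessSize == 7.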
+
+ unsigned basePos = 0, offsetPos = 0;
+ if (!getBaseAndOffsetPosition(MI, basePos, offsetPos))
+ return 0;
+
+ // Post increment updates its EA after the mem access,
+ // so we need to treat its offset as zero.
+ if (isPostIncrement(MI))
+ Offset = 0;
+ else {
+ Offset = MI->getOperand(offsetPos).getImm();
+ }
+
+ return MI->getOperand(basePos).getReg();
+}
+
+
+/// Return the position of the base and offset operands for this instruction.
+bool HexagonInstrInfo::getBaseAndOffsetPosition(const MachineInstr *MI,
+ unsigned &BasePos, unsigned &OffsetPos) const {
+ // Deal with memops first.
+ if (isMemOp(MI)) {
+ assert (MI->getOperand(0).isReg() && MI->getOperand(1).isImm() &&
+ "Bad Memop.");
+ BasePos = 0;
+ OffsetPos = 1;
+ } else if (MI->mayStore()) {
+ BasePos = 0;
+ OffsetPos = 1;
+ } else if (MI->mayLoad()) {
+ BasePos = 1;
+ OffsetPos = 2;
+ } else
+ return false;
+
+ if (isPredicated(MI)) {
+ BasePos++;
+ OffsetPos++;
+ }
+ if (isPostIncrement(MI)) {
+ BasePos++;
+ OffsetPos++;
+ }
+
+ if (!MI->getOperand(BasePos).isReg() || !MI->getOperand(OffsetPos).isImm())
+ return false;
+
+ return true;
+}
+
+
+// Inserts branching instructions in reverse order of their occurrence.
+// e.g. jump_t t1 (i1)
+// jump t2 (i2)
+// Jumpers = {i2, i1}
+SmallVector<MachineInstr*, 2> HexagonInstrInfo::getBranchingInstrs(
+ MachineBasicBlock& MBB) const {
+ SmallVector<MachineInstr*, 2> Jumpers;
+ // If the block has no terminators, it just falls into the block after it.
+ MachineBasicBlock::instr_iterator I = MBB.instr_end();
+ if (I == MBB.instr_begin())
+ return Jumpers;
+
+ // A basic block may look like this:
+ //
+ // [ insn
+ // EH_LABEL
+ // insn
+ // insn
+ // insn
+ // EH_LABEL
+ // insn ]
+ //
+ // It has two successors but does not have a terminator.
+ // We don't know how to handle such a block.
+ do {
+ --I;
+ if (I->isEHLabel())
+ return Jumpers;
+ } while (I != MBB.instr_begin());
+
+ I = MBB.instr_end();
+ --I;
+
+ while (I->isDebugValue()) {
+ if (I == MBB.instr_begin())
+ return Jumpers;
+ --I;
+ }
+ if (!isUnpredicatedTerminator(&*I))
+ return Jumpers;
+
+ // Get the last instruction in the block.
+ MachineInstr *LastInst = &*I;
+ Jumpers.push_back(LastInst);
+ MachineInstr *SecondLastInst = nullptr;
+ // Find one more terminator if present.
+ do {
+ if (&*I != LastInst && !I->isBundle() && isUnpredicatedTerminator(&*I)) {
+ if (!SecondLastInst) {
+ SecondLastInst = &*I;
+ Jumpers.push_back(SecondLastInst);
+ } else // This is a third branch.
+ return Jumpers;
+ }
+ if (I == MBB.instr_begin())
+ break;
+ --I;
+ } while (true);
+ return Jumpers;
}
-bool HexagonInstrInfo::
-isConditionalTransfer (const MachineInstr *MI) const {
+
+// Returns Operand Index for the constant extended instruction.
+unsigned HexagonInstrInfo::getCExtOpNum(const MachineInstr *MI) const {
+ const uint64_t F = MI->getDesc().TSFlags;
+ return (F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask;
+}
+
+// See if instruction could potentially be a compound candidate.
+// If so, return its group. Zero otherwise.
+HexagonII::CompoundGroup HexagonInstrInfo::getCompoundCandidateGroup(
+ const MachineInstr *MI) const {
+ unsigned DstReg, SrcReg, Src1Reg, Src2Reg;
+
switch (MI->getOpcode()) {
- default: return false;
- case Hexagon::A2_tfrt:
- case Hexagon::A2_tfrf:
- case Hexagon::C2_cmoveit:
- case Hexagon::C2_cmoveif:
- case Hexagon::A2_tfrtnew:
- case Hexagon::A2_tfrfnew:
- case Hexagon::C2_cmovenewit:
- case Hexagon::C2_cmovenewif:
- return true;
+ default:
+ return HexagonII::HCG_None;
+ //
+ // Compound pairs.
+ // "p0=cmp.eq(Rs16,Rt16); if (p0.new) jump:nt #r9:2"
+ // "Rd16=#U6 ; jump #r9:2"
+ // "Rd16=Rs16 ; jump #r9:2"
+ //
+ case Hexagon::C2_cmpeq:
+ case Hexagon::C2_cmpgt:
+ case Hexagon::C2_cmpgtu:
+ DstReg = MI->getOperand(0).getReg();
+ Src1Reg = MI->getOperand(1).getReg();
+ Src2Reg = MI->getOperand(2).getReg();
+ if (Hexagon::PredRegsRegClass.contains(DstReg) &&
+ (Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) &&
+ isIntRegForSubInst(Src1Reg) && isIntRegForSubInst(Src2Reg))
+ return HexagonII::HCG_A;
+ break;
+ case Hexagon::C2_cmpeqi:
+ case Hexagon::C2_cmpgti:
+ case Hexagon::C2_cmpgtui:
+ // P0 = cmp.eq(Rs,#u2)
+ DstReg = MI->getOperand(0).getReg();
+ SrcReg = MI->getOperand(1).getReg();
+ if (Hexagon::PredRegsRegClass.contains(DstReg) &&
+ (Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) &&
+ isIntRegForSubInst(SrcReg) && MI->getOperand(2).isImm() &&
+ ((isUInt<5>(MI->getOperand(2).getImm())) ||
+ (MI->getOperand(2).getImm() == -1)))
+ return HexagonII::HCG_A;
+ break;
+ case Hexagon::A2_tfr:
+ // Rd = Rs
+ DstReg = MI->getOperand(0).getReg();
+ SrcReg = MI->getOperand(1).getReg();
+ if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg))
+ return HexagonII::HCG_A;
+ break;
+ case Hexagon::A2_tfrsi:
+ // Rd = #u6
+ // Do not test for #u6 size since the const is getting extended
+ // regardless and compound could be formed.
+ DstReg = MI->getOperand(0).getReg();
+ if (isIntRegForSubInst(DstReg))
+ return HexagonII::HCG_A;
+ break;
+ case Hexagon::S2_tstbit_i:
+ DstReg = MI->getOperand(0).getReg();
+ Src1Reg = MI->getOperand(1).getReg();
+ if (Hexagon::PredRegsRegClass.contains(DstReg) &&
+ (Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) &&
+ MI->getOperand(2).isImm() &&
+ isIntRegForSubInst(Src1Reg) && (MI->getOperand(2).getImm() == 0))
+ return HexagonII::HCG_A;
+ break;
+ // The fact that .new form is used pretty much guarantees
+ // that predicate register will match. Nevertheless,
+ // there could be some false positives without additional
+ // checking.
+ case Hexagon::J2_jumptnew:
+ case Hexagon::J2_jumpfnew:
+ case Hexagon::J2_jumptnewpt:
+ case Hexagon::J2_jumpfnewpt:
+ Src1Reg = MI->getOperand(0).getReg();
+ if (Hexagon::PredRegsRegClass.contains(Src1Reg) &&
+ (Hexagon::P0 == Src1Reg || Hexagon::P1 == Src1Reg))
+ return HexagonII::HCG_B;
+ break;
+ // Transfer and jump:
+ // Rd=#U6 ; jump #r9:2
+ // Rd=Rs ; jump #r9:2
+ // Do not test for jump range here.
+ case Hexagon::J2_jump:
+ case Hexagon::RESTORE_DEALLOC_RET_JMP_V4:
+ return HexagonII::HCG_C;
+ break;
}
+
+ return HexagonII::HCG_None;
+}
+
+
+// Returns -1 when there is no opcode found.
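+// At the moment only the C2_cmpeqi / J2_jumptnew pair is combined, producing
+// J4_cmpeqi_tp0_jump_nt or J4_cmpeqi_tp1_jump_nt depending on the predicate
+// register.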
+unsigned HexagonInstrInfo::getCompoundOpcode(const MachineInstr *GA,
+ const MachineInstr *GB) const {
+ assert(getCompoundCandidateGroup(GA) == HexagonII::HCG_A);
+ assert(getCompoundCandidateGroup(GB) == HexagonII::HCG_B);
+ if ((GA->getOpcode() != Hexagon::C2_cmpeqi) ||
+ (GB->getOpcode() != Hexagon::J2_jumptnew))
+ return -1;
+ unsigned DestReg = GA->getOperand(0).getReg();
+ if (!GB->readsRegister(DestReg))
+ return -1;
+ if (DestReg == Hexagon::P0)
+ return Hexagon::J4_cmpeqi_tp0_jump_nt;
+ if (DestReg == Hexagon::P1)
+ return Hexagon::J4_cmpeqi_tp1_jump_nt;
+ return -1;
}
-bool HexagonInstrInfo::isConditionalALU32 (const MachineInstr* MI) const {
- switch (MI->getOpcode())
- {
- default: return false;
- case Hexagon::A2_paddf:
- case Hexagon::A2_paddfnew:
- case Hexagon::A2_paddt:
- case Hexagon::A2_paddtnew:
- case Hexagon::A2_pandf:
- case Hexagon::A2_pandfnew:
- case Hexagon::A2_pandt:
- case Hexagon::A2_pandtnew:
- case Hexagon::A4_paslhf:
- case Hexagon::A4_paslhfnew:
- case Hexagon::A4_paslht:
- case Hexagon::A4_paslhtnew:
- case Hexagon::A4_pasrhf:
- case Hexagon::A4_pasrhfnew:
- case Hexagon::A4_pasrht:
- case Hexagon::A4_pasrhtnew:
- case Hexagon::A2_porf:
- case Hexagon::A2_porfnew:
- case Hexagon::A2_port:
- case Hexagon::A2_portnew:
- case Hexagon::A2_psubf:
- case Hexagon::A2_psubfnew:
- case Hexagon::A2_psubt:
- case Hexagon::A2_psubtnew:
- case Hexagon::A2_pxorf:
- case Hexagon::A2_pxorfnew:
- case Hexagon::A2_pxort:
- case Hexagon::A2_pxortnew:
- case Hexagon::A4_psxthf:
- case Hexagon::A4_psxthfnew:
- case Hexagon::A4_psxtht:
- case Hexagon::A4_psxthtnew:
- case Hexagon::A4_psxtbf:
- case Hexagon::A4_psxtbfnew:
- case Hexagon::A4_psxtbt:
- case Hexagon::A4_psxtbtnew:
- case Hexagon::A4_pzxtbf:
- case Hexagon::A4_pzxtbfnew:
- case Hexagon::A4_pzxtbt:
- case Hexagon::A4_pzxtbtnew:
- case Hexagon::A4_pzxthf:
- case Hexagon::A4_pzxthfnew:
- case Hexagon::A4_pzxtht:
- case Hexagon::A4_pzxthtnew:
- case Hexagon::A2_paddit:
- case Hexagon::A2_paddif:
- case Hexagon::C2_ccombinewt:
- case Hexagon::C2_ccombinewf:
- return true;
+
+int HexagonInstrInfo::getCondOpcode(int Opc, bool invertPredicate) const {
+ enum Hexagon::PredSense inPredSense;
+ inPredSense = invertPredicate ? Hexagon::PredSense_false :
+ Hexagon::PredSense_true;
+ int CondOpcode = Hexagon::getPredOpcode(Opc, inPredSense);
+ if (CondOpcode >= 0) // Valid Conditional opcode/instruction
+ return CondOpcode;
+
+ // This switch case will be removed once all the instructions have been
+ // modified to use relation maps.
+ switch(Opc) {
+ case Hexagon::TFRI_f:
+ return !invertPredicate ? Hexagon::TFRI_cPt_f :
+ Hexagon::TFRI_cNotPt_f;
}
+
+ llvm_unreachable("Unexpected predicable instruction");
}
-bool HexagonInstrInfo::
-isConditionalLoad (const MachineInstr* MI) const {
- switch (MI->getOpcode())
- {
- default: return false;
- case Hexagon::L2_ploadrdt_io :
- case Hexagon::L2_ploadrdf_io:
- case Hexagon::L2_ploadrit_io:
- case Hexagon::L2_ploadrif_io:
- case Hexagon::L2_ploadrht_io:
- case Hexagon::L2_ploadrhf_io:
- case Hexagon::L2_ploadrbt_io:
- case Hexagon::L2_ploadrbf_io:
- case Hexagon::L2_ploadruht_io:
- case Hexagon::L2_ploadruhf_io:
- case Hexagon::L2_ploadrubt_io:
- case Hexagon::L2_ploadrubf_io:
- case Hexagon::L2_ploadrdt_pi:
- case Hexagon::L2_ploadrdf_pi:
- case Hexagon::L2_ploadrit_pi:
- case Hexagon::L2_ploadrif_pi:
- case Hexagon::L2_ploadrht_pi:
- case Hexagon::L2_ploadrhf_pi:
- case Hexagon::L2_ploadrbt_pi:
- case Hexagon::L2_ploadrbf_pi:
- case Hexagon::L2_ploadruht_pi:
- case Hexagon::L2_ploadruhf_pi:
- case Hexagon::L2_ploadrubt_pi:
- case Hexagon::L2_ploadrubf_pi:
- case Hexagon::L4_ploadrdt_rr:
- case Hexagon::L4_ploadrdf_rr:
- case Hexagon::L4_ploadrbt_rr:
- case Hexagon::L4_ploadrbf_rr:
- case Hexagon::L4_ploadrubt_rr:
- case Hexagon::L4_ploadrubf_rr:
- case Hexagon::L4_ploadrht_rr:
- case Hexagon::L4_ploadrhf_rr:
- case Hexagon::L4_ploadruht_rr:
- case Hexagon::L4_ploadruhf_rr:
- case Hexagon::L4_ploadrit_rr:
- case Hexagon::L4_ploadrif_rr:
- return true;
+
+// Return the .cur value instruction for a given load.
+int HexagonInstrInfo::getDotCurOp(const MachineInstr* MI) const {
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Unknown .cur type");
+ case Hexagon::V6_vL32b_pi:
+ return Hexagon::V6_vL32b_cur_pi;
+ case Hexagon::V6_vL32b_ai:
+ return Hexagon::V6_vL32b_cur_ai;
+ //128B
+ case Hexagon::V6_vL32b_pi_128B:
+ return Hexagon::V6_vL32b_cur_pi_128B;
+ case Hexagon::V6_vL32b_ai_128B:
+ return Hexagon::V6_vL32b_cur_ai_128B;
}
+ return 0;
}
-// Returns true if an instruction is a conditional store.
-//
-// Note: It doesn't include conditional new-value stores as they can't be
-// converted to .new predicate.
+
+
+// The diagram below shows the steps involved in the conversion of a predicated
+// store instruction to its .new predicated new-value form.
//
// p.new NV store [ if(p0.new)memw(R0+#0)=R2.new ]
// ^ ^
@@ -1524,8 +2998,6 @@ isConditionalLoad (const MachineInstr* MI) const {
// p.old store
// [if (p0)memw(R0+#0)=R2]
//
-// The above diagram shows the steps involoved in the conversion of a predicated
-// store instruction to its .new predicated new-value form.
//
// The following set of instructions further explains the scenario where
// conditional new-value store becomes invalid when promoted to .new predicate
@@ -1538,105 +3010,33 @@ isConditionalLoad (const MachineInstr* MI) const {
// the first two instructions because in instr 1, r0 is conditional on old value
// of p0 but its use in instr 3 is conditional on p0 modified by instr 2 which
// is not valid for new-value stores.
-bool HexagonInstrInfo::
-isConditionalStore (const MachineInstr* MI) const {
- switch (MI->getOpcode())
- {
- default: return false;
- case Hexagon::S4_storeirbt_io:
- case Hexagon::S4_storeirbf_io:
- case Hexagon::S4_pstorerbt_rr:
- case Hexagon::S4_pstorerbf_rr:
- case Hexagon::S2_pstorerbt_io:
- case Hexagon::S2_pstorerbf_io:
- case Hexagon::S2_pstorerbt_pi:
- case Hexagon::S2_pstorerbf_pi:
- case Hexagon::S2_pstorerdt_io:
- case Hexagon::S2_pstorerdf_io:
- case Hexagon::S4_pstorerdt_rr:
- case Hexagon::S4_pstorerdf_rr:
- case Hexagon::S2_pstorerdt_pi:
- case Hexagon::S2_pstorerdf_pi:
- case Hexagon::S2_pstorerht_io:
- case Hexagon::S2_pstorerhf_io:
- case Hexagon::S4_storeirht_io:
- case Hexagon::S4_storeirhf_io:
- case Hexagon::S4_pstorerht_rr:
- case Hexagon::S4_pstorerhf_rr:
- case Hexagon::S2_pstorerht_pi:
- case Hexagon::S2_pstorerhf_pi:
- case Hexagon::S2_pstorerit_io:
- case Hexagon::S2_pstorerif_io:
- case Hexagon::S4_storeirit_io:
- case Hexagon::S4_storeirif_io:
- case Hexagon::S4_pstorerit_rr:
- case Hexagon::S4_pstorerif_rr:
- case Hexagon::S2_pstorerit_pi:
- case Hexagon::S2_pstorerif_pi:
-
- // V4 global address store before promoting to dot new.
- case Hexagon::S4_pstorerdt_abs:
- case Hexagon::S4_pstorerdf_abs:
- case Hexagon::S4_pstorerbt_abs:
- case Hexagon::S4_pstorerbf_abs:
- case Hexagon::S4_pstorerht_abs:
- case Hexagon::S4_pstorerhf_abs:
- case Hexagon::S4_pstorerit_abs:
- case Hexagon::S4_pstorerif_abs:
- return true;
-
- // Predicated new value stores (i.e. if (p0) memw(..)=r0.new) are excluded
- // from the "Conditional Store" list. Because a predicated new value store
- // would NOT be promoted to a double dot new store. See diagram below:
- // This function returns yes for those stores that are predicated but not
- // yet promoted to predicate dot new instructions.
- //
- // +---------------------+
- // /-----| if (p0) memw(..)=r0 |---------\~
- // || +---------------------+ ||
- // promote || /\ /\ || promote
- // || /||\ /||\ ||
- // \||/ demote || \||/
- // \/ || || \/
- // +-------------------------+ || +-------------------------+
- // | if (p0.new) memw(..)=r0 | || | if (p0) memw(..)=r0.new |
- // +-------------------------+ || +-------------------------+
- // || || ||
- // || demote \||/
- // promote || \/ NOT possible
- // || || /\~
- // \||/ || /||\~
- // \/ || ||
- // +-----------------------------+
- // | if (p0.new) memw(..)=r0.new |
- // +-----------------------------+
- // Double Dot New Store
- //
- }
-}
-
-
-bool HexagonInstrInfo::isNewValueJump(const MachineInstr *MI) const {
- if (isNewValue(MI) && isBranch(MI))
- return true;
- return false;
-}
-
-bool HexagonInstrInfo::isNewValueJump(Opcode_t Opcode) const {
- return isNewValue(Opcode) && get(Opcode).isBranch() && isPredicated(Opcode);
-}
-
-bool HexagonInstrInfo::isPostIncrement (const MachineInstr* MI) const {
- return (getAddrMode(MI) == HexagonII::PostInc);
-}
-
-// Returns true, if any one of the operands is a dot new
-// insn, whether it is predicated dot new or register dot new.
-bool HexagonInstrInfo::isDotNewInst (const MachineInstr* MI) const {
- return (isNewValueInst(MI) ||
- (isPredicated(MI) && isPredicatedNew(MI)));
-}
-
+// Predicated new-value stores (i.e. if (p0) memw(..)=r0.new) are excluded
+// from the "Conditional Store" list because a predicated new-value store
+// would NOT be promoted to a double dot-new store; see the diagram below.
+// This returns true for stores that are predicated but not yet promoted
+// to predicate dot-new instructions.
+//
+// +---------------------+
+// /-----| if (p0) memw(..)=r0 |---------\~
+// || +---------------------+ ||
+// promote || /\ /\ || promote
+// || /||\ /||\ ||
+// \||/ demote || \||/
+// \/ || || \/
+// +-------------------------+ || +-------------------------+
+// | if (p0.new) memw(..)=r0 | || | if (p0) memw(..)=r0.new |
+// +-------------------------+ || +-------------------------+
+// || || ||
+// || demote \||/
+// promote || \/ NOT possible
+// || || /\~
+// \||/ || /||\~
+// \/ || ||
+// +-----------------------------+
+// | if (p0.new) memw(..)=r0.new |
+// +-----------------------------+
+// Double Dot New Store
+//
// Returns the most basic instruction for the .new predicated instructions and
// new-value stores.
// For example, all of the following instructions will be converted back to the
@@ -1645,24 +3045,23 @@ bool HexagonInstrInfo::isDotNewInst (const MachineInstr* MI) const {
// 2) if (p0) memw(R0+#0)= R1.new -------> if (p0) memw(R0+#0) = R1
// 3) if (p0.new) memw(R0+#0) = R1 --->
//
+// To understand the translation of instruction 1 to its original form, consider
+// a packet with 3 instructions.
+// { p0 = cmp.eq(R0,R1)
+// if (p0.new) R2 = add(R3, R4)
+// R5 = add (R3, R1)
+// }
+// if (p0) memw(R5+#0) = R2 <--- trying to include it in the previous packet
+//
+// This instruction can be part of the previous packet only if both p0 and R2
+// are promoted to .new values. This promotion happens in steps: first the
+// predicate register is promoted to .new, and in the next iteration R2 is
+// promoted. Therefore, if the dependence check fails (due to R5) during the
+// next iteration, the instruction should be converted back to its most basic
+// form.
-int HexagonInstrInfo::GetDotOldOp(const int opc) const {
- int NewOp = opc;
- if (isPredicated(NewOp) && isPredicatedNew(NewOp)) { // Get predicate old form
- NewOp = Hexagon::getPredOldOpcode(NewOp);
- assert(NewOp >= 0 &&
- "Couldn't change predicate new instruction to its old form.");
- }
-
- if (isNewValueStore(NewOp)) { // Convert into non-new-value format
- NewOp = Hexagon::getNonNVStore(NewOp);
- assert(NewOp >= 0 && "Couldn't change new-value store to its old form.");
- }
- return NewOp;
-}
// Return the new value instruction for a given store.
-int HexagonInstrInfo::GetDotNewOp(const MachineInstr* MI) const {
+int HexagonInstrInfo::getDotNewOp(const MachineInstr* MI) const {
int NVOpcode = Hexagon::getNewValueOpcode(MI->getOpcode());
if (NVOpcode >= 0) // Valid new-value store instruction.
return NVOpcode;
@@ -1672,12 +3071,6 @@ int HexagonInstrInfo::GetDotNewOp(const MachineInstr* MI) const {
case Hexagon::S4_storerb_ur:
return Hexagon::S4_storerbnew_ur;
- case Hexagon::S4_storerh_ur:
- return Hexagon::S4_storerhnew_ur;
-
- case Hexagon::S4_storeri_ur:
- return Hexagon::S4_storerinew_ur;
-
case Hexagon::S2_storerb_pci:
return Hexagon::S2_storerb_pci;
@@ -1692,203 +3085,496 @@ int HexagonInstrInfo::GetDotNewOp(const MachineInstr* MI) const {
case Hexagon::S2_storerf_pci:
return Hexagon::S2_storerf_pci;
+
+ case Hexagon::V6_vS32b_ai:
+ return Hexagon::V6_vS32b_new_ai;
+
+ case Hexagon::V6_vS32b_pi:
+ return Hexagon::V6_vS32b_new_pi;
+
+ // 128B
+ case Hexagon::V6_vS32b_ai_128B:
+ return Hexagon::V6_vS32b_new_ai_128B;
+
+ case Hexagon::V6_vS32b_pi_128B:
+ return Hexagon::V6_vS32b_new_pi_128B;
}
return 0;
}
-// Return .new predicate version for an instruction.
-int HexagonInstrInfo::GetDotNewPredOp(MachineInstr *MI,
- const MachineBranchProbabilityInfo
- *MBPI) const {
+// Returns the opcode to use when converting MI, which is a conditional jump,
+// into a conditional instruction which uses the .new value of the predicate.
+// We also use branch probabilities to add a hint to the jump.
+int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr *MI,
+ const MachineBranchProbabilityInfo *MBPI) const {
+ // We assume that the block can have at most two successors.
+ bool taken = false;
+ const MachineBasicBlock *Src = MI->getParent();
+ const MachineOperand *BrTarget = &MI->getOperand(1);
+ const MachineBasicBlock *Dst = BrTarget->getMBB();
+ const BranchProbability Prediction = MBPI->getEdgeProbability(Src, Dst);
+ if (Prediction >= BranchProbability(1,2))
+ taken = true;
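+ // An edge probability of at least 1/2 predicts the jump taken, which
+ // selects the *newpt ("predict taken") opcodes below.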
+
+ switch (MI->getOpcode()) {
+ case Hexagon::J2_jumpt:
+ return taken ? Hexagon::J2_jumptnewpt : Hexagon::J2_jumptnew;
+ case Hexagon::J2_jumpf:
+ return taken ? Hexagon::J2_jumpfnewpt : Hexagon::J2_jumpfnew;
+
+ default:
+ llvm_unreachable("Unexpected jump instruction.");
+ }
+}
+
+
+// Return .new predicate version for an instruction.
+int HexagonInstrInfo::getDotNewPredOp(const MachineInstr *MI,
+ const MachineBranchProbabilityInfo *MBPI) const {
int NewOpcode = Hexagon::getPredNewOpcode(MI->getOpcode());
if (NewOpcode >= 0) // Valid predicate new instruction
return NewOpcode;
switch (MI->getOpcode()) {
- default: llvm_unreachable("Unknown .new type");
  // Conditional Jumps
case Hexagon::J2_jumpt:
case Hexagon::J2_jumpf:
return getDotNewPredJumpOp(MI, MBPI);
- case Hexagon::J2_jumprt:
- return Hexagon::J2_jumptnewpt;
-
- case Hexagon::J2_jumprf:
- return Hexagon::J2_jumprfnewpt;
-
- case Hexagon::JMPrett:
- return Hexagon::J2_jumprtnewpt;
-
- case Hexagon::JMPretf:
- return Hexagon::J2_jumprfnewpt;
-
-
- // Conditional combine
- case Hexagon::C2_ccombinewt:
- return Hexagon::C2_ccombinewnewt;
- case Hexagon::C2_ccombinewf:
- return Hexagon::C2_ccombinewnewf;
+ default:
+ assert(0 && "Unknown .new type");
}
+ return 0;
}
-unsigned HexagonInstrInfo::getAddrMode(const MachineInstr* MI) const {
- const uint64_t F = MI->getDesc().TSFlags;
+int HexagonInstrInfo::getDotOldOp(const int opc) const {
+ int NewOp = opc;
+ if (isPredicated(NewOp) && isPredicatedNew(NewOp)) { // Get predicate old form
+ NewOp = Hexagon::getPredOldOpcode(NewOp);
+ assert(NewOp >= 0 &&
+ "Couldn't change predicate new instruction to its old form.");
+ }
- return((F >> HexagonII::AddrModePos) & HexagonII::AddrModeMask);
+ if (isNewValueStore(NewOp)) { // Convert into non-new-value format
+ NewOp = Hexagon::getNonNVStore(NewOp);
+ assert(NewOp >= 0 && "Couldn't change new-value store to its old form.");
+ }
+ return NewOp;
}
-/// immediateExtend - Changes the instruction in place to one using an immediate
-/// extender.
-void HexagonInstrInfo::immediateExtend(MachineInstr *MI) const {
- assert((isExtendable(MI)||isConstExtended(MI)) &&
- "Instruction must be extendable");
- // Find which operand is extendable.
- short ExtOpNum = getCExtOpNum(MI);
- MachineOperand &MO = MI->getOperand(ExtOpNum);
- // This needs to be something we understand.
- assert((MO.isMBB() || MO.isImm()) &&
- "Branch with unknown extendable field type");
- // Mark given operand as extended.
- MO.addTargetFlag(HexagonII::HMOTF_ConstExtended);
-}
-DFAPacketizer *HexagonInstrInfo::CreateTargetScheduleState(
- const TargetSubtargetInfo &STI) const {
- const InstrItineraryData *II = STI.getInstrItineraryData();
- return static_cast<const HexagonSubtarget &>(STI).createDFAPacketizer(II);
-}
-
-bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
- const MachineBasicBlock *MBB,
- const MachineFunction &MF) const {
- // Debug info is never a scheduling boundary. It's necessary to be explicit
- // due to the special treatment of IT instructions below, otherwise a
- // dbg_value followed by an IT will result in the IT instruction being
- // considered a scheduling hazard, which is wrong. It should be the actual
- // instruction preceding the dbg_value instruction(s), just like it is
- // when debug info is not present.
- if (MI->isDebugValue())
- return false;
+// See if instruction could potentially be a duplex candidate.
+// If so, return its group. Zero otherwise.
+HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
+ const MachineInstr *MI) const {
+ unsigned DstReg, SrcReg, Src1Reg, Src2Reg;
+ auto &HRI = getRegisterInfo();
- // Terminators and labels can't be scheduled around.
- if (MI->getDesc().isTerminator() || MI->isPosition() || MI->isInlineAsm())
- return true;
+ switch (MI->getOpcode()) {
+ default:
+ return HexagonII::HSIG_None;
+ //
+ // Group L1:
+ //
+ // Rd = memw(Rs+#u4:2)
+ // Rd = memub(Rs+#u4:0)
+ case Hexagon::L2_loadri_io:
+ DstReg = MI->getOperand(0).getReg();
+ SrcReg = MI->getOperand(1).getReg();
+ // Special case this one from Group L2.
+ // Rd = memw(r29+#u5:2)
+ if (isIntRegForSubInst(DstReg)) {
+ if (Hexagon::IntRegsRegClass.contains(SrcReg) &&
+ HRI.getStackRegister() == SrcReg &&
+ MI->getOperand(2).isImm() &&
+ isShiftedUInt<5,2>(MI->getOperand(2).getImm()))
+ return HexagonII::HSIG_L2;
+ // Rd = memw(Rs+#u4:2)
+ if (isIntRegForSubInst(SrcReg) &&
+ (MI->getOperand(2).isImm() &&
+ isShiftedUInt<4,2>(MI->getOperand(2).getImm())))
+ return HexagonII::HSIG_L1;
+ }
+ break;
+ case Hexagon::L2_loadrub_io:
+ // Rd = memub(Rs+#u4:0)
+ DstReg = MI->getOperand(0).getReg();
+ SrcReg = MI->getOperand(1).getReg();
+ if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg) &&
+ MI->getOperand(2).isImm() && isUInt<4>(MI->getOperand(2).getImm()))
+ return HexagonII::HSIG_L1;
+ break;
+ //
+ // Group L2:
+ //
+ // Rd = memh/memuh(Rs+#u3:1)
+ // Rd = memb(Rs+#u3:0)
+ // Rd = memw(r29+#u5:2) - Handled above.
+ // Rdd = memd(r29+#u5:3)
+ // deallocframe
+ // [if ([!]p0[.new])] dealloc_return
+ // [if ([!]p0[.new])] jumpr r31
+ case Hexagon::L2_loadrh_io:
+ case Hexagon::L2_loadruh_io:
+ // Rd = memh/memuh(Rs+#u3:1)
+ DstReg = MI->getOperand(0).getReg();
+ SrcReg = MI->getOperand(1).getReg();
+ if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg) &&
+ MI->getOperand(2).isImm() &&
+ isShiftedUInt<3,1>(MI->getOperand(2).getImm()))
+ return HexagonII::HSIG_L2;
+ break;
+ case Hexagon::L2_loadrb_io:
+ // Rd = memb(Rs+#u3:0)
+ DstReg = MI->getOperand(0).getReg();
+ SrcReg = MI->getOperand(1).getReg();
+ if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg) &&
+ MI->getOperand(2).isImm() &&
+ isUInt<3>(MI->getOperand(2).getImm()))
+ return HexagonII::HSIG_L2;
+ break;
+ case Hexagon::L2_loadrd_io:
+ // Rdd = memd(r29+#u5:3)
+ DstReg = MI->getOperand(0).getReg();
+ SrcReg = MI->getOperand(1).getReg();
+ if (isDblRegForSubInst(DstReg, HRI) &&
+ Hexagon::IntRegsRegClass.contains(SrcReg) &&
+ HRI.getStackRegister() == SrcReg &&
+ MI->getOperand(2).isImm() &&
+ isShiftedUInt<5,3>(MI->getOperand(2).getImm()))
+ return HexagonII::HSIG_L2;
+ break;
+ // dealloc_return is not documented in Hexagon Manual, but marked
+ // with A_SUBINSN attribute in iset_v4classic.py.
+ case Hexagon::RESTORE_DEALLOC_RET_JMP_V4:
+ case Hexagon::L4_return:
+ case Hexagon::L2_deallocframe:
+ return HexagonII::HSIG_L2;
+ case Hexagon::EH_RETURN_JMPR:
+ case Hexagon::JMPret :
+ // jumpr r31
+ // Actual form JMPR %PC<imp-def>, %R31<imp-use>, %R0<imp-use,internal>.
+ DstReg = MI->getOperand(0).getReg();
+ if (Hexagon::IntRegsRegClass.contains(DstReg) && (Hexagon::R31 == DstReg))
+ return HexagonII::HSIG_L2;
+ break;
+ case Hexagon::JMPrett:
+ case Hexagon::JMPretf:
+ case Hexagon::JMPrettnewpt:
+ case Hexagon::JMPretfnewpt :
+ case Hexagon::JMPrettnew :
+ case Hexagon::JMPretfnew :
+ DstReg = MI->getOperand(1).getReg();
+ SrcReg = MI->getOperand(0).getReg();
+ // [if ([!]p0[.new])] jumpr r31
+ if ((Hexagon::PredRegsRegClass.contains(SrcReg) &&
+ (Hexagon::P0 == SrcReg)) &&
+ (Hexagon::IntRegsRegClass.contains(DstReg) && (Hexagon::R31 == DstReg)))
+ return HexagonII::HSIG_L2;
+ break;
+ case Hexagon::L4_return_t :
+ case Hexagon::L4_return_f :
+ case Hexagon::L4_return_tnew_pnt :
+ case Hexagon::L4_return_fnew_pnt :
+ case Hexagon::L4_return_tnew_pt :
+ case Hexagon::L4_return_fnew_pt :
+ // [if ([!]p0[.new])] dealloc_return
+ SrcReg = MI->getOperand(0).getReg();
+ if (Hexagon::PredRegsRegClass.contains(SrcReg) && (Hexagon::P0 == SrcReg))
+ return HexagonII::HSIG_L2;
+ break;
+ //
+ // Group S1:
+ //
+ // memw(Rs+#u4:2) = Rt
+ // memb(Rs+#u4:0) = Rt
+ case Hexagon::S2_storeri_io:
+ // Special case this one from Group S2.
+ // memw(r29+#u5:2) = Rt
+ Src1Reg = MI->getOperand(0).getReg();
+ Src2Reg = MI->getOperand(2).getReg();
+ if (Hexagon::IntRegsRegClass.contains(Src1Reg) &&
+ isIntRegForSubInst(Src2Reg) &&
+ HRI.getStackRegister() == Src1Reg && MI->getOperand(1).isImm() &&
+ isShiftedUInt<5,2>(MI->getOperand(1).getImm()))
+ return HexagonII::HSIG_S2;
+ // memw(Rs+#u4:2) = Rt
+ if (isIntRegForSubInst(Src1Reg) && isIntRegForSubInst(Src2Reg) &&
+ MI->getOperand(1).isImm() &&
+ isShiftedUInt<4,2>(MI->getOperand(1).getImm()))
+ return HexagonII::HSIG_S1;
+ break;
+ case Hexagon::S2_storerb_io:
+ // memb(Rs+#u4:0) = Rt
+ Src1Reg = MI->getOperand(0).getReg();
+ Src2Reg = MI->getOperand(2).getReg();
+ if (isIntRegForSubInst(Src1Reg) && isIntRegForSubInst(Src2Reg) &&
+ MI->getOperand(1).isImm() && isUInt<4>(MI->getOperand(1).getImm()))
+ return HexagonII::HSIG_S1;
+ break;
+ //
+ // Group S2:
+ //
+ // memh(Rs+#u3:1) = Rt
+ // memw(r29+#u5:2) = Rt
+ // memd(r29+#s6:3) = Rtt
+ // memw(Rs+#u4:2) = #U1
+ // memb(Rs+#u4) = #U1
+ // allocframe(#u5:3)
+ case Hexagon::S2_storerh_io:
+ // memh(Rs+#u3:1) = Rt
+ Src1Reg = MI->getOperand(0).getReg();
+ Src2Reg = MI->getOperand(2).getReg();
+ if (isIntRegForSubInst(Src1Reg) && isIntRegForSubInst(Src2Reg) &&
+ MI->getOperand(1).isImm() &&
+ isShiftedUInt<3,1>(MI->getOperand(1).getImm()))
+ return HexagonII::HSIG_S1;
+ break;
+ case Hexagon::S2_storerd_io:
+ // memd(r29+#s6:3) = Rtt
+ Src1Reg = MI->getOperand(0).getReg();
+ Src2Reg = MI->getOperand(2).getReg();
+ if (isDblRegForSubInst(Src2Reg, HRI) &&
+ Hexagon::IntRegsRegClass.contains(Src1Reg) &&
+ HRI.getStackRegister() == Src1Reg && MI->getOperand(1).isImm() &&
+ isShiftedInt<6,3>(MI->getOperand(1).getImm()))
+ return HexagonII::HSIG_S2;
+ break;
+ case Hexagon::S4_storeiri_io:
+ // memw(Rs+#u4:2) = #U1
+ Src1Reg = MI->getOperand(0).getReg();
+ if (isIntRegForSubInst(Src1Reg) && MI->getOperand(1).isImm() &&
+ isShiftedUInt<4,2>(MI->getOperand(1).getImm()) &&
+ MI->getOperand(2).isImm() && isUInt<1>(MI->getOperand(2).getImm()))
+ return HexagonII::HSIG_S2;
+ break;
+ case Hexagon::S4_storeirb_io:
+ // memb(Rs+#u4) = #U1
+ Src1Reg = MI->getOperand(0).getReg();
+ if (isIntRegForSubInst(Src1Reg) && MI->getOperand(1).isImm() &&
+ isUInt<4>(MI->getOperand(1).getImm()) &&
+ MI->getOperand(2).isImm() && isUInt<1>(MI->getOperand(2).getImm()))
+ return HexagonII::HSIG_S2;
+ break;
+ case Hexagon::S2_allocframe:
+ if (MI->getOperand(0).isImm() &&
+ isShiftedUInt<5,3>(MI->getOperand(0).getImm()))
+ return HexagonII::HSIG_S1;
+ break;
+ //
+ // Group A:
+ //
+ // Rx = add(Rx,#s7)
+ // Rd = Rs
+ // Rd = #u6
+ // Rd = #-1
+ // if ([!]P0[.new]) Rd = #0
+ // Rd = add(r29,#u6:2)
+ // Rx = add(Rx,Rs)
+ // P0 = cmp.eq(Rs,#u2)
+ // Rdd = combine(#0,Rs)
+ // Rdd = combine(Rs,#0)
+ // Rdd = combine(#u2,#U2)
+ // Rd = add(Rs,#1)
+ // Rd = add(Rs,#-1)
+ // Rd = sxth/sxtb/zxtb/zxth(Rs)
+ // Rd = and(Rs,#1)
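+ // In all of the forms above the registers must satisfy isIntRegForSubInst
+ // (or isDblRegForSubInst for register pairs) to fit in a sub-instruction.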
+ case Hexagon::A2_addi:
+ DstReg = MI->getOperand(0).getReg();
+ SrcReg = MI->getOperand(1).getReg();
+ if (isIntRegForSubInst(DstReg)) {
+ // Rd = add(r29,#u6:2)
+ if (Hexagon::IntRegsRegClass.contains(SrcReg) &&
+ HRI.getStackRegister() == SrcReg && MI->getOperand(2).isImm() &&
+ isShiftedUInt<6,2>(MI->getOperand(2).getImm()))
+ return HexagonII::HSIG_A;
+ // Rx = add(Rx,#s7)
+ if ((DstReg == SrcReg) && MI->getOperand(2).isImm() &&
+ isInt<7>(MI->getOperand(2).getImm()))
+ return HexagonII::HSIG_A;
+ // Rd = add(Rs,#1)
+ // Rd = add(Rs,#-1)
+ if (isIntRegForSubInst(SrcReg) && MI->getOperand(2).isImm() &&
+ ((MI->getOperand(2).getImm() == 1) ||
+ (MI->getOperand(2).getImm() == -1)))
+ return HexagonII::HSIG_A;
+ }
+ break;
+ case Hexagon::A2_add:
+ // Rx = add(Rx,Rs)
+ DstReg = MI->getOperand(0).getReg();
+ Src1Reg = MI->getOperand(1).getReg();
+ Src2Reg = MI->getOperand(2).getReg();
+ if (isIntRegForSubInst(DstReg) && (DstReg == Src1Reg) &&
+ isIntRegForSubInst(Src2Reg))
+ return HexagonII::HSIG_A;
+ break;
+ case Hexagon::A2_andir:
+ // Same as zxtb.
+ // Rd16=and(Rs16,#255)
+ // Rd16=and(Rs16,#1)
+ DstReg = MI->getOperand(0).getReg();
+ SrcReg = MI->getOperand(1).getReg();
+ if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg) &&
+ MI->getOperand(2).isImm() &&
+ ((MI->getOperand(2).getImm() == 1) ||
+ (MI->getOperand(2).getImm() == 255)))
+ return HexagonII::HSIG_A;
+ break;
+ case Hexagon::A2_tfr:
+ // Rd = Rs
+ DstReg = MI->getOperand(0).getReg();
+ SrcReg = MI->getOperand(1).getReg();
+ if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg))
+ return HexagonII::HSIG_A;
+ break;
+ case Hexagon::A2_tfrsi:
+ // Rd = #u6
+ // Do not test for #u6 size since the const is getting extended
+ // regardless and a duplex could be formed.
+ // Rd = #-1
+ DstReg = MI->getOperand(0).getReg();
+ if (isIntRegForSubInst(DstReg))
+ return HexagonII::HSIG_A;
+ break;
+ case Hexagon::C2_cmoveit:
+ case Hexagon::C2_cmovenewit:
+ case Hexagon::C2_cmoveif:
+ case Hexagon::C2_cmovenewif:
+ // if ([!]P0[.new]) Rd = #0
+ // Actual form:
+ // %R16<def> = C2_cmovenewit %P0<internal>, 0, %R16<imp-use,undef>;
+ DstReg = MI->getOperand(0).getReg();
+ SrcReg = MI->getOperand(1).getReg();
+ if (isIntRegForSubInst(DstReg) &&
+ Hexagon::PredRegsRegClass.contains(SrcReg) && Hexagon::P0 == SrcReg &&
+ MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0)
+ return HexagonII::HSIG_A;
+ break;
+ case Hexagon::C2_cmpeqi:
+ // P0 = cmp.eq(Rs,#u2)
+ DstReg = MI->getOperand(0).getReg();
+ SrcReg = MI->getOperand(1).getReg();
+ if (Hexagon::PredRegsRegClass.contains(DstReg) &&
+ Hexagon::P0 == DstReg && isIntRegForSubInst(SrcReg) &&
+ MI->getOperand(2).isImm() && isUInt<2>(MI->getOperand(2).getImm()))
+ return HexagonII::HSIG_A;
+ break;
+ case Hexagon::A2_combineii:
+ case Hexagon::A4_combineii:
+ // Rdd = combine(#u2,#U2)
+ DstReg = MI->getOperand(0).getReg();
+ if (isDblRegForSubInst(DstReg, HRI) &&
+ ((MI->getOperand(1).isImm() && isUInt<2>(MI->getOperand(1).getImm())) ||
+ (MI->getOperand(1).isGlobal() &&
+ isUInt<2>(MI->getOperand(1).getOffset()))) &&
+ ((MI->getOperand(2).isImm() && isUInt<2>(MI->getOperand(2).getImm())) ||
+ (MI->getOperand(2).isGlobal() &&
+ isUInt<2>(MI->getOperand(2).getOffset()))))
+ return HexagonII::HSIG_A;
+ break;
+ case Hexagon::A4_combineri:
+ // Rdd = combine(Rs,#0)
+ DstReg = MI->getOperand(0).getReg();
+ SrcReg = MI->getOperand(1).getReg();
+ if (isDblRegForSubInst(DstReg, HRI) && isIntRegForSubInst(SrcReg) &&
+ ((MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) ||
+ (MI->getOperand(2).isGlobal() && MI->getOperand(2).getOffset() == 0)))
+ return HexagonII::HSIG_A;
+ break;
+ case Hexagon::A4_combineir:
+ // Rdd = combine(#0,Rs)
+ DstReg = MI->getOperand(0).getReg();
+ SrcReg = MI->getOperand(2).getReg();
+ if (isDblRegForSubInst(DstReg, HRI) && isIntRegForSubInst(SrcReg) &&
+ ((MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0) ||
+ (MI->getOperand(1).isGlobal() && MI->getOperand(1).getOffset() == 0)))
+ return HexagonII::HSIG_A;
+ break;
+ case Hexagon::A2_sxtb:
+ case Hexagon::A2_sxth:
+ case Hexagon::A2_zxtb:
+ case Hexagon::A2_zxth:
+ // Rd = sxth/sxtb/zxtb/zxth(Rs)
+ DstReg = MI->getOperand(0).getReg();
+ SrcReg = MI->getOperand(1).getReg();
+ if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg))
+ return HexagonII::HSIG_A;
+ break;
+ }
- return false;
+ return HexagonII::HSIG_None;
}
-bool HexagonInstrInfo::isConstExtended(const MachineInstr *MI) const {
- const uint64_t F = MI->getDesc().TSFlags;
- unsigned isExtended = (F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask;
- if (isExtended) // Instruction must be extended.
- return true;
- unsigned isExtendable =
- (F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask;
- if (!isExtendable)
- return false;
-
- short ExtOpNum = getCExtOpNum(MI);
- const MachineOperand &MO = MI->getOperand(ExtOpNum);
- // Use MO operand flags to determine if MO
- // has the HMOTF_ConstExtended flag set.
- if (MO.getTargetFlags() && HexagonII::HMOTF_ConstExtended)
- return true;
- // If this is a Machine BB address we are talking about, and it is
- // not marked as extended, say so.
- if (MO.isMBB())
- return false;
-
- // We could be using an instruction with an extendable immediate and shoehorn
- // a global address into it. If it is a global address it will be constant
- // extended. We do this for COMBINE.
- // We currently only handle isGlobal() because it is the only kind of
- // object we are going to end up with here for now.
- // In the future we probably should add isSymbol(), etc.
- if (MO.isGlobal() || MO.isSymbol() || MO.isBlockAddress() ||
- MO.isJTI() || MO.isCPI())
- return true;
-
- // If the extendable operand is not 'Immediate' type, the instruction should
- // have 'isExtended' flag set.
- assert(MO.isImm() && "Extendable operand must be Immediate type");
+short HexagonInstrInfo::getEquivalentHWInstr(const MachineInstr *MI) const {
+ return Hexagon::getRealHWInstr(MI->getOpcode(), Hexagon::InstrType_Real);
+}
- int MinValue = getMinValue(MI);
- int MaxValue = getMaxValue(MI);
- int ImmValue = MO.getImm();
- return (ImmValue < MinValue || ImmValue > MaxValue);
+// Return first non-debug instruction in the basic block.
+MachineInstr *HexagonInstrInfo::getFirstNonDbgInst(MachineBasicBlock *BB)
+ const {
+ for (auto MII = BB->instr_begin(), End = BB->instr_end(); MII != End; MII++) {
+ MachineInstr *MI = &*MII;
+ if (MI->isDebugValue())
+ continue;
+ return MI;
+ }
+ return nullptr;
}
-// Return the number of bytes required to encode the instruction.
-// Hexagon instructions are fixed length, 4 bytes, unless they
-// use a constant extender, which requires another 4 bytes.
-// For debug instructions and prolog labels, return 0.
-unsigned HexagonInstrInfo::getSize(const MachineInstr *MI) const {
- if (MI->isDebugValue() || MI->isPosition())
- return 0;
+unsigned HexagonInstrInfo::getInstrTimingClassLatency(
+ const InstrItineraryData *ItinData, const MachineInstr *MI) const {
+ // Default to one cycle for no itinerary. However, an "empty" itinerary may
+ // still have a MinLatency property, which getStageLatency checks.
+ if (!ItinData)
+ return getInstrLatency(ItinData, MI);
- unsigned Size = MI->getDesc().getSize();
- if (!Size)
- // Assume the default insn size in case it cannot be determined
- // for whatever reason.
- Size = HEXAGON_INSTR_SIZE;
-
- if (isConstExtended(MI) || isExtended(MI))
- Size += HEXAGON_INSTR_SIZE;
-
- return Size;
+ // Get the latency embedded in the itinerary. If we're not using timing class
+ // latencies or if we are using BSB scheduling, then restrict the maximum latency
+ // to 1 (that is, either 0 or 1).
+ if (MI->isTransient())
+ return 0;
+ unsigned Latency = ItinData->getStageLatency(MI->getDesc().getSchedClass());
+ if (!EnableTimingClassLatency ||
+ MI->getParent()->getParent()->getSubtarget<HexagonSubtarget>().
+ useBSBScheduling())
+ if (Latency > 1)
+ Latency = 1;
+ return Latency;
}
-// Returns the opcode to use when converting MI, which is a conditional jump,
-// into a conditional instruction which uses the .new value of the predicate.
-// We also use branch probabilities to add a hint to the jump.
-int
-HexagonInstrInfo::getDotNewPredJumpOp(MachineInstr *MI,
- const
- MachineBranchProbabilityInfo *MBPI) const {
-
- // We assume that block can have at most two successors.
- bool taken = false;
- MachineBasicBlock *Src = MI->getParent();
- MachineOperand *BrTarget = &MI->getOperand(1);
- MachineBasicBlock *Dst = BrTarget->getMBB();
- const BranchProbability Prediction = MBPI->getEdgeProbability(Src, Dst);
- if (Prediction >= BranchProbability(1,2))
- taken = true;
+// inverts the predication logic.
+// p -> NotP
+// NotP -> P
+bool HexagonInstrInfo::getInvertedPredSense(
+ SmallVectorImpl<MachineOperand> &Cond) const {
+ if (Cond.empty())
+ return false;
+ unsigned Opc = getInvertedPredicatedOpcode(Cond[0].getImm());
+ Cond[0].setImm(Opc);
+ return true;
+}
- switch (MI->getOpcode()) {
- case Hexagon::J2_jumpt:
- return taken ? Hexagon::J2_jumptnewpt : Hexagon::J2_jumptnew;
- case Hexagon::J2_jumpf:
- return taken ? Hexagon::J2_jumpfnewpt : Hexagon::J2_jumpfnew;
- default:
- llvm_unreachable("Unexpected jump instruction.");
- }
-}
-// Returns true if a particular operand is extendable for an instruction.
-bool HexagonInstrInfo::isOperandExtended(const MachineInstr *MI,
- unsigned short OperandNum) const {
- const uint64_t F = MI->getDesc().TSFlags;
+unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const {
+ int InvPredOpcode;
+ InvPredOpcode = isPredicatedTrue(Opc) ? Hexagon::getFalsePredOpcode(Opc)
+ : Hexagon::getTruePredOpcode(Opc);
+ if (InvPredOpcode >= 0) // Valid instruction with the inverted predicate.
+ return InvPredOpcode;
- return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask)
- == OperandNum;
+ llvm_unreachable("Unexpected predicated instruction");
}
-// Returns Operand Index for the constant extended instruction.
-unsigned short HexagonInstrInfo::getCExtOpNum(const MachineInstr *MI) const {
- const uint64_t F = MI->getDesc().TSFlags;
- return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask);
-}
-// Returns the min value that doesn't need to be extended.
-int HexagonInstrInfo::getMinValue(const MachineInstr *MI) const {
+// Returns the max value that doesn't need to be extended.
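+// For example, with an 8-bit extent the maximum is 127 when the extent is
+// signed and 255 when it is unsigned.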
+int HexagonInstrInfo::getMaxValue(const MachineInstr *MI) const {
const uint64_t F = MI->getDesc().TSFlags;
unsigned isSigned = (F >> HexagonII::ExtentSignedPos)
& HexagonII::ExtentSignedMask;
@@ -1896,13 +3582,20 @@ int HexagonInstrInfo::getMinValue(const MachineInstr *MI) const {
& HexagonII::ExtentBitsMask;
if (isSigned) // if value is signed
- return -1U << (bits - 1);
+ return ~(-1U << (bits - 1));
else
- return 0;
+ return ~(-1U << bits);
}
-// Returns the max value that doesn't need to be extended.
-int HexagonInstrInfo::getMaxValue(const MachineInstr *MI) const {
+
+unsigned HexagonInstrInfo::getMemAccessSize(const MachineInstr* MI) const {
+ const uint64_t F = MI->getDesc().TSFlags;
+ return (F >> HexagonII::MemAccessSizePos) & HexagonII::MemAccesSizeMask;
+}
+
+
+// Returns the min value that doesn't need to be extended.
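+// For example, with an 8-bit extent the minimum is -128 when the extent is
+// signed and 0 when it is unsigned.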
+int HexagonInstrInfo::getMinValue(const MachineInstr *MI) const {
const uint64_t F = MI->getDesc().TSFlags;
unsigned isSigned = (F >> HexagonII::ExtentSignedPos)
& HexagonII::ExtentSignedMask;
@@ -1910,49 +3603,14 @@ int HexagonInstrInfo::getMaxValue(const MachineInstr *MI) const {
& HexagonII::ExtentBitsMask;
if (isSigned) // if value is signed
- return ~(-1U << (bits - 1));
+ return -1U << (bits - 1);
else
- return ~(-1U << bits);
+ return 0;
}
-// Returns true if an instruction can be converted into a non-extended
-// equivalent instruction.
-bool HexagonInstrInfo::NonExtEquivalentExists (const MachineInstr *MI) const {
-
- short NonExtOpcode;
- // Check if the instruction has a register form that uses register in place
- // of the extended operand, if so return that as the non-extended form.
- if (Hexagon::getRegForm(MI->getOpcode()) >= 0)
- return true;
-
- if (MI->getDesc().mayLoad() || MI->getDesc().mayStore()) {
- // Check addressing mode and retrieve non-ext equivalent instruction.
-
- switch (getAddrMode(MI)) {
- case HexagonII::Absolute :
- // Load/store with absolute addressing mode can be converted into
- // base+offset mode.
- NonExtOpcode = Hexagon::getBasedWithImmOffset(MI->getOpcode());
- break;
- case HexagonII::BaseImmOffset :
- // Load/store with base+offset addressing mode can be converted into
- // base+register offset addressing mode. However left shift operand should
- // be set to 0.
- NonExtOpcode = Hexagon::getBaseWithRegOffset(MI->getOpcode());
- break;
- default:
- return false;
- }
- if (NonExtOpcode < 0)
- return false;
- return true;
- }
- return false;
-}
// Returns opcode of the non-extended equivalent instruction.
-short HexagonInstrInfo::getNonExtOpcode (const MachineInstr *MI) const {
-
+short HexagonInstrInfo::getNonExtOpcode(const MachineInstr *MI) const {
// Check if the instruction has a register form that uses register in place
// of the extended operand, if so return that as the non-extended form.
short NonExtOpcode = Hexagon::getRegForm(MI->getOpcode());
@@ -1963,9 +3621,12 @@ short HexagonInstrInfo::getNonExtOpcode (const MachineInstr *MI) const {
// Check addressing mode and retrieve non-ext equivalent instruction.
switch (getAddrMode(MI)) {
case HexagonII::Absolute :
- return Hexagon::getBasedWithImmOffset(MI->getOpcode());
+ return Hexagon::getBaseWithImmOffset(MI->getOpcode());
case HexagonII::BaseImmOffset :
return Hexagon::getBaseWithRegOffset(MI->getOpcode());
+ case HexagonII::BaseLongOffset:
+ return Hexagon::getRegShlForm(MI->getOpcode());
+
default:
return -1;
}
@@ -1973,29 +3634,9 @@ short HexagonInstrInfo::getNonExtOpcode (const MachineInstr *MI) const {
return -1;
}
-bool HexagonInstrInfo::PredOpcodeHasJMP_c(Opcode_t Opcode) const {
- return (Opcode == Hexagon::J2_jumpt) ||
- (Opcode == Hexagon::J2_jumpf) ||
- (Opcode == Hexagon::J2_jumptnewpt) ||
- (Opcode == Hexagon::J2_jumpfnewpt) ||
- (Opcode == Hexagon::J2_jumpt) ||
- (Opcode == Hexagon::J2_jumpf);
-}
-
-bool HexagonInstrInfo::predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const {
- if (Cond.empty() || !isPredicated(Cond[0].getImm()))
- return false;
- return !isPredicatedTrue(Cond[0].getImm());
-}
-
-bool HexagonInstrInfo::isEndLoopN(Opcode_t Opcode) const {
- return (Opcode == Hexagon::ENDLOOP0 ||
- Opcode == Hexagon::ENDLOOP1);
-}
bool HexagonInstrInfo::getPredReg(ArrayRef<MachineOperand> Cond,
- unsigned &PredReg, unsigned &PredRegPos,
- unsigned &PredRegFlags) const {
+ unsigned &PredReg, unsigned &PredRegPos, unsigned &PredRegFlags) const {
if (Cond.empty())
return false;
assert(Cond.size() == 2);
@@ -2014,3 +3655,174 @@ bool HexagonInstrInfo::getPredReg(ArrayRef<MachineOperand> Cond,
return true;
}
+
+short HexagonInstrInfo::getPseudoInstrPair(const MachineInstr *MI) const {
+ return Hexagon::getRealHWInstr(MI->getOpcode(), Hexagon::InstrType_Pseudo);
+}
+
+
+short HexagonInstrInfo::getRegForm(const MachineInstr *MI) const {
+ return Hexagon::getRegForm(MI->getOpcode());
+}
+
+
+// Return the number of bytes required to encode the instruction.
+// Hexagon instructions are fixed length, 4 bytes, unless they
+// use a constant extender, which requires another 4 bytes.
+// For debug instructions and prolog labels, return 0.
+unsigned HexagonInstrInfo::getSize(const MachineInstr *MI) const {
+ if (MI->isDebugValue() || MI->isPosition())
+ return 0;
+
+ unsigned Size = MI->getDesc().getSize();
+ if (!Size)
+ // Assume the default insn size in case it cannot be determined
+ // for whatever reason.
+ Size = HEXAGON_INSTR_SIZE;
+
+ if (isConstExtended(MI) || isExtended(MI))
+ Size += HEXAGON_INSTR_SIZE;
+
+ // Try to compute the number of instructions in the inline asm.
+ if (BranchRelaxAsmLarge && MI->getOpcode() == Hexagon::INLINEASM) {
+ const MachineBasicBlock &MBB = *MI->getParent();
+ const MachineFunction *MF = MBB.getParent();
+ const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+
+ // Count the number of register definitions to find the asm string.
+ unsigned NumDefs = 0;
+ for (; MI->getOperand(NumDefs).isReg() && MI->getOperand(NumDefs).isDef();
+ ++NumDefs)
+ assert(NumDefs != MI->getNumOperands()-2 && "No asm string?");
+
+ assert(MI->getOperand(NumDefs).isSymbol() && "No asm string?");
+ // Disassemble the AsmStr and approximate number of instructions.
+ const char *AsmStr = MI->getOperand(NumDefs).getSymbolName();
+ Size = getInlineAsmLength(AsmStr, *MAI);
+ }
+
+ return Size;
+}
+
+
+uint64_t HexagonInstrInfo::getType(const MachineInstr* MI) const {
+ const uint64_t F = MI->getDesc().TSFlags;
+ return (F >> HexagonII::TypePos) & HexagonII::TypeMask;
+}
+
+
+unsigned HexagonInstrInfo::getUnits(const MachineInstr* MI) const {
+ const TargetSubtargetInfo &ST = MI->getParent()->getParent()->getSubtarget();
+ const InstrItineraryData &II = *ST.getInstrItineraryData();
+ const InstrStage &IS = *II.beginStage(MI->getDesc().getSchedClass());
+
+ return IS.getUnits();
+}
+
+
+unsigned HexagonInstrInfo::getValidSubTargets(const unsigned Opcode) const {
+ const uint64_t F = get(Opcode).TSFlags;
+ return (F >> HexagonII::validSubTargetPos) & HexagonII::validSubTargetMask;
+}
+
+
+// Calculate size of the basic block without debug instructions.
+unsigned HexagonInstrInfo::nonDbgBBSize(const MachineBasicBlock *BB) const {
+ return nonDbgMICount(BB->instr_begin(), BB->instr_end());
+}
+
+
+unsigned HexagonInstrInfo::nonDbgBundleSize(
+ MachineBasicBlock::const_iterator BundleHead) const {
+ assert(BundleHead->isBundle() && "Not a bundle header");
+ auto MII = BundleHead.getInstrIterator();
+ // Skip the bundle header.
+ return nonDbgMICount(++MII, getBundleEnd(BundleHead));
+}
+
+
+/// immediateExtend - Changes the instruction in place to one using an immediate
+/// extender.
+void HexagonInstrInfo::immediateExtend(MachineInstr *MI) const {
+ assert((isExtendable(MI)||isConstExtended(MI)) &&
+ "Instruction must be extendable");
+ // Find which operand is extendable.
+ short ExtOpNum = getCExtOpNum(MI);
+ MachineOperand &MO = MI->getOperand(ExtOpNum);
+ // This needs to be something we understand.
+ assert((MO.isMBB() || MO.isImm()) &&
+ "Branch with unknown extendable field type");
+ // Mark given operand as extended.
+ MO.addTargetFlag(HexagonII::HMOTF_ConstExtended);
+}
+
+
+bool HexagonInstrInfo::invertAndChangeJumpTarget(
+ MachineInstr* MI, MachineBasicBlock* NewTarget) const {
+ DEBUG(dbgs() << "\n[invertAndChangeJumpTarget] to BB#"
+ << NewTarget->getNumber(); MI->dump(););
+ assert(MI->isBranch());
+ unsigned NewOpcode = getInvertedPredicatedOpcode(MI->getOpcode());
+ int TargetPos = MI->getNumOperands() - 1;
+ // In general branch target is the last operand,
+ // In general, the branch target is the last operand, but implicit defs
+ // added at the end might change its position.
+ --TargetPos;
+ assert((TargetPos >= 0) && MI->getOperand(TargetPos).isMBB());
+ MI->getOperand(TargetPos).setMBB(NewTarget);
+ if (EnableBranchPrediction && isPredicatedNew(MI)) {
+ NewOpcode = reversePrediction(NewOpcode);
+ }
+ MI->setDesc(get(NewOpcode));
+ return true;
+}
+
+
+void HexagonInstrInfo::genAllInsnTimingClasses(MachineFunction &MF) const {
+ /* +++ The code below generates the complete set of Hexagon instructions +++ */
+ MachineFunction::iterator A = MF.begin();
+ MachineBasicBlock &B = *A;
+ MachineBasicBlock::iterator I = B.begin();
+ MachineInstr *MI = &*I;
+ DebugLoc DL = MI->getDebugLoc();
+ MachineInstr *NewMI;
+
+ for (unsigned insn = TargetOpcode::GENERIC_OP_END+1;
+ insn < Hexagon::INSTRUCTION_LIST_END; ++insn) {
+ NewMI = BuildMI(B, MI, DL, get(insn));
+ DEBUG(dbgs() << "\n" << getName(NewMI->getOpcode()) <<
+ " Class: " << NewMI->getDesc().getSchedClass());
+ NewMI->eraseFromParent();
+ }
+ /* --- End of code generating the complete set of Hexagon instructions --- */
+}
+
+
+// Invert the predication sense: p -> !p, !p -> p.
+bool HexagonInstrInfo::reversePredSense(MachineInstr* MI) const {
+ DEBUG(dbgs() << "\nTrying to reverse pred. sense of:"; MI->dump());
+ MI->setDesc(get(getInvertedPredicatedOpcode(MI->getOpcode())));
+ return true;
+}
+
+
+// Reverse the branch prediction.
+unsigned HexagonInstrInfo::reversePrediction(unsigned Opcode) const {
+ int PredRevOpcode = -1;
+ if (isPredictedTaken(Opcode))
+ PredRevOpcode = Hexagon::notTakenBranchPrediction(Opcode);
+ else
+ PredRevOpcode = Hexagon::takenBranchPrediction(Opcode);
+ assert(PredRevOpcode > 0);
+ return PredRevOpcode;
+}
+
+
+// TODO: Add more rigorous validation.
+bool HexagonInstrInfo::validateBranchCond(const ArrayRef<MachineOperand> &Cond)
+ const {
+ return Cond.empty() || (Cond[0].isImm() && (Cond.size() != 1));
+}
+
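For context, a hedged restatement of the Cond convention the check above enforces: an empty list means an unconditional branch; otherwise the first entry must be an immediate (taken here to be the opcode recorded by AnalyzeBranch, an assumption) followed by at least one more operand. The helper name is invented for illustration.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/MachineOperand.h"

// Illustration only: mirrors validateBranchCond above.
static bool condListLooksSane(llvm::ArrayRef<llvm::MachineOperand> Cond) {
  if (Cond.empty())
    return true;                              // unconditional branch
  return Cond[0].isImm() && Cond.size() > 1;  // immediate plus predicate operand(s)
}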
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h
index d0b8a4631c1d..9530d9f2aa0d 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -1,4 +1,3 @@
-
//===- HexagonInstrInfo.h - Hexagon Instruction Information -----*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
@@ -28,23 +27,18 @@ namespace llvm {
struct EVT;
class HexagonSubtarget;
+
class HexagonInstrInfo : public HexagonGenInstrInfo {
virtual void anchor();
const HexagonRegisterInfo RI;
- const HexagonSubtarget &Subtarget;
public:
- typedef unsigned Opcode_t;
-
explicit HexagonInstrInfo(HexagonSubtarget &ST);
- /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
- /// such, whenever a client has an instance of instruction info, it should
- /// always be able to get register info as well (through this method).
+ /// TargetInstrInfo overrides.
///
- const HexagonRegisterInfo &getRegisterInfo() const { return RI; }
- /// isLoadFromStackSlot - If the specified machine instruction is a direct
+ /// If the specified machine instruction is a direct
/// load from a stack slot, return the virtual or physical register number of
/// the destination along with the FrameIndex of the loaded stack slot. If
/// not, return 0. This predicate must return 0 if the instruction has
@@ -52,7 +46,7 @@ public:
unsigned isLoadFromStackSlot(const MachineInstr *MI,
int &FrameIndex) const override;
- /// isStoreToStackSlot - If the specified machine instruction is a direct
+ /// If the specified machine instruction is a direct
/// store to a stack slot, return the virtual or physical register number of
/// the source reg along with the FrameIndex of the loaded stack slot. If
/// not, return 0. This predicate must return 0 if the instruction has
@@ -60,50 +54,118 @@ public:
unsigned isStoreToStackSlot(const MachineInstr *MI,
int &FrameIndex) const override;
-
+ /// Analyze the branching code at the end of MBB, returning
+ /// true if it cannot be understood (e.g. it's a switch dispatch or isn't
+ /// implemented for a target). Upon success, this returns false and returns
+ /// with the following information in various cases:
+ ///
+ /// 1. If this block ends with no branches (it just falls through to its succ)
+ /// just return false, leaving TBB/FBB null.
+ /// 2. If this block ends with only an unconditional branch, it sets TBB to be
+ /// the destination block.
+ /// 3. If this block ends with a conditional branch and it falls through to a
+ /// successor block, it sets TBB to be the branch destination block and a
+ /// list of operands that evaluate the condition. These operands can be
+ /// passed to other TargetInstrInfo methods to create new branches.
+ /// 4. If this block ends with a conditional branch followed by an
+ /// unconditional branch, it returns the 'true' destination in TBB, the
+ /// 'false' destination in FBB, and a list of operands that evaluate the
+ /// condition. These operands can be passed to other TargetInstrInfo
+ /// methods to create new branches.
+ ///
+ /// Note that RemoveBranch and InsertBranch must be implemented to support
+ /// cases where this method returns success.
+ ///
+ /// If AllowModify is true, then this routine is allowed to modify the basic
+ /// block (e.g. delete instructions after the unconditional branch).
+ ///
bool AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const override;
+ /// Remove the branching code at the end of the specific MBB.
+ /// This is only invoked in cases where AnalyzeBranch returns success. It
+ /// returns the number of instructions that were removed.
unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+ /// Insert branch code into the end of the specified MachineBasicBlock.
+ /// The operands to this method are the same as those
+ /// returned by AnalyzeBranch. This is only invoked in cases where
+ /// AnalyzeBranch returns success. It returns the number of instructions
+ /// inserted.
+ ///
+ /// It is also invoked by tail merging to add unconditional branches in
+ /// cases where AnalyzeBranch doesn't apply because there was no original
+ /// branch to analyze. At least this much must be implemented, else tail
+ /// merging needs to be disabled.
unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
DebugLoc DL) const override;
- bool analyzeCompare(const MachineInstr *MI,
- unsigned &SrcReg, unsigned &SrcReg2,
- int &Mask, int &Value) const override;
+ /// Return true if it's profitable to predicate
+ /// instructions with accumulated instruction latency of "NumCycles"
+ /// of the specified basic block, where the probability of the instructions
+ /// being executed is given by Probability.
+ bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+ unsigned ExtraPredCycles,
+ BranchProbability Probability) const override;
+
+ /// Second variant of isProfitableToIfCvt. This one
+ /// checks for the case where two basic blocks from true and false path
+ /// of an if-then-else (diamond) are predicated on mutually exclusive
+ /// predicates, where the probability of the true path being taken is given
+ /// by Probability.
+ bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
+ unsigned NumTCycles, unsigned ExtraTCycles,
+ MachineBasicBlock &FMBB,
+ unsigned NumFCycles, unsigned ExtraFCycles,
+ BranchProbability Probability) const override;
+
+ /// Return true if it's profitable for if-converter to duplicate instructions
+ /// of specified accumulated instruction latencies in the specified MBB to
+ /// enable if-conversion.
+ /// The probability of the instructions being executed is given by
+ /// Probability.
+ bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+ BranchProbability Probability) const override;
+ /// Emit instructions to copy a pair of physical registers.
+ ///
+ /// This function should support copies within any legal register class as
+ /// well as any cross-class copies created during instruction selection.
+ ///
+ /// The source and destination registers may overlap, which may require a
+ /// careful implementation when multiple copy instructions are required for
+ /// large registers. See for example the ARM target.
void copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, DebugLoc DL,
unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
+ /// Store the specified register of the given register class to the specified
+ /// stack frame index. The store instruction is to be added to the given
+ /// machine basic block before the specified machine instruction. If isKill
+ /// is true, the register operand is the last use and must be marked kill.
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
- void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
+ /// Load the specified register of the given register class from the specified
+ /// stack frame index. The load instruction is to be added to the given
+ /// machine basic block before the specified machine instruction.
void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
- void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
- /// expandPostRAPseudo - This function is called for all pseudo instructions
+ /// This function is called for all pseudo instructions
/// that remain after register allocation. Many pseudo instructions are
/// created to help register allocation. This is the place to convert them
/// into real instructions. The target can edit MI in place, or it can insert
@@ -111,122 +173,228 @@ public:
/// anything was changed.
bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
- MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
- ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt,
- int FrameIndex) const override;
+ /// Reverses the branch condition of the specified condition list,
+ /// returning false on success and true if it cannot be reversed.
+ bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond)
+ const override;
- MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
- ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt,
- MachineInstr *LoadMI) const override {
- return nullptr;
- }
+ /// Insert a noop into the instruction stream at the specified point.
+ void insertNoop(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
- unsigned createVR(MachineFunction* MF, MVT VT) const;
+ /// Returns true if the instruction is already predicated.
+ bool isPredicated(const MachineInstr *MI) const override;
- bool isBranch(const MachineInstr *MI) const;
- bool isPredicable(MachineInstr *MI) const override;
+ /// Convert the instruction into a predicated instruction.
+ /// It returns true if the operation was successful.
bool PredicateInstruction(MachineInstr *MI,
ArrayRef<MachineOperand> Cond) const override;
- bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
- unsigned ExtraPredCycles,
- const BranchProbability &Probability) const override;
-
- bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
- unsigned NumTCycles, unsigned ExtraTCycles,
- MachineBasicBlock &FMBB,
- unsigned NumFCycles, unsigned ExtraFCycles,
- const BranchProbability &Probability) const override;
+ /// Returns true if the first specified predicate
+ /// subsumes the second, e.g. GE subsumes GT.
+ bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
+ ArrayRef<MachineOperand> Pred2) const override;
- bool isPredicated(const MachineInstr *MI) const override;
- bool isPredicated(unsigned Opcode) const;
- bool isPredicatedTrue(const MachineInstr *MI) const;
- bool isPredicatedTrue(unsigned Opcode) const;
- bool isPredicatedNew(const MachineInstr *MI) const;
- bool isPredicatedNew(unsigned Opcode) const;
+ /// If the specified instruction defines any predicate
+ /// or condition code register(s) used for predication, returns true as well
+ /// as the definition predicate(s) by reference.
bool DefinesPredicate(MachineInstr *MI,
std::vector<MachineOperand> &Pred) const override;
- bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
- ArrayRef<MachineOperand> Pred2) const override;
- bool
- ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+ /// Return true if the specified instruction can be predicated.
+ /// By default, this returns true for every instruction with a
+ /// PredicateOperand.
+ bool isPredicable(MachineInstr *MI) const override;
- bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
- const BranchProbability &Probability) const override;
+ /// Test if the given instruction should be considered a scheduling boundary.
+ /// This primarily includes labels and terminators.
+ bool isSchedulingBoundary(const MachineInstr *MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const override;
+ /// Measure the specified inline asm to determine an approximation of its
+ /// length.
+ unsigned getInlineAsmLength(const char *Str,
+ const MCAsmInfo &MAI) const override;
+
+ /// Allocate and return a hazard recognizer to use for this target when
+ /// scheduling the machine instructions after register allocation.
+ ScheduleHazardRecognizer*
+ CreateTargetPostRAHazardRecognizer(const InstrItineraryData*,
+ const ScheduleDAG *DAG) const override;
+
+ /// For a comparison instruction, return the source registers
+ /// in SrcReg and SrcReg2 if having two register operands, and the value it
+ /// compares against in CmpValue. Return true if the comparison instruction
+ /// can be analyzed.
+ bool analyzeCompare(const MachineInstr *MI,
+ unsigned &SrcReg, unsigned &SrcReg2,
+ int &Mask, int &Value) const override;
+
+ /// Compute the instruction latency of a given instruction.
+ /// If the instruction has higher cost when predicated, it's returned via
+ /// PredCost.
+ unsigned getInstrLatency(const InstrItineraryData *ItinData,
+ const MachineInstr *MI,
+ unsigned *PredCost = 0) const override;
+
+ /// Create machine specific model for scheduling.
DFAPacketizer *
CreateTargetScheduleState(const TargetSubtargetInfo &STI) const override;
- bool isSchedulingBoundary(const MachineInstr *MI,
- const MachineBasicBlock *MBB,
- const MachineFunction &MF) const override;
- bool isValidOffset(unsigned Opcode, int Offset, bool Extend = true) const;
- bool isValidAutoIncImm(const EVT VT, const int Offset) const;
- bool isMemOp(const MachineInstr *MI) const;
- bool isSpillPredRegOp(const MachineInstr *MI) const;
- bool isU6_3Immediate(const int value) const;
- bool isU6_2Immediate(const int value) const;
- bool isU6_1Immediate(const int value) const;
- bool isU6_0Immediate(const int value) const;
- bool isS4_3Immediate(const int value) const;
- bool isS4_2Immediate(const int value) const;
- bool isS4_1Immediate(const int value) const;
- bool isS4_0Immediate(const int value) const;
- bool isS12_Immediate(const int value) const;
- bool isU6_Immediate(const int value) const;
- bool isS8_Immediate(const int value) const;
- bool isS6_Immediate(const int value) const;
-
- bool isSaveCalleeSavedRegsCall(const MachineInstr* MI) const;
- bool isConditionalTransfer(const MachineInstr* MI) const;
+ /// Return true if the target can determine, even without aliasing
+ /// information, that the two given instructions access different memory
+ /// addresses; return false otherwise.
+ bool areMemAccessesTriviallyDisjoint(MachineInstr *MIa, MachineInstr *MIb,
+ AliasAnalysis *AA = nullptr)
+ const override;
+
+
+ /// HexagonInstrInfo specifics.
+ ///
+
+ const HexagonRegisterInfo &getRegisterInfo() const { return RI; }
+
+ unsigned createVR(MachineFunction* MF, MVT VT) const;
+
+ bool isAbsoluteSet(const MachineInstr* MI) const;
+ bool isAccumulator(const MachineInstr *MI) const;
+ bool isComplex(const MachineInstr *MI) const;
+ bool isCompoundBranchInstr(const MachineInstr *MI) const;
+ bool isCondInst(const MachineInstr *MI) const;
bool isConditionalALU32 (const MachineInstr* MI) const;
- bool isConditionalLoad (const MachineInstr* MI) const;
+ bool isConditionalLoad(const MachineInstr* MI) const;
bool isConditionalStore(const MachineInstr* MI) const;
- bool isNewValueInst(const MachineInstr* MI) const;
- bool isNewValue(const MachineInstr* MI) const;
- bool isNewValue(Opcode_t Opcode) const;
- bool isDotNewInst(const MachineInstr* MI) const;
- int GetDotOldOp(const int opc) const;
- int GetDotNewOp(const MachineInstr* MI) const;
- int GetDotNewPredOp(MachineInstr *MI,
- const MachineBranchProbabilityInfo
- *MBPI) const;
- bool mayBeNewStore(const MachineInstr* MI) const;
+ bool isConditionalTransfer(const MachineInstr* MI) const;
+ bool isConstExtended(const MachineInstr *MI) const;
bool isDeallocRet(const MachineInstr *MI) const;
- unsigned getInvertedPredicatedOpcode(const int Opc) const;
+ bool isDependent(const MachineInstr *ProdMI,
+ const MachineInstr *ConsMI) const;
+ bool isDotCurInst(const MachineInstr* MI) const;
+ bool isDotNewInst(const MachineInstr* MI) const;
+ bool isDuplexPair(const MachineInstr *MIa, const MachineInstr *MIb) const;
+ bool isEarlySourceInstr(const MachineInstr *MI) const;
+ bool isEndLoopN(unsigned Opcode) const;
+ bool isExpr(unsigned OpType) const;
bool isExtendable(const MachineInstr* MI) const;
bool isExtended(const MachineInstr* MI) const;
- bool isPostIncrement(const MachineInstr* MI) const;
+ bool isFloat(const MachineInstr *MI) const;
+ bool isHVXMemWithAIndirect(const MachineInstr *I,
+ const MachineInstr *J) const;
+ bool isIndirectCall(const MachineInstr *MI) const;
+ bool isIndirectL4Return(const MachineInstr *MI) const;
+ bool isJumpR(const MachineInstr *MI) const;
+ bool isJumpWithinBranchRange(const MachineInstr *MI, unsigned offset) const;
+ bool isLateInstrFeedsEarlyInstr(const MachineInstr *LRMI,
+ const MachineInstr *ESMI) const;
+ bool isLateResultInstr(const MachineInstr *MI) const;
+ bool isLateSourceInstr(const MachineInstr *MI) const;
+ bool isLoopN(const MachineInstr *MI) const;
+ bool isMemOp(const MachineInstr *MI) const;
+ bool isNewValue(const MachineInstr* MI) const;
+ bool isNewValue(unsigned Opcode) const;
+ bool isNewValueInst(const MachineInstr* MI) const;
+ bool isNewValueJump(const MachineInstr* MI) const;
+ bool isNewValueJump(unsigned Opcode) const;
bool isNewValueStore(const MachineInstr* MI) const;
bool isNewValueStore(unsigned Opcode) const;
- bool isNewValueJump(const MachineInstr* MI) const;
- bool isNewValueJump(Opcode_t Opcode) const;
- bool isNewValueJumpCandidate(const MachineInstr *MI) const;
+ bool isOperandExtended(const MachineInstr *MI, unsigned OperandNum) const;
+ bool isPostIncrement(const MachineInstr* MI) const;
+ bool isPredicatedNew(const MachineInstr *MI) const;
+ bool isPredicatedNew(unsigned Opcode) const;
+ bool isPredicatedTrue(const MachineInstr *MI) const;
+ bool isPredicatedTrue(unsigned Opcode) const;
+ bool isPredicated(unsigned Opcode) const;
+ bool isPredicateLate(unsigned Opcode) const;
+ bool isPredictedTaken(unsigned Opcode) const;
+ bool isSaveCalleeSavedRegsCall(const MachineInstr *MI) const;
+ bool isSolo(const MachineInstr* MI) const;
+ bool isSpillPredRegOp(const MachineInstr *MI) const;
+ bool isTC1(const MachineInstr *MI) const;
+ bool isTC2(const MachineInstr *MI) const;
+ bool isTC2Early(const MachineInstr *MI) const;
+ bool isTC4x(const MachineInstr *MI) const;
+ bool isV60VectorInstruction(const MachineInstr *MI) const;
+ bool isValidAutoIncImm(const EVT VT, const int Offset) const;
+ bool isValidOffset(unsigned Opcode, int Offset, bool Extend = true) const;
+ bool isVecAcc(const MachineInstr *MI) const;
+ bool isVecALU(const MachineInstr *MI) const;
+ bool isVecUsableNextPacket(const MachineInstr *ProdMI,
+ const MachineInstr *ConsMI) const;
+
+
+ bool canExecuteInBundle(const MachineInstr *First,
+ const MachineInstr *Second) const;
+ bool hasEHLabel(const MachineBasicBlock *B) const;
+ bool hasNonExtEquivalent(const MachineInstr *MI) const;
+ bool hasPseudoInstrPair(const MachineInstr *MI) const;
+ bool hasUncondBranch(const MachineBasicBlock *B) const;
+ bool mayBeCurLoad(const MachineInstr* MI) const;
+ bool mayBeNewStore(const MachineInstr* MI) const;
+ bool producesStall(const MachineInstr *ProdMI,
+ const MachineInstr *ConsMI) const;
+ bool producesStall(const MachineInstr *MI,
+ MachineBasicBlock::const_instr_iterator MII) const;
+ bool predCanBeUsedAsDotNew(const MachineInstr *MI, unsigned PredReg) const;
+ bool PredOpcodeHasJMP_c(unsigned Opcode) const;
+ bool predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const;
- void immediateExtend(MachineInstr *MI) const;
- bool isConstExtended(const MachineInstr *MI) const;
- unsigned getSize(const MachineInstr *MI) const;
- int getDotNewPredJumpOp(MachineInstr *MI,
- const MachineBranchProbabilityInfo *MBPI) const;
unsigned getAddrMode(const MachineInstr* MI) const;
- bool isOperandExtended(const MachineInstr *MI,
- unsigned short OperandNum) const;
- unsigned short getCExtOpNum(const MachineInstr *MI) const;
- int getMinValue(const MachineInstr *MI) const;
+ unsigned getBaseAndOffset(const MachineInstr *MI, int &Offset,
+ unsigned &AccessSize) const;
+ bool getBaseAndOffsetPosition(const MachineInstr *MI, unsigned &BasePos,
+ unsigned &OffsetPos) const;
+ SmallVector<MachineInstr*,2> getBranchingInstrs(MachineBasicBlock& MBB) const;
+ unsigned getCExtOpNum(const MachineInstr *MI) const;
+ HexagonII::CompoundGroup
+ getCompoundCandidateGroup(const MachineInstr *MI) const;
+ unsigned getCompoundOpcode(const MachineInstr *GA,
+ const MachineInstr *GB) const;
+ int getCondOpcode(int Opc, bool sense) const;
+ int getDotCurOp(const MachineInstr* MI) const;
+ int getDotNewOp(const MachineInstr* MI) const;
+ int getDotNewPredJumpOp(const MachineInstr *MI,
+ const MachineBranchProbabilityInfo *MBPI) const;
+ int getDotNewPredOp(const MachineInstr *MI,
+ const MachineBranchProbabilityInfo *MBPI) const;
+ int getDotOldOp(const int opc) const;
+ HexagonII::SubInstructionGroup getDuplexCandidateGroup(const MachineInstr *MI)
+ const;
+ short getEquivalentHWInstr(const MachineInstr *MI) const;
+ MachineInstr *getFirstNonDbgInst(MachineBasicBlock *BB) const;
+ unsigned getInstrTimingClassLatency(const InstrItineraryData *ItinData,
+ const MachineInstr *MI) const;
+ bool getInvertedPredSense(SmallVectorImpl<MachineOperand> &Cond) const;
+ unsigned getInvertedPredicatedOpcode(const int Opc) const;
int getMaxValue(const MachineInstr *MI) const;
- bool NonExtEquivalentExists (const MachineInstr *MI) const;
+ unsigned getMemAccessSize(const MachineInstr* MI) const;
+ int getMinValue(const MachineInstr *MI) const;
short getNonExtOpcode(const MachineInstr *MI) const;
- bool PredOpcodeHasJMP_c(Opcode_t Opcode) const;
- bool predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const;
- bool isEndLoopN(Opcode_t Opcode) const;
bool getPredReg(ArrayRef<MachineOperand> Cond, unsigned &PredReg,
unsigned &PredRegPos, unsigned &PredRegFlags) const;
- int getCondOpcode(int Opc, bool sense) const;
+ short getPseudoInstrPair(const MachineInstr *MI) const;
+ short getRegForm(const MachineInstr *MI) const;
+ unsigned getSize(const MachineInstr *MI) const;
+ uint64_t getType(const MachineInstr* MI) const;
+ unsigned getUnits(const MachineInstr* MI) const;
+ unsigned getValidSubTargets(const unsigned Opcode) const;
+
+ /// Count the instructions in a basic block or bundle, ignoring debug
+ /// instructions.
+ unsigned nonDbgBBSize(const MachineBasicBlock *BB) const;
+ unsigned nonDbgBundleSize(MachineBasicBlock::const_iterator BundleHead) const;
+
+
+ void immediateExtend(MachineInstr *MI) const;
+ bool invertAndChangeJumpTarget(MachineInstr* MI,
+ MachineBasicBlock* NewTarget) const;
+ void genAllInsnTimingClasses(MachineFunction &MF) const;
+ bool reversePredSense(MachineInstr* MI) const;
+ unsigned reversePrediction(unsigned Opcode) const;
+ bool validateBranchCond(const ArrayRef<MachineOperand> &Cond) const;
};
}
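A hedged usage sketch of the interface declared above: summing an approximate code size for a function through getSize, which per the .cpp changes already accounts for constant extenders and inline asm. The function below and its name are illustrative only, not part of this patch.

#include "HexagonInstrInfo.h"
#include "llvm/CodeGen/MachineFunction.h"

// Illustration only: approximate the byte size of a machine function.
static unsigned estimateCodeSize(const llvm::MachineFunction &MF,
                                 const llvm::HexagonInstrInfo &HII) {
  unsigned Bytes = 0;
  for (const llvm::MachineBasicBlock &MBB : MF)
    for (const llvm::MachineInstr &MI : MBB)
      Bytes += HII.getSize(&MI);   // 0 for debug values and position labels
  return Bytes;
}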
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.td b/lib/Target/Hexagon/HexagonInstrInfo.td
index 3b32c10ed5b0..5cfeba720d90 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.td
+++ b/lib/Target/Hexagon/HexagonInstrInfo.td
@@ -13,7 +13,7 @@
include "HexagonInstrFormats.td"
include "HexagonOperands.td"
-
+include "HexagonInstrEnc.td"
// Pattern fragment that combines the value type and the register class
// into a single parameter.
// The pat frags in the definitions below need to have a named register,
@@ -1426,9 +1426,6 @@ def retflag : SDNode<"HexagonISD::RET_FLAG", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
def eh_return: SDNode<"HexagonISD::EH_RETURN", SDTNone, [SDNPHasChain]>;
-def SDHexagonBR_JT: SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
-def HexagonBR_JT: SDNode<"HexagonISD::BR_JT", SDHexagonBR_JT, [SDNPHasChain]>;
-
class CondStr<string CReg, bit True, bit New> {
string S = "if (" # !if(True,"","!") # CReg # !if(New,".new","") # ") ";
}
@@ -1606,8 +1603,6 @@ def EH_RETURN_JMPR : T_JMPr;
def: Pat<(eh_return),
(EH_RETURN_JMPR (i32 R31))>;
-def: Pat<(HexagonBR_JT (i32 IntRegs:$dst)),
- (J2_jumpr IntRegs:$dst)>;
def: Pat<(brind (i32 IntRegs:$dst)),
(J2_jumpr IntRegs:$dst)>;
@@ -2825,7 +2820,7 @@ let CextOpcode = "ADD_acc" in {
let isExtentSigned = 1 in
def M2_accii : T_MType_acc_ri <"+= add", 0b100, s8Ext,
[(set (i32 IntRegs:$dst),
- (add (add (i32 IntRegs:$src2), s16_16ImmPred:$src3),
+ (add (add (i32 IntRegs:$src2), s32ImmPred:$src3),
(i32 IntRegs:$src1)))]>, ImmRegRel;
def M2_acci : T_MType_acc_rr <"+= add", 0b000, 0b001, 0,
@@ -2859,7 +2854,7 @@ class T_MType_acc_pat2 <InstHexagon MI, SDNode firstOp, SDNode secOp>
def : T_MType_acc_pat2 <M2_xor_xacc, xor, xor>;
def : T_MType_acc_pat1 <M2_macsin, mul, sub, u32ImmPred>;
-def : T_MType_acc_pat1 <M2_naccii, add, sub, s16_16ImmPred>;
+def : T_MType_acc_pat1 <M2_naccii, add, sub, s32ImmPred>;
def : T_MType_acc_pat2 <M2_nacci, add, sub>;
//===----------------------------------------------------------------------===//
@@ -3303,7 +3298,8 @@ class T_store_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
!if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
!if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
/* s4_0Imm */ offset{3-0})));
- let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, 1);
+ // Store upper-half and store doubleword cannot be NV.
+ let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, !if(isHalf,0,1));
let IClass = 0b1010;
@@ -3322,7 +3318,7 @@ class T_store_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
//===----------------------------------------------------------------------===//
let isPredicated = 1, hasSideEffects = 0, addrMode = PostInc in
class T_pstore_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
- bits<4> MajOp, bit isHalf, bit isPredNot, bit isPredNew >
+ bits<4> MajOp, bit isHalf, bit isPredNot, bit isPredNew>
: STInst <(outs IntRegs:$_dst_),
(ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset, RC:$src3),
!if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
@@ -3341,7 +3337,8 @@ class T_pstore_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
!if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
/* s4_0Imm */ offset{3-0})));
- let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, 1);
+ // Store upper-half and store doubleword cannot be NV.
+ let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, !if(isHalf,0,1));
let isPredicatedNew = isPredNew;
let isPredicatedFalse = isPredNot;
@@ -3404,7 +3401,6 @@ def: Storepi_pat<post_store, I64, s4_3ImmPred, S2_storerd_pi>;
//===----------------------------------------------------------------------===//
// Template class for post increment stores with register offset.
//===----------------------------------------------------------------------===//
-let isNVStorable = 1 in
class T_store_pr <string mnemonic, RegisterClass RC, bits<3> MajOp,
MemAccessSize AccessSz, bit isHalf = 0>
: STInst <(outs IntRegs:$_dst_),
@@ -3416,6 +3412,9 @@ class T_store_pr <string mnemonic, RegisterClass RC, bits<3> MajOp,
bits<5> src3;
let accessSize = AccessSz;
+ // Store upper-half and store doubleword cannot be NV.
+ let isNVStorable = !if(!eq(mnemonic,"memd"), 0, !if(isHalf,0,1));
+
let IClass = 0b1010;
let Inst{27-24} = 0b1101;
@@ -3430,12 +3429,11 @@ def S2_storerb_pr : T_store_pr<"memb", IntRegs, 0b000, ByteAccess>;
def S2_storerh_pr : T_store_pr<"memh", IntRegs, 0b010, HalfWordAccess>;
def S2_storeri_pr : T_store_pr<"memw", IntRegs, 0b100, WordAccess>;
def S2_storerd_pr : T_store_pr<"memd", DoubleRegs, 0b110, DoubleWordAccess>;
-
def S2_storerf_pr : T_store_pr<"memh", IntRegs, 0b011, HalfWordAccess, 1>;
let opExtendable = 1, isExtentSigned = 1, isPredicable = 1 in
class T_store_io <string mnemonic, RegisterClass RC, Operand ImmOp,
- bits<3>MajOp, bit isH = 0>
+ bits<3> MajOp, bit isH = 0>
: STInst <(outs),
(ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
mnemonic#"($src1+#$src2) = $src3"#!if(isH,".h","")>,
@@ -3455,6 +3453,8 @@ class T_store_io <string mnemonic, RegisterClass RC, Operand ImmOp,
!if (!eq(ImmOpStr, "s11_2Ext"), src2{12-2},
!if (!eq(ImmOpStr, "s11_1Ext"), src2{11-1},
/* s11_0Ext */ src2{10-0})));
+ // Store upper-half and store doubleword cannot be NV.
+ let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1));
let IClass = 0b1010;
let Inst{27} = 0b0;
@@ -3494,7 +3494,10 @@ class T_pstore_io <string mnemonic, RegisterClass RC, Operand ImmOp,
!if (!eq(ImmOpStr, "u6_2Ext"), src3{7-2},
!if (!eq(ImmOpStr, "u6_1Ext"), src3{6-1},
/* u6_0Ext */ src3{5-0})));
- let IClass = 0b0100;
+ // Store upper-half and store doubleword cannot be NV.
+ let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1));
+
+ let IClass = 0b0100;
let Inst{27} = 0b0;
let Inst{26} = PredNot;
@@ -3508,7 +3511,7 @@ class T_pstore_io <string mnemonic, RegisterClass RC, Operand ImmOp,
let Inst{1-0} = src1;
}
-let isExtendable = 1, isNVStorable = 1, hasSideEffects = 0 in
+let isExtendable = 1, hasSideEffects = 0 in
multiclass ST_Idxd<string mnemonic, string CextOp, RegisterClass RC,
Operand ImmOp, Operand predImmOp, bits<3> MajOp, bit isH = 0> {
let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
@@ -3665,7 +3668,7 @@ def S2_allocframe: ST0Inst <
// S2_storer[bhwdf]_pci: Store byte/half/word/double.
// S2_storer[bhwdf]_pci -> S2_storerbnew_pci
-let Uses = [CS], isNVStorable = 1 in
+let Uses = [CS] in
class T_store_pci <string mnemonic, RegisterClass RC,
Operand Imm, bits<4>MajOp,
MemAccessSize AlignSize, string RegSrc = "Rt">
@@ -3679,6 +3682,8 @@ class T_store_pci <string mnemonic, RegisterClass RC,
bits<1> Mu;
bits<5> Rt;
let accessSize = AlignSize;
+ let isNVStorable = !if(!eq(mnemonic,"memd"), 0,
+ !if(!eq(RegSrc,"Rt.h"), 0, 1));
let IClass = 0b1010;
let Inst{27-25} = 0b100;
@@ -3696,15 +3701,15 @@ class T_store_pci <string mnemonic, RegisterClass RC,
}
def S2_storerb_pci : T_store_pci<"memb", IntRegs, s4_0Imm, 0b1000,
- ByteAccess>;
+ ByteAccess>;
def S2_storerh_pci : T_store_pci<"memh", IntRegs, s4_1Imm, 0b1010,
- HalfWordAccess>;
+ HalfWordAccess>;
def S2_storerf_pci : T_store_pci<"memh", IntRegs, s4_1Imm, 0b1011,
- HalfWordAccess, "Rt.h">;
+ HalfWordAccess, "Rt.h">;
def S2_storeri_pci : T_store_pci<"memw", IntRegs, s4_2Imm, 0b1100,
- WordAccess>;
+ WordAccess>;
def S2_storerd_pci : T_store_pci<"memd", DoubleRegs, s4_3Imm, 0b1110,
- DoubleWordAccess>;
+ DoubleWordAccess>;
let Uses = [CS], isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 4 in
class T_storenew_pci <string mnemonic, Operand Imm,
@@ -3762,7 +3767,7 @@ def S2_storerd_pci_pseudo : T_store_pci_pseudo <"memd", DoubleRegs>;
//===----------------------------------------------------------------------===//
// Circular stores with auto-increment register
//===----------------------------------------------------------------------===//
-let Uses = [CS], isNVStorable = 1 in
+let Uses = [CS] in
class T_store_pcr <string mnemonic, RegisterClass RC, bits<4>MajOp,
MemAccessSize AlignSize, string RegSrc = "Rt">
: STInst <(outs IntRegs:$_dst_),
@@ -3775,6 +3780,8 @@ class T_store_pcr <string mnemonic, RegisterClass RC, bits<4>MajOp,
bits<5> Rt;
let accessSize = AlignSize;
+ let isNVStorable = !if(!eq(mnemonic,"memd"), 0,
+ !if(!eq(RegSrc,"Rt.h"), 0, 1));
let IClass = 0b1010;
let Inst{27-25} = 0b100;
@@ -5784,7 +5791,19 @@ include "HexagonInstrInfoV5.td"
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
+// V60 Instructions +
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrInfoV60.td"
+
+//===----------------------------------------------------------------------===//
+// V60 Instructions -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
// ALU32/64/Vector +
//===----------------------------------------------------------------------===///
include "HexagonInstrInfoVector.td"
+
+include "HexagonInstrAlias.td"
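The recurring "isNVStorable = !if(!eq(mnemonic,"memd"), 0, !if(isHalf,0,1))" lines added to the store classes above all encode the same rule: double-word stores and upper-half stores never take the new-value (.new) form. A hedged restatement in plain code, with invented names, purely to make the nested !if readable:

#include <string>

// Illustration of the TableGen predicate above; names are invented here.
static bool isNVStorable(const std::string &Mnemonic, bool IsHalf) {
  if (Mnemonic == "memd") return false;   // store-doubleword has no .new form
  if (IsHalf)             return false;   // store upper-half has no .new form
  return true;                            // memb/memh/memw full-register stores
}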
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV4.td b/lib/Target/Hexagon/HexagonInstrInfoV4.td
index 65b0f4974367..87d6b359f5fb 100644
--- a/lib/Target/Hexagon/HexagonInstrInfoV4.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoV4.td
@@ -684,7 +684,7 @@ def: Pat<(i64 (zext (i32 IntRegs:$src1))),
// Template class for store instructions with Absolute set addressing mode.
//===----------------------------------------------------------------------===//
let isExtended = 1, opExtendable = 1, opExtentBits = 6,
- addrMode = AbsoluteSet, isNVStorable = 1 in
+ addrMode = AbsoluteSet in
class T_ST_absset <string mnemonic, string BaseOp, RegisterClass RC,
bits<3> MajOp, MemAccessSize AccessSz, bit isHalf = 0>
: STInst<(outs IntRegs:$dst),
@@ -696,6 +696,9 @@ class T_ST_absset <string mnemonic, string BaseOp, RegisterClass RC,
let accessSize = AccessSz;
let BaseOpcode = BaseOp#"_AbsSet";
+ // Store upper-half and store doubleword cannot be NV.
+ let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1));
+
let IClass = 0b1010;
let Inst{27-24} = 0b1011;
@@ -750,7 +753,7 @@ let mayStore = 1, addrMode = AbsoluteSet in {
}
let isExtended = 1, opExtendable = 2, opExtentBits = 6, InputType = "imm",
-addrMode = BaseLongOffset, AddedComplexity = 40 in
+ addrMode = BaseLongOffset, AddedComplexity = 40 in
class T_StoreAbsReg <string mnemonic, string CextOp, RegisterClass RC,
bits<3> MajOp, MemAccessSize AccessSz, bit isHalf = 0>
: STInst<(outs),
@@ -766,6 +769,10 @@ class T_StoreAbsReg <string mnemonic, string CextOp, RegisterClass RC,
let accessSize = AccessSz;
let CextOpcode = CextOp;
let BaseOpcode = CextOp#"_shl";
+
+ // Store upper-half and store doubleword cannot be NV.
+ let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1));
+
let IClass = 0b1010;
let Inst{27-24} =0b1101;
@@ -856,6 +863,9 @@ class T_store_rr <string mnemonic, RegisterClass RC, bits<3> MajOp, bit isH>
bits<2> u2;
bits<5> Rt;
+ // Store upper-half and store doubleword cannot be NV.
+ let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1));
+
let IClass = 0b0011;
let Inst{27-24} = 0b1011;
@@ -888,6 +898,8 @@ class T_pstore_rr <string mnemonic, RegisterClass RC, bits<3> MajOp,
let isPredicatedFalse = isNot;
let isPredicatedNew = isPredNew;
+ // Store upper-half and store doubleword cannot be NV.
+ let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1));
let IClass = 0b0011;
@@ -1826,43 +1838,22 @@ def: LogLogNot_pat<or, or, C4_or_orn>;
// below are needed to support code generation for PIC
//===----------------------------------------------------------------------===//
-def SDT_HexagonPICAdd
+def SDT_HexagonAtGot
+ : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>;
+def SDT_HexagonAtPcrel
: SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
-def SDT_HexagonGOTAdd
- : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
-
-def SDT_HexagonGOTAddInternal : SDTypeProfile<1, 1, [SDTCisVT<0, i32>]>;
-def SDT_HexagonGOTAddInternalJT : SDTypeProfile<1, 1, [SDTCisVT<0, i32>]>;
-def SDT_HexagonGOTAddInternalBA : SDTypeProfile<1, 1, [SDTCisVT<0, i32>]>;
-
-def Hexagonpic_add : SDNode<"HexagonISD::PIC_ADD", SDT_HexagonPICAdd>;
-def Hexagonat_got : SDNode<"HexagonISD::AT_GOT", SDT_HexagonGOTAdd>;
-def Hexagongat_pcrel : SDNode<"HexagonISD::AT_PCREL",
- SDT_HexagonGOTAddInternal>;
-def Hexagongat_pcrel_jt : SDNode<"HexagonISD::AT_PCREL",
- SDT_HexagonGOTAddInternalJT>;
-def Hexagongat_pcrel_ba : SDNode<"HexagonISD::AT_PCREL",
- SDT_HexagonGOTAddInternalBA>;
-
-// PIC: Map from a block address computation to a PC-relative add
-def: Pat<(Hexagongat_pcrel_ba tblockaddress:$src1),
- (C4_addipc u32ImmPred:$src1)>;
-
-// PIC: Map from the computation to generate a GOT pointer to a PC-relative add
-def: Pat<(Hexagonpic_add texternalsym:$src1),
- (C4_addipc u32ImmPred:$src1)>;
-// PIC: Map from a jump table address computation to a PC-relative add
-def: Pat<(Hexagongat_pcrel_jt tjumptable:$src1),
- (C4_addipc u32ImmPred:$src1)>;
+// AT_GOT address-of-GOT, address-of-global, offset-in-global
+def HexagonAtGot : SDNode<"HexagonISD::AT_GOT", SDT_HexagonAtGot>;
+// AT_PCREL address-of-global
+def HexagonAtPcrel : SDNode<"HexagonISD::AT_PCREL", SDT_HexagonAtPcrel>;
-// PIC: Map from a GOT-relative symbol reference to a load
-def: Pat<(Hexagonat_got (i32 IntRegs:$src1), tglobaladdr:$src2),
- (L2_loadri_io IntRegs:$src1, s30_2ImmPred:$src2)>;
-
-// PIC: Map from a static symbol reference to a PC-relative add
-def: Pat<(Hexagongat_pcrel tglobaladdr:$src1),
- (C4_addipc u32ImmPred:$src1)>;
+def: Pat<(HexagonAtGot I32:$got, I32:$addr, (i32 0)),
+ (L2_loadri_io I32:$got, imm:$addr)>;
+def: Pat<(HexagonAtGot I32:$got, I32:$addr, s30_2ImmPred:$off),
+ (A2_addi (L2_loadri_io I32:$got, imm:$addr), imm:$off)>;
+def: Pat<(HexagonAtPcrel I32:$addr),
+ (C4_addipc imm:$addr)>;
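A hedged reading of the three patterns above, restated as plain address arithmetic rather than the actual instruction selection: AT_GOT(got, sym, off) loads the global's address from its GOT slot and, when the offset is non-zero, adds the in-global offset with a separate A2_addi; AT_PCREL becomes a PC-relative add. The helper below only sketches that semantics and is not part of this patch.

#include <cstdint>

// Semantics sketch only, not the lowering itself.
static uint32_t atGotValue(const uint32_t *GotSlotForSym, int32_t Off) {
  return *GotSlotForSym + static_cast<uint32_t>(Off);  // GOT load, optional add
}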
//===----------------------------------------------------------------------===//
// CR -
@@ -1903,7 +1894,7 @@ def S4_addaddi : ALU64Inst <(outs IntRegs:$Rd),
(ins IntRegs:$Rs, IntRegs:$Ru, s6Ext:$s6),
"$Rd = add($Rs, add($Ru, #$s6))" ,
[(set (i32 IntRegs:$Rd), (add (i32 IntRegs:$Rs),
- (add (i32 IntRegs:$Ru), s16_16ImmPred:$s6)))],
+ (add (i32 IntRegs:$Ru), s32ImmPred:$s6)))],
"", ALU64_tc_2_SLOT23> {
bits<5> Rd;
bits<5> Rs;
@@ -3290,27 +3281,33 @@ defm L4_return: LD_MISC_L4_RETURN <"dealloc_return">, PredNewRel;
let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1,
Defs = [R29, R30, R31, PC], isPredicable = 0, isAsmParserOnly = 1 in {
def RESTORE_DEALLOC_RET_JMP_V4 : T_JMP<"">;
+ let isExtended = 1, opExtendable = 0 in
+ def RESTORE_DEALLOC_RET_JMP_V4_EXT : T_JMP<"">;
}
// Restore registers and dealloc frame before a tail call.
let isCall = 1, Defs = [R29, R30, R31, PC], isAsmParserOnly = 1 in {
def RESTORE_DEALLOC_BEFORE_TAILCALL_V4 : T_Call<"">, PredRel;
+ let isExtended = 1, opExtendable = 0 in
+ def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT : T_Call<"">, PredRel;
}
// Save registers function call.
let isCall = 1, Uses = [R29, R31], isAsmParserOnly = 1 in {
def SAVE_REGISTERS_CALL_V4 : T_Call<"">, PredRel;
+ let isExtended = 1, opExtendable = 0 in
+ def SAVE_REGISTERS_CALL_V4_EXT : T_Call<"">, PredRel;
}
//===----------------------------------------------------------------------===//
// Template class for non predicated store instructions with
// GP-Relative or absolute addressing.
//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, isPredicable = 1, isNVStorable = 1 in
+let hasSideEffects = 0, isPredicable = 1 in
class T_StoreAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp,
- bits<2>MajOp, Operand AddrOp, bit isAbs, bit isHalf>
- : STInst<(outs), (ins AddrOp:$addr, RC:$src),
- mnemonic # !if(isAbs, "(##", "(#")#"$addr) = $src"#!if(isHalf, ".h",""),
+ bits<2>MajOp, bit isAbs, bit isHalf>
+ : STInst<(outs), (ins ImmOp:$addr, RC:$src),
+ mnemonic # "(#$addr) = $src"#!if(isHalf, ".h",""),
[], "", V2LDST_tc_st_SLOT01> {
bits<19> addr;
bits<5> src;
@@ -3321,6 +3318,9 @@ class T_StoreAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp,
!if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2},
!if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1},
/* u16_0Imm */ addr{15-0})));
+ // Store upper-half and store doubleword cannot be NV.
+ let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1));
+
let IClass = 0b0100;
let Inst{27} = 1;
let Inst{26-25} = offsetBits{15-14};
@@ -3337,11 +3337,10 @@ class T_StoreAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp,
// Template class for predicated store instructions with
// GP-Relative or absolute addressing.
//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, isPredicated = 1, isNVStorable = 1, opExtentBits = 6,
- opExtendable = 1 in
+let hasSideEffects = 0, isPredicated = 1, opExtentBits = 6, opExtendable = 1 in
class T_StoreAbs_Pred <string mnemonic, RegisterClass RC, bits<2> MajOp,
bit isHalf, bit isNot, bit isNew>
- : STInst<(outs), (ins PredRegs:$src1, u6Ext:$absaddr, RC: $src2),
+ : STInst<(outs), (ins PredRegs:$src1, u32MustExt:$absaddr, RC: $src2),
!if(isNot, "if (!$src1", "if ($src1")#!if(isNew, ".new) ",
") ")#mnemonic#"(#$absaddr) = $src2"#!if(isHalf, ".h",""),
[], "", ST_tc_st_SLOT01>, AddrModeRel {
@@ -3351,6 +3350,8 @@ class T_StoreAbs_Pred <string mnemonic, RegisterClass RC, bits<2> MajOp,
let isPredicatedNew = isNew;
let isPredicatedFalse = isNot;
+ // Store upper-half and store doubleword cannot be NV.
+ let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1));
let IClass = 0b1010;
@@ -3371,7 +3372,7 @@ class T_StoreAbs_Pred <string mnemonic, RegisterClass RC, bits<2> MajOp,
//===----------------------------------------------------------------------===//
class T_StoreAbs <string mnemonic, RegisterClass RC, Operand ImmOp,
bits<2> MajOp, bit isHalf>
- : T_StoreAbsGP <mnemonic, RC, ImmOp, MajOp, u32Imm, 1, isHalf>,
+ : T_StoreAbsGP <mnemonic, RC, u32MustExt, MajOp, 1, isHalf>,
AddrModeRel {
string ImmOpStr = !cast<string>(ImmOp);
let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19,
@@ -3538,7 +3539,7 @@ defm storerf : ST_Abs <"memh", "STrif", IntRegs, u16_1Imm, 0b01, 1>;
let isAsmParserOnly = 1 in
class T_StoreGP <string mnemonic, string BaseOp, RegisterClass RC,
Operand ImmOp, bits<2> MajOp, bit isHalf = 0>
- : T_StoreAbsGP <mnemonic, RC, ImmOp, MajOp, globaladdress, 0, isHalf> {
+ : T_StoreAbsGP <mnemonic, RC, ImmOp, MajOp, 0, isHalf> {
// Set BaseOpcode same as absolute addressing instructions so that
// non-predicated GP-Rel instructions can have relate with predicated
// Absolute instruction.
@@ -3553,7 +3554,7 @@ multiclass ST_GP <string mnemonic, string BaseOp, Operand ImmOp,
// Absolute instruction.
let BaseOpcode = BaseOp#_abs in {
def NAME#gp : T_StoreAbsGP <mnemonic, IntRegs, ImmOp, MajOp,
- globaladdress, 0, isHalf>;
+ 0, isHalf>;
// New-value store
def NAME#newgp : T_StoreAbsGP_NV <mnemonic, ImmOp, MajOp, 0> ;
}
@@ -3615,9 +3616,9 @@ let AddedComplexity = 100 in {
//===----------------------------------------------------------------------===//
let isPredicable = 1, hasSideEffects = 0 in
class T_LoadAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp,
- bits<3> MajOp, Operand AddrOp, bit isAbs>
- : LDInst <(outs RC:$dst), (ins AddrOp:$addr),
- "$dst = "#mnemonic# !if(isAbs, "(##", "(#")#"$addr)",
+ bits<3> MajOp>
+ : LDInst <(outs RC:$dst), (ins ImmOp:$addr),
+ "$dst = "#mnemonic# "(#$addr)",
[], "", V2LDST_tc_ld_SLOT01> {
bits<5> dst;
bits<19> addr;
@@ -3642,7 +3643,7 @@ class T_LoadAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp,
class T_LoadAbs <string mnemonic, RegisterClass RC, Operand ImmOp,
bits<3> MajOp>
- : T_LoadAbsGP <mnemonic, RC, ImmOp, MajOp, u32Imm, 1>, AddrModeRel {
+ : T_LoadAbsGP <mnemonic, RC, u32MustExt, MajOp>, AddrModeRel {
string ImmOpStr = !cast<string>(ImmOp);
let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19,
@@ -3660,10 +3661,11 @@ class T_LoadAbs <string mnemonic, RegisterClass RC, Operand ImmOp,
// Template class for predicated load instructions with
// absolute addressing mode.
//===----------------------------------------------------------------------===//
-let isPredicated = 1, opExtentBits = 6, opExtendable = 2 in
+let isPredicated = 1, hasSideEffects = 0, hasNewValue = 1, opExtentBits = 6,
+ opExtendable = 2 in
class T_LoadAbs_Pred <string mnemonic, RegisterClass RC, bits<3> MajOp,
bit isPredNot, bit isPredNew>
- : LDInst <(outs RC:$dst), (ins PredRegs:$src1, u6Ext:$absaddr),
+ : LDInst <(outs RC:$dst), (ins PredRegs:$src1, u32MustExt:$absaddr),
!if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
") ")#"$dst = "#mnemonic#"(#$absaddr)">, AddrModeRel {
bits<5> dst;
@@ -3737,7 +3739,7 @@ defm loadrd : LD_Abs<"memd", "LDrid", DoubleRegs, u16_3Imm, 0b110>;
let isAsmParserOnly = 1 in
class T_LoadGP <string mnemonic, string BaseOp, RegisterClass RC, Operand ImmOp,
bits<3> MajOp>
- : T_LoadAbsGP <mnemonic, RC, ImmOp, MajOp, globaladdress, 0>, PredNewRel {
+ : T_LoadAbsGP <mnemonic, RC, ImmOp, MajOp>, PredNewRel {
let BaseOpcode = BaseOp#_abs;
}
@@ -3841,26 +3843,6 @@ let AddedComplexity = 100 in {
def: Stoream_pat<truncstorei32, I64, addrga, LoReg, S2_storeriabs>;
}
-// Map from Pd = load(globaladdress) -> Rd = memb(globaladdress), Pd = Rd
-let AddedComplexity = 100 in
-def : Pat <(i1 (load (HexagonCONST32_GP tglobaladdr:$global))),
- (i1 (C2_tfrrp (i32 (L2_loadrbgp tglobaladdr:$global))))>;
-
-// Transfer global address into a register
-let isExtended = 1, opExtendable = 1, AddedComplexity=50, isMoveImm = 1,
-isAsCheapAsAMove = 1, isReMaterializable = 1, isCodeGenOnly = 1 in
-def TFRI_V4 : ALU32_ri<(outs IntRegs:$dst), (ins s16Ext:$src1),
- "$dst = #$src1",
- [(set IntRegs:$dst, (HexagonCONST32 tglobaladdr:$src1))]>;
-
-// Transfer a block address into a register
-def : Pat<(HexagonCONST32_GP tblockaddress:$src1),
- (TFRI_V4 tblockaddress:$src1)>;
-
-let AddedComplexity = 50 in
-def : Pat<(HexagonCONST32_GP tglobaladdr:$src1),
- (TFRI_V4 tglobaladdr:$src1)>;
-
// i8/i16/i32 -> i64 loads
// We need a complexity of 120 here to override preceding handling of
// zextload.
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV5.td b/lib/Target/Hexagon/HexagonInstrInfoV5.td
index 337f4ea2184a..823961fb6e6f 100644
--- a/lib/Target/Hexagon/HexagonInstrInfoV5.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoV5.td
@@ -98,21 +98,21 @@ def CONST32_Float_Real : LDInst<(outs IntRegs:$dst), (ins f32imm:$src1),
// HexagonInstrInfo.td patterns.
let isExtended = 1, opExtendable = 1, isMoveImm = 1, isReMaterializable = 1,
isPredicable = 1, AddedComplexity = 30, validSubTargets = HasV5SubT,
- isCodeGenOnly = 1 in
+ isCodeGenOnly = 1, isPseudo = 1 in
def TFRI_f : ALU32_ri<(outs IntRegs:$dst), (ins f32Ext:$src1),
"$dst = #$src1",
[(set F32:$dst, fpimm:$src1)]>,
Requires<[HasV5T]>;
-let isExtended = 1, opExtendable = 2, isPredicated = 1,
- hasSideEffects = 0, validSubTargets = HasV5SubT, isCodeGenOnly = 1 in
+let isExtended = 1, opExtendable = 2, isPredicated = 1, hasSideEffects = 0,
+ validSubTargets = HasV5SubT, isCodeGenOnly = 1, isPseudo = 1 in
def TFRI_cPt_f : ALU32_ri<(outs IntRegs:$dst),
(ins PredRegs:$src1, f32Ext:$src2),
"if ($src1) $dst = #$src2", []>,
Requires<[HasV5T]>;
-let isPseudo = 1, isExtended = 1, opExtendable = 2, isPredicated = 1,
- isPredicatedFalse = 1, hasSideEffects = 0, validSubTargets = HasV5SubT in
+let isExtended = 1, opExtendable = 2, isPredicated = 1, isPredicatedFalse = 1,
+ hasSideEffects = 0, validSubTargets = HasV5SubT, isPseudo = 1 in
def TFRI_cNotPt_f : ALU32_ri<(outs IntRegs:$dst),
(ins PredRegs:$src1, f32Ext:$src2),
"if (!$src1) $dst = #$src2", []>,
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV60.td b/lib/Target/Hexagon/HexagonInstrInfoV60.td
new file mode 100644
index 000000000000..897ada081534
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonInstrInfoV60.td
@@ -0,0 +1,2241 @@
+//=- HexagonInstrInfoV60.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V60 instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+
+// Vector store
+let mayStore = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in
+{
+ class VSTInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = CVI_VM_ST,
+ IType type = TypeCVI_VM_ST>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>, OpcodeHexagon;
+
+}
+
+// Vector load
+let Predicates = [HasV60T, UseHVX] in
+let mayLoad = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in
+ class V6_LDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = CVI_VM_LD,
+ IType type = TypeCVI_VM_LD>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>;
+
+let Predicates = [HasV60T, UseHVX] in
+let mayStore = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in
+class V6_STInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = CVI_VM_ST,
+ IType type = TypeCVI_VM_ST>
+: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>;
+
+//===----------------------------------------------------------------------===//
+// Vector loads with base + immediate offset
+//===----------------------------------------------------------------------===//
+let addrMode = BaseImmOffset, accessSize = Vector64Access in
+class T_vload_ai<string asmStr>
+ : V6_LDInst <(outs VectorRegs:$dst), (ins IntRegs:$src1, s4_6Imm:$src2),
+ asmStr>;
+
+let isCodeGenOnly = 1, addrMode = BaseImmOffset, accessSize = Vector128Access in
+class T_vload_ai_128B<string asmStr>
+ : V6_LDInst <(outs VectorRegs128B:$dst), (ins IntRegs:$src1, s4_7Imm:$src2),
+ asmStr>;
+
+let isCVLoadable = 1, hasNewValue = 1 in {
+ def V6_vL32b_ai : T_vload_ai <"$dst = vmem($src1+#$src2)">,
+ V6_vL32b_ai_enc;
+ def V6_vL32b_nt_ai : T_vload_ai <"$dst = vmem($src1+#$src2):nt">,
+ V6_vL32b_nt_ai_enc;
+ // 128B
+ def V6_vL32b_ai_128B : T_vload_ai_128B <"$dst = vmem($src1+#$src2)">,
+ V6_vL32b_ai_128B_enc;
+ def V6_vL32b_nt_ai_128B : T_vload_ai_128B <"$dst = vmem($src1+#$src2):nt">,
+ V6_vL32b_nt_ai_128B_enc;
+}
+
+let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU, hasNewValue = 1 in {
+ def V6_vL32Ub_ai : T_vload_ai <"$dst = vmemu($src1+#$src2)">,
+ V6_vL32Ub_ai_enc;
+ def V6_vL32Ub_ai_128B : T_vload_ai_128B <"$dst = vmemu($src1+#$src2)">,
+ V6_vL32Ub_ai_128B_enc;
+}
+
+let Itinerary = CVI_VM_LD, Type = TypeCVI_VM_LD, isCVLoad = 1,
+ hasNewValue = 1 in {
+ def V6_vL32b_cur_ai : T_vload_ai <"$dst.cur = vmem($src1+#$src2)">,
+ V6_vL32b_cur_ai_enc;
+ def V6_vL32b_nt_cur_ai : T_vload_ai <"$dst.cur = vmem($src1+#$src2):nt">,
+ V6_vL32b_nt_cur_ai_enc;
+ // 128B
+ def V6_vL32b_cur_ai_128B : T_vload_ai_128B
+ <"$dst.cur = vmem($src1+#$src2)">,
+ V6_vL32b_cur_ai_128B_enc;
+ def V6_vL32b_nt_cur_ai_128B : T_vload_ai_128B
+ <"$dst.cur = vmem($src1+#$src2):nt">,
+ V6_vL32b_nt_cur_ai_128B_enc;
+}
+
+
+let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD, hasNewValue = 1 in {
+ def V6_vL32b_tmp_ai : T_vload_ai <"$dst.tmp = vmem($src1+#$src2)">,
+ V6_vL32b_tmp_ai_enc;
+ def V6_vL32b_nt_tmp_ai : T_vload_ai <"$dst.tmp = vmem($src1+#$src2):nt">,
+ V6_vL32b_nt_tmp_ai_enc;
+ // 128B
+ def V6_vL32b_tmp_ai_128B : T_vload_ai_128B
+ <"$dst.tmp = vmem($src1+#$src2)">,
+ V6_vL32b_tmp_ai_128B_enc;
+ def V6_vL32b_nt_tmp_ai_128B : T_vload_ai_128B
+ <"$dst.tmp = vmem($src1+#$src2)">,
+ V6_vL32b_nt_tmp_ai_128B_enc;
+}
+
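One hedged note on the immediate operands used by the vector load classes above: reading s4_6Imm and s4_7Imm as 4-bit signed offsets scaled by the access size (64-byte and 128-byte vectors respectively) is consistent with the Vector64Access/Vector128Access settings, but it is an interpretation of the operand names rather than something this patch states. A small sketch of that assumed scaling:

#include <cstdint>

// Assumed scaling only: a 4-bit signed slot index times the vector size.
static int32_t vmemOffsetBytes(int32_t Slot /* -8 .. 7 */, unsigned VecBytes) {
  return Slot * static_cast<int32_t>(VecBytes);  // e.g. 1 * 64 = 64 bytes
}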
+//===----------------------------------------------------------------------===//
+// Vector stores with base + immediate offset - unconditional
+//===----------------------------------------------------------------------===//
+let addrMode = BaseImmOffset, accessSize = Vector64Access in
+class T_vstore_ai <string mnemonic, string baseOp, Operand ImmOp,
+ RegisterClass RC, bit isNT>
+ : V6_STInst <(outs), (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
+ mnemonic#"($src1+#$src2)"#!if(isNT, ":nt", "")#" = $src3">, NewValueRel {
+ let BaseOpcode = baseOp;
+}
+
+let accessSize = Vector64Access in
+class T_vstore_ai_64B <string mnemonic, string baseOp, bit isNT = 0>
+ : T_vstore_ai <mnemonic, baseOp, s4_6Imm, VectorRegs, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in
+class T_vstore_ai_128B <string mnemonic, string baseOp, bit isNT = 0>
+ : T_vstore_ai <mnemonic, baseOp#"128B", s4_7Imm, VectorRegs128B, isNT>;
+
+let isNVStorable = 1 in {
+ def V6_vS32b_ai : T_vstore_ai_64B <"vmem", "vS32b_ai">,
+ V6_vS32b_ai_enc;
+ def V6_vS32b_ai_128B : T_vstore_ai_128B <"vmem", "vS32b_ai">,
+ V6_vS32b_ai_128B_enc;
+}
+
+let isNVStorable = 1, isNonTemporal = 1 in {
+ def V6_vS32b_nt_ai : T_vstore_ai_64B <"vmem", "vS32b_ai", 1>,
+ V6_vS32b_nt_ai_enc;
+ def V6_vS32b_nt_ai_128B : T_vstore_ai_128B <"vmem", "vS32b_ai", 1>,
+ V6_vS32b_nt_ai_128B_enc;
+}
+
+let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in {
+ def V6_vS32Ub_ai : T_vstore_ai_64B <"vmemu", "vs32Ub_ai">,
+ V6_vS32Ub_ai_enc;
+ def V6_vS32Ub_ai_128B : T_vstore_ai_128B <"vmemu", "vs32Ub_ai">,
+ V6_vS32Ub_ai_128B_enc;
+}
+//===----------------------------------------------------------------------===//
+// Vector stores with base + immediate offset - unconditional new
+//===----------------------------------------------------------------------===//
+let addrMode = BaseImmOffset, isNewValue = 1, opNewValue = 2, isNVStore = 1,
+ Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST in
+class T_vstore_new_ai <string baseOp, Operand ImmOp, RegisterClass RC, bit isNT>
+ : V6_STInst <(outs ), (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
+ "vmem($src1+#$src2)"#!if(isNT, ":nt", "")#" = $src3.new">, NewValueRel {
+ let BaseOpcode = baseOp;
+}
+
+let accessSize = Vector64Access in
+class T_vstore_new_ai_64B <string baseOp, bit isNT = 0>
+ : T_vstore_new_ai <baseOp, s4_6Imm, VectorRegs, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in
+class T_vstore_new_ai_128B <string baseOp, bit isNT = 0>
+ : T_vstore_new_ai <baseOp#"128B", s4_7Imm, VectorRegs128B, isNT>;
+
+def V6_vS32b_new_ai : T_vstore_new_ai_64B <"vS32b_ai">, V6_vS32b_new_ai_enc;
+def V6_vS32b_new_ai_128B : T_vstore_new_ai_128B <"vS32b_ai">,
+ V6_vS32b_new_ai_128B_enc;
+
+let isNonTemporal = 1 in {
+ def V6_vS32b_nt_new_ai : T_vstore_new_ai_64B<"vS32b_ai", 1>,
+ V6_vS32b_nt_new_ai_enc;
+ def V6_vS32b_nt_new_ai_128B : T_vstore_new_ai_128B<"vS32b_ai", 1>,
+ V6_vS32b_nt_new_ai_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Vector stores with base + immediate offset - conditional
+//===----------------------------------------------------------------------===//
+let addrMode = BaseImmOffset, isPredicated = 1 in
+class T_vstore_pred_ai <string mnemonic, string baseOp, Operand ImmOp,
+ RegisterClass RC, bit isPredNot = 0, bit isNT = 0>
+ : V6_STInst <(outs),
+ (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
+ "if ("#!if(isPredNot, "!", "")#"$src1) "
+ #mnemonic#"($src2+#$src3)"#!if(isNT, ":nt", "")#" = $src4">, NewValueRel {
+ let isPredicatedFalse = isPredNot;
+ let BaseOpcode = baseOp;
+}
+
+let accessSize = Vector64Access in
+class T_vstore_pred_ai_64B <string mnemonic, string baseOp,
+ bit isPredNot = 0, bit isNT = 0>
+ : T_vstore_pred_ai <mnemonic, baseOp, s4_6Imm, VectorRegs, isPredNot, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in
+class T_vstore_pred_ai_128B <string mnemonic, string baseOp,
+ bit isPredNot = 0, bit isNT = 0>
+ : T_vstore_pred_ai <mnemonic, baseOp#"128B", s4_7Imm, VectorRegs128B,
+ isPredNot, isNT>;
+
+let isNVStorable = 1 in {
+ def V6_vS32b_pred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai">,
+ V6_vS32b_pred_ai_enc;
+ def V6_vS32b_npred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 1>,
+ V6_vS32b_npred_ai_enc;
+ // 128B
+ def V6_vS32b_pred_ai_128B : T_vstore_pred_ai_128B <"vmem", "vS32b_ai">,
+ V6_vS32b_pred_ai_128B_enc;
+ def V6_vS32b_npred_ai_128B : T_vstore_pred_ai_128B <"vmem", "vS32b_ai", 1>,
+ V6_vS32b_npred_ai_128B_enc;
+}
+let isNVStorable = 1, isNonTemporal = 1 in {
+ def V6_vS32b_nt_pred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 0, 1>,
+ V6_vS32b_nt_pred_ai_enc;
+ def V6_vS32b_nt_npred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 1, 1>,
+ V6_vS32b_nt_npred_ai_enc;
+ // 128B
+ def V6_vS32b_nt_pred_ai_128B : T_vstore_pred_ai_128B
+ <"vmem", "vS32b_ai", 0, 1>,
+ V6_vS32b_nt_pred_ai_128B_enc;
+ def V6_vS32b_nt_npred_ai_128B : T_vstore_pred_ai_128B
+ <"vmem", "vS32b_ai", 1, 1>,
+ V6_vS32b_nt_npred_ai_128B_enc;
+}
+
+let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in {
+ def V6_vS32Ub_pred_ai : T_vstore_pred_ai_64B <"vmemu", "vS32Ub_ai">,
+ V6_vS32Ub_pred_ai_enc;
+ def V6_vS32Ub_npred_ai : T_vstore_pred_ai_64B <"vmemu", "vS32Ub_ai", 1>,
+ V6_vS32Ub_npred_ai_enc;
+ // 128B
+ def V6_vS32Ub_pred_ai_128B :T_vstore_pred_ai_128B <"vmemu", "vS32Ub_ai">,
+ V6_vS32Ub_pred_ai_128B_enc;
+ def V6_vS32Ub_npred_ai_128B :T_vstore_pred_ai_128B <"vmemu", "vS32Ub_ai", 1>,
+ V6_vS32Ub_npred_ai_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Vector stores with base + immediate offset - byte-enabled aligned
+//===----------------------------------------------------------------------===//
+let addrMode = BaseImmOffset in
+class T_vstore_qpred_ai <Operand ImmOp, RegisterClass RC,
+ bit isPredNot = 0, bit isNT = 0>
+ : V6_STInst <(outs),
+ (ins VecPredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
+ "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2+#$src3)"
+ #!if(isNT, ":nt", "")#" = $src4"> {
+ let isPredicatedFalse = isPredNot;
+}
+
+let accessSize = Vector64Access in
+class T_vstore_qpred_ai_64B <bit isPredNot = 0, bit isNT = 0>
+ : T_vstore_qpred_ai <s4_6Imm, VectorRegs, isPredNot, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in
+class T_vstore_qpred_ai_128B <bit isPredNot = 0, bit isNT = 0>
+ : T_vstore_qpred_ai <s4_7Imm, VectorRegs128B, isPredNot, isNT>;
+
+def V6_vS32b_qpred_ai : T_vstore_qpred_ai_64B, V6_vS32b_qpred_ai_enc;
+def V6_vS32b_nqpred_ai : T_vstore_qpred_ai_64B <1>,
+ V6_vS32b_nqpred_ai_enc;
+def V6_vS32b_nt_qpred_ai : T_vstore_qpred_ai_64B <0, 1>,
+ V6_vS32b_nt_qpred_ai_enc;
+def V6_vS32b_nt_nqpred_ai : T_vstore_qpred_ai_64B <1, 1>,
+ V6_vS32b_nt_nqpred_ai_enc;
+// 128B
+def V6_vS32b_qpred_ai_128B : T_vstore_qpred_ai_128B, V6_vS32b_qpred_ai_128B_enc;
+def V6_vS32b_nqpred_ai_128B : T_vstore_qpred_ai_128B<1>,
+ V6_vS32b_nqpred_ai_128B_enc;
+def V6_vS32b_nt_qpred_ai_128B : T_vstore_qpred_ai_128B<0, 1>,
+ V6_vS32b_nt_qpred_ai_128B_enc;
+def V6_vS32b_nt_nqpred_ai_128B : T_vstore_qpred_ai_128B<1, 1>,
+ V6_vS32b_nt_nqpred_ai_128B_enc;
+
+
+//===----------------------------------------------------------------------===//
+// Vector stores with base + immediate offset - conditional new
+//===----------------------------------------------------------------------===//
+let addrMode = BaseImmOffset, isPredicated = 1, isNewValue = 1, opNewValue = 3,
+ isNVStore = 1, Type = TypeCVI_VM_NEW_ST, Itinerary = CVI_VM_NEW_ST in
+class T_vstore_new_pred_ai <string baseOp, Operand ImmOp, RegisterClass RC,
+ bit isPredNot, bit isNT>
+ : V6_STInst <(outs),
+ (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
+ "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2+#$src3)"
+ #!if(isNT, ":nt", "")#" = $src4.new">, NewValueRel {
+ let isPredicatedFalse = isPredNot;
+ let BaseOpcode = baseOp;
+}
+
+let accessSize = Vector64Access in
+class T_vstore_new_pred_ai_64B <string baseOp, bit isPredNot = 0, bit isNT = 0>
+ : T_vstore_new_pred_ai <baseOp, s4_6Imm, VectorRegs, isPredNot, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in
+class T_vstore_new_pred_ai_128B <string baseOp, bit isPredNot = 0, bit isNT = 0>
+ : T_vstore_new_pred_ai <baseOp#"128B", s4_7Imm, VectorRegs128B,
+ isPredNot, isNT>;
+
+
+def V6_vS32b_new_pred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai">,
+ V6_vS32b_new_pred_ai_enc;
+def V6_vS32b_new_npred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai", 1>,
+ V6_vS32b_new_npred_ai_enc;
+// 128B
+def V6_vS32b_new_pred_ai_128B : T_vstore_new_pred_ai_128B <"vS32b_ai">,
+ V6_vS32b_new_pred_ai_128B_enc;
+def V6_vS32b_new_npred_ai_128B : T_vstore_new_pred_ai_128B <"vS32b_ai", 1>,
+ V6_vS32b_new_npred_ai_128B_enc;
+let isNonTemporal = 1 in {
+ def V6_vS32b_nt_new_pred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai", 0, 1>,
+ V6_vS32b_nt_new_pred_ai_enc;
+ def V6_vS32b_nt_new_npred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai", 1, 1>,
+ V6_vS32b_nt_new_npred_ai_enc;
+ // 128B
+ def V6_vS32b_nt_new_pred_ai_128B : T_vstore_new_pred_ai_128B
+ <"vS32b_ai", 0, 1>,
+ V6_vS32b_nt_new_pred_ai_128B_enc;
+ def V6_vS32b_nt_new_npred_ai_128B : T_vstore_new_pred_ai_128B
+ <"vS32b_ai", 1, 1>,
+ V6_vS32b_nt_new_npred_ai_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment vector loads with immediate offset.
+//===----------------------------------------------------------------------===//
+let addrMode = PostInc, hasNewValue = 1 in
+class T_vload_pi<string asmStr, Operand ImmOp, RegisterClass RC>
+ : V6_LDInst <(outs RC:$dst, IntRegs:$_dst_),
+ (ins IntRegs:$src1, ImmOp:$src2), asmStr, [],
+ "$src1 = $_dst_">;
+
+let accessSize = Vector64Access in
+class T_vload_pi_64B <string asmStr>
+ : T_vload_pi <asmStr, s3_6Imm, VectorRegs>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in
+class T_vload_pi_128B <string asmStr>
+ : T_vload_pi <asmStr, s3_7Imm, VectorRegs128B>;
+
+let isCVLoadable = 1 in {
+ def V6_vL32b_pi : T_vload_pi_64B <"$dst = vmem($src1++#$src2)">,
+ V6_vL32b_pi_enc;
+ def V6_vL32b_nt_pi : T_vload_pi_64B <"$dst = vmem($src1++#$src2):nt">,
+ V6_vL32b_nt_pi_enc;
+ // 128B
+ def V6_vL32b_pi_128B : T_vload_pi_128B <"$dst = vmem($src1++#$src2)">,
+ V6_vL32b_pi_128B_enc;
+ def V6_vL32b_nt_pi_128B : T_vload_pi_128B <"$dst = vmem($src1++#$src2):nt">,
+ V6_vL32b_nt_pi_128B_enc;
+}
+
+let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU in {
+ def V6_vL32Ub_pi : T_vload_pi_64B <"$dst = vmemu($src1++#$src2)">,
+ V6_vL32Ub_pi_enc;
+ // 128B
+ def V6_vL32Ub_pi_128B : T_vload_pi_128B <"$dst = vmemu($src1++#$src2)">,
+ V6_vL32Ub_pi_128B_enc;
+}
+
+let isCVLoad = 1, Itinerary = CVI_VM_LD, Type = TypeCVI_VM_LD in {
+ def V6_vL32b_cur_pi : T_vload_pi_64B <"$dst.cur = vmem($src1++#$src2)">,
+ V6_vL32b_cur_pi_enc;
+ def V6_vL32b_nt_cur_pi : T_vload_pi_64B <"$dst.cur = vmem($src1++#$src2):nt">,
+ V6_vL32b_nt_cur_pi_enc;
+ // 128B
+ def V6_vL32b_cur_pi_128B : T_vload_pi_128B
+ <"$dst.cur = vmem($src1++#$src2)">,
+ V6_vL32b_cur_pi_128B_enc;
+ def V6_vL32b_nt_cur_pi_128B : T_vload_pi_128B
+ <"$dst.cur = vmem($src1++#$src2):nt">,
+ V6_vL32b_nt_cur_pi_128B_enc;
+}
+
+let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD in {
+ def V6_vL32b_tmp_pi : T_vload_pi_64B <"$dst.tmp = vmem($src1++#$src2)">,
+ V6_vL32b_tmp_pi_enc;
+ def V6_vL32b_nt_tmp_pi : T_vload_pi_64B <"$dst.tmp = vmem($src1++#$src2):nt">,
+ V6_vL32b_nt_tmp_pi_enc;
+ // 128B
+ def V6_vL32b_tmp_pi_128B : T_vload_pi_128B
+ <"$dst.tmp = vmem($src1++#$src2)">,
+ V6_vL32b_tmp_pi_128B_enc;
+ def V6_vL32b_nt_tmp_pi_128B : T_vload_pi_128B
+ <"$dst.tmp = vmem($src1++#$src2):nt">,
+ V6_vL32b_nt_tmp_pi_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment vector stores with immediate offset.
+//===----------------------------------------------------------------------===//
+let addrMode = PostInc in
+class T_vstore_pi <string mnemonic, string baseOp, Operand ImmOp,
+ RegisterClass RC, bit isNT>
+ : V6_STInst <(outs IntRegs:$_dst_),
+ (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
+ mnemonic#"($src1++#$src2)"#!if(isNT, ":nt", "")#" = $src3", [],
+ "$src1 = $_dst_">, NewValueRel {
+ let BaseOpcode = baseOp;
+}
+
+let accessSize = Vector64Access in
+class T_vstore_pi_64B <string mnemonic, string baseOp, bit isNT = 0>
+ : T_vstore_pi <mnemonic, baseOp, s3_6Imm, VectorRegs, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in
+class T_vstore_pi_128B <string mnemonic, string baseOp, bit isNT = 0>
+ : T_vstore_pi <mnemonic, baseOp#"128B", s3_7Imm, VectorRegs128B, isNT>;
+
+let isNVStorable = 1 in {
+ def V6_vS32b_pi : T_vstore_pi_64B <"vmem", "vS32b_pi">, V6_vS32b_pi_enc;
+ def V6_vS32b_pi_128B : T_vstore_pi_128B <"vmem", "vS32b_pi">,
+ V6_vS32b_pi_128B_enc;
+}
+
+let isNVStorable = 1, isNonTemporal = 1 in {
+ def V6_vS32b_nt_pi : T_vstore_pi_64B <"vmem", "vS32b_pi", 1>,
+ V6_vS32b_nt_pi_enc;
+ def V6_vS32b_nt_pi_128B : T_vstore_pi_128B <"vmem", "vS32b_pi", 1>,
+ V6_vS32b_nt_pi_128B_enc;
+}
+
+
+let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in {
+ def V6_vS32Ub_pi : T_vstore_pi_64B <"vmemu", "vS32Ub_pi">,
+ V6_vS32Ub_pi_enc;
+ def V6_vS32Ub_pi_128B : T_vstore_pi_128B <"vmemu", "vS32Ub_pi">,
+ V6_vS32Ub_pi_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment unconditional .new vector stores with immediate offset.
+//===----------------------------------------------------------------------===//
+let addrMode = PostInc, isNVStore = 1, isNewValue = 1, opNewValue = 3,
+ Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST in
+class T_vstore_new_pi <string baseOp, Operand ImmOp, RegisterClass RC, bit isNT>
+ : V6_STInst <(outs IntRegs:$_dst_),
+ (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
+ "vmem($src1++#$src2)"#!if(isNT, ":nt", "")#" = $src3.new", [],
+ "$src1 = $_dst_">, NewValueRel {
+ let BaseOpcode = baseOp;
+}
+
+let accessSize = Vector64Access in
+class T_vstore_new_pi_64B <string baseOp, bit isNT = 0>
+ : T_vstore_new_pi <baseOp, s3_6Imm, VectorRegs, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in
+class T_vstore_new_pi_128B <string baseOp, bit isNT = 0>
+ : T_vstore_new_pi <baseOp#"128B", s3_7Imm, VectorRegs128B, isNT>;
+
+
+def V6_vS32b_new_pi : T_vstore_new_pi_64B <"vS32b_pi">,
+ V6_vS32b_new_pi_enc;
+def V6_vS32b_new_pi_128B : T_vstore_new_pi_128B <"vS32b_pi">,
+ V6_vS32b_new_pi_128B_enc;
+
+let isNonTemporal = 1 in {
+ def V6_vS32b_nt_new_pi : T_vstore_new_pi_64B <"vS32b_pi", 1>,
+ V6_vS32b_nt_new_pi_enc;
+ def V6_vS32b_nt_new_pi_128B : T_vstore_new_pi_128B <"vS32b_pi", 1>,
+ V6_vS32b_nt_new_pi_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment conditional vector stores with immediate offset
+//===----------------------------------------------------------------------===//
+let isPredicated = 1, addrMode = PostInc in
+class T_vstore_pred_pi <string mnemonic, string baseOp, Operand ImmOp,
+ RegisterClass RC, bit isPredNot, bit isNT>
+ : V6_STInst<(outs IntRegs:$_dst_),
+ (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
+ "if ("#!if(isPredNot, "!", "")#"$src1) "#mnemonic#"($src2++#$src3)"
+ #!if(isNT, ":nt", "")#" = $src4", [],
+ "$src2 = $_dst_">, NewValueRel {
+ let isPredicatedFalse = isPredNot;
+ let BaseOpcode = baseOp;
+}
+
+let accessSize = Vector64Access in
+class T_vstore_pred_pi_64B <string mnemonic, string baseOp,
+ bit isPredNot = 0, bit isNT = 0>
+ : T_vstore_pred_pi <mnemonic, baseOp, s3_6Imm, VectorRegs, isPredNot, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in
+class T_vstore_pred_pi_128B <string mnemonic, string baseOp,
+ bit isPredNot = 0, bit isNT = 0>
+ : T_vstore_pred_pi <mnemonic, baseOp#"128B", s3_7Imm, VectorRegs128B,
+ isPredNot, isNT>;
+
+let isNVStorable = 1 in {
+ def V6_vS32b_pred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi">,
+ V6_vS32b_pred_pi_enc;
+ def V6_vS32b_npred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 1>,
+ V6_vS32b_npred_pi_enc;
+ // 128B
+ def V6_vS32b_pred_pi_128B : T_vstore_pred_pi_128B <"vmem", "vS32b_pi">,
+ V6_vS32b_pred_pi_128B_enc;
+ def V6_vS32b_npred_pi_128B : T_vstore_pred_pi_128B <"vmem", "vS32b_pi", 1>,
+ V6_vS32b_npred_pi_128B_enc;
+}
+let isNVStorable = 1, isNonTemporal = 1 in {
+ def V6_vS32b_nt_pred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 0, 1>,
+ V6_vS32b_nt_pred_pi_enc;
+ def V6_vS32b_nt_npred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 1, 1>,
+ V6_vS32b_nt_npred_pi_enc;
+ // 128B
+ def V6_vS32b_nt_pred_pi_128B : T_vstore_pred_pi_128B
+ <"vmem", "vS32b_pi", 0, 1>,
+ V6_vS32b_nt_pred_pi_128B_enc;
+ def V6_vS32b_nt_npred_pi_128B : T_vstore_pred_pi_128B
+ <"vmem", "vS32b_pi", 1, 1>,
+ V6_vS32b_nt_npred_pi_128B_enc;
+}
+
+let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in {
+ def V6_vS32Ub_pred_pi : T_vstore_pred_pi_64B <"vmemu", "vS32Ub_pi">,
+ V6_vS32Ub_pred_pi_enc;
+ def V6_vS32Ub_npred_pi : T_vstore_pred_pi_64B <"vmemu", "vS32Ub_pi", 1>,
+ V6_vS32Ub_npred_pi_enc;
+ // 128B
+ def V6_vS32Ub_pred_pi_128B : T_vstore_pred_pi_128B <"vmemu", "vS32Ub_pi">,
+ V6_vS32Ub_pred_pi_128B_enc;
+ def V6_vS32Ub_npred_pi_128B : T_vstore_pred_pi_128B <"vmemu", "vS32Ub_pi", 1>,
+ V6_vS32Ub_npred_pi_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment vector stores with immediate offset - byte-enabled aligned
+//===----------------------------------------------------------------------===//
+let addrMode = PostInc in
+class T_vstore_qpred_pi <Operand ImmOp, RegisterClass RC, bit isPredNot = 0,
+ bit isNT = 0>
+ : V6_STInst <(outs IntRegs:$_dst_),
+ (ins VecPredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
+ "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2++#$src3)"
+ #!if(isNT, ":nt", "")#" = $src4", [],
+ "$src2 = $_dst_">;
+
+let accessSize = Vector64Access in
+class T_vstore_qpred_pi_64B <bit isPredNot = 0, bit isNT = 0>
+ : T_vstore_qpred_pi <s3_6Imm, VectorRegs, isPredNot, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in
+class T_vstore_qpred_pi_128B <bit isPredNot = 0, bit isNT = 0>
+ : T_vstore_qpred_pi <s3_7Imm, VectorRegs128B, isPredNot, isNT>;
+
+def V6_vS32b_qpred_pi : T_vstore_qpred_pi_64B, V6_vS32b_qpred_pi_enc;
+def V6_vS32b_nqpred_pi : T_vstore_qpred_pi_64B <1>, V6_vS32b_nqpred_pi_enc;
+// 128B
+def V6_vS32b_qpred_pi_128B : T_vstore_qpred_pi_128B,
+ V6_vS32b_qpred_pi_128B_enc;
+def V6_vS32b_nqpred_pi_128B : T_vstore_qpred_pi_128B<1>,
+ V6_vS32b_nqpred_pi_128B_enc;
+
+let isNonTemporal = 1 in {
+ def V6_vS32b_nt_qpred_pi : T_vstore_qpred_pi_64B <0, 1>,
+ V6_vS32b_nt_qpred_pi_enc;
+ def V6_vS32b_nt_nqpred_pi : T_vstore_qpred_pi_64B <1, 1>,
+ V6_vS32b_nt_nqpred_pi_enc;
+ // 128B
+ def V6_vS32b_nt_qpred_pi_128B : T_vstore_qpred_pi_128B<0, 1>,
+ V6_vS32b_nt_qpred_pi_128B_enc;
+ def V6_vS32b_nt_nqpred_pi_128B : T_vstore_qpred_pi_128B<1, 1>,
+ V6_vS32b_nt_nqpred_pi_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment conditional .new vector stores with immediate offset
+//===----------------------------------------------------------------------===//
+let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isPredicated = 1,
+ isNewValue = 1, opNewValue = 4, addrMode = PostInc, isNVStore = 1 in
+class T_vstore_new_pred_pi <string baseOp, Operand ImmOp, RegisterClass RC,
+ bit isPredNot, bit isNT>
+ : V6_STInst <(outs IntRegs:$_dst_),
+ (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
+ "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2++#$src3)"
+ #!if(isNT, ":nt", "")#" = $src4.new", [],
+ "$src2 = $_dst_"> , NewValueRel {
+ let isPredicatedFalse = isPredNot;
+ let BaseOpcode = baseOp;
+}
+
+let accessSize = Vector64Access in
+class T_vstore_new_pred_pi_64B <string baseOp, bit isPredNot = 0, bit isNT = 0>
+ : T_vstore_new_pred_pi <baseOp, s3_6Imm, VectorRegs, isPredNot, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in
+class T_vstore_new_pred_pi_128B <string baseOp, bit isPredNot = 0, bit isNT = 0>
+ : T_vstore_new_pred_pi <baseOp#"128B", s3_7Imm, VectorRegs128B,
+ isPredNot, isNT>;
+
+def V6_vS32b_new_pred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi">,
+ V6_vS32b_new_pred_pi_enc;
+def V6_vS32b_new_npred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi", 1>,
+ V6_vS32b_new_npred_pi_enc;
+// 128B
+def V6_vS32b_new_pred_pi_128B : T_vstore_new_pred_pi_128B <"vS32b_pi">,
+ V6_vS32b_new_pred_pi_128B_enc;
+def V6_vS32b_new_npred_pi_128B : T_vstore_new_pred_pi_128B <"vS32b_pi", 1>,
+ V6_vS32b_new_npred_pi_128B_enc;
+let isNonTemporal = 1 in {
+ def V6_vS32b_nt_new_pred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi", 0, 1>,
+ V6_vS32b_nt_new_pred_pi_enc;
+ def V6_vS32b_nt_new_npred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi", 1, 1>,
+ V6_vS32b_nt_new_npred_pi_enc;
+ // 128B
+ def V6_vS32b_nt_new_pred_pi_128B : T_vstore_new_pred_pi_128B
+ <"vS32b_pi", 0, 1>,
+ V6_vS32b_nt_new_pred_pi_128B_enc;
+ def V6_vS32b_nt_new_npred_pi_128B : T_vstore_new_pred_pi_128B
+ <"vS32b_pi", 1, 1>,
+ V6_vS32b_nt_new_npred_pi_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment vector loads with register offset
+//===----------------------------------------------------------------------===//
+let hasNewValue = 1 in
+class T_vload_ppu<string asmStr>
+ : V6_LDInst <(outs VectorRegs:$dst, IntRegs:$_dst_),
+ (ins IntRegs:$src1, ModRegs:$src2), asmStr, [],
+ "$src1 = $_dst_">, NewValueRel;
+
+let isCVLoadable = 1 in {
+ def V6_vL32b_ppu : T_vload_ppu <"$dst = vmem($src1++$src2)">,
+ V6_vL32b_ppu_enc;
+ def V6_vL32b_nt_ppu : T_vload_ppu <"$dst = vmem($src1++$src2):nt">,
+ V6_vL32b_nt_ppu_enc;
+}
+
+let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU in
+def V6_vL32Ub_ppu : T_vload_ppu <"$dst = vmemu($src1++$src2)">,
+ V6_vL32Ub_ppu_enc;
+
+let isCVLoad = 1, Itinerary = CVI_VM_CUR_LD, Type = TypeCVI_VM_CUR_LD in {
+ def V6_vL32b_cur_ppu : T_vload_ppu <"$dst.cur = vmem($src1++$src2)">,
+ V6_vL32b_cur_ppu_enc;
+ def V6_vL32b_nt_cur_ppu : T_vload_ppu <"$dst.cur = vmem($src1++$src2):nt">,
+ V6_vL32b_nt_cur_ppu_enc;
+}
+
+let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD in {
+ def V6_vL32b_tmp_ppu : T_vload_ppu <"$dst.tmp = vmem($src1++$src2)">,
+ V6_vL32b_tmp_ppu_enc;
+ def V6_vL32b_nt_tmp_ppu : T_vload_ppu <"$dst.tmp = vmem($src1++$src2):nt">,
+ V6_vL32b_nt_tmp_ppu_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment vector stores with register offset
+//===----------------------------------------------------------------------===//
+class T_vstore_ppu <string mnemonic, bit isNT = 0>
+ : V6_STInst <(outs IntRegs:$_dst_),
+ (ins IntRegs:$src1, ModRegs:$src2, VectorRegs:$src3),
+ mnemonic#"($src1++$src2)"#!if(isNT, ":nt", "")#" = $src3", [],
+ "$src1 = $_dst_">, NewValueRel;
+
+let isNVStorable = 1, BaseOpcode = "vS32b_ppu" in {
+ def V6_vS32b_ppu : T_vstore_ppu <"vmem">,
+ V6_vS32b_ppu_enc;
+ let isNonTemporal = 1, BaseOpcode = "vS32b_ppu" in
+ def V6_vS32b_nt_ppu : T_vstore_ppu <"vmem", 1>,
+ V6_vS32b_nt_ppu_enc;
+}
+
+let BaseOpcode = "vS32Ub_ppu", Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in
+def V6_vS32Ub_ppu : T_vstore_ppu <"vmemu">, V6_vS32Ub_ppu_enc;
+
+//===----------------------------------------------------------------------===//
+// Post increment .new vector stores with register offset
+//===----------------------------------------------------------------------===//
+let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isNewValue = 1,
+ opNewValue = 3, isNVStore = 1 in
+class T_vstore_new_ppu <bit isNT = 0>
+ : V6_STInst <(outs IntRegs:$_dst_),
+ (ins IntRegs:$src1, ModRegs:$src2, VectorRegs:$src3),
+ "vmem($src1++$src2)"#!if(isNT, ":nt", "")#" = $src3.new", [],
+ "$src1 = $_dst_">, NewValueRel;
+
+let BaseOpcode = "vS32b_ppu" in
+def V6_vS32b_new_ppu : T_vstore_new_ppu, V6_vS32b_new_ppu_enc;
+
+let BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in
+def V6_vS32b_nt_new_ppu : T_vstore_new_ppu<1>, V6_vS32b_nt_new_ppu_enc;
+
+//===----------------------------------------------------------------------===//
+// Post increment conditional vector stores with register offset
+//===----------------------------------------------------------------------===//
+let isPredicated = 1 in
+class T_vstore_pred_ppu <string mnemonic, bit isPredNot = 0, bit isNT = 0>
+ : V6_STInst<(outs IntRegs:$_dst_),
+ (ins PredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4),
+ "if ("#!if(isPredNot, "!", "")#"$src1) "#mnemonic#"($src2++$src3)"
+ #!if(isNT, ":nt", "")#" = $src4", [],
+ "$src2 = $_dst_">, NewValueRel {
+ let isPredicatedFalse = isPredNot;
+}
+
+let isNVStorable = 1, BaseOpcode = "vS32b_ppu" in {
+ def V6_vS32b_pred_ppu : T_vstore_pred_ppu<"vmem">, V6_vS32b_pred_ppu_enc;
+ def V6_vS32b_npred_ppu: T_vstore_pred_ppu<"vmem", 1>, V6_vS32b_npred_ppu_enc;
+}
+
+let isNVStorable = 1, BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in {
+ def V6_vS32b_nt_pred_ppu : T_vstore_pred_ppu <"vmem", 0, 1>,
+ V6_vS32b_nt_pred_ppu_enc;
+ def V6_vS32b_nt_npred_ppu : T_vstore_pred_ppu <"vmem", 1, 1>,
+ V6_vS32b_nt_npred_ppu_enc;
+}
+
+let BaseOpcode = "vS32Ub_ppu", Itinerary = CVI_VM_STU,
+ Type = TypeCVI_VM_STU in {
+ def V6_vS32Ub_pred_ppu : T_vstore_pred_ppu <"vmemu">,
+ V6_vS32Ub_pred_ppu_enc;
+ def V6_vS32Ub_npred_ppu : T_vstore_pred_ppu <"vmemu", 1>,
+ V6_vS32Ub_npred_ppu_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment vector stores with register offset - byte-enabled aligned
+//===----------------------------------------------------------------------===//
+class T_vstore_qpred_ppu <bit isPredNot = 0, bit isNT = 0>
+ : V6_STInst <(outs IntRegs:$_dst_),
+ (ins VecPredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4),
+ "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2++$src3)"
+ #!if(isNT, ":nt", "")#" = $src4", [],
+ "$src2 = $_dst_">, NewValueRel;
+
+def V6_vS32b_qpred_ppu : T_vstore_qpred_ppu, V6_vS32b_qpred_ppu_enc;
+def V6_vS32b_nqpred_ppu : T_vstore_qpred_ppu<1>, V6_vS32b_nqpred_ppu_enc;
+def V6_vS32b_nt_qpred_ppu : T_vstore_qpred_ppu<0, 1>,
+ V6_vS32b_nt_qpred_ppu_enc;
+def V6_vS32b_nt_nqpred_ppu : T_vstore_qpred_ppu<1, 1>,
+ V6_vS32b_nt_nqpred_ppu_enc;
+
+//===----------------------------------------------------------------------===//
+// Post increment conditional .new vector stores with register offset
+//===----------------------------------------------------------------------===//
+let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isPredicated = 1,
+ isNewValue = 1, opNewValue = 4, isNVStore = 1 in
+class T_vstore_new_pred_ppu <bit isPredNot = 0, bit isNT = 0>
+ : V6_STInst <(outs IntRegs:$_dst_),
+ (ins PredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4),
+ "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2++$src3)"
+ #!if(isNT, ":nt", "")#" = $src4.new", [],
+ "$src2 = $_dst_">, NewValueRel {
+ let isPredicatedFalse = isPredNot;
+}
+
+let BaseOpcode = "vS32b_ppu" in {
+ def V6_vS32b_new_pred_ppu : T_vstore_new_pred_ppu,
+ V6_vS32b_new_pred_ppu_enc;
+ def V6_vS32b_new_npred_ppu : T_vstore_new_pred_ppu<1>,
+ V6_vS32b_new_npred_ppu_enc;
+}
+
+let BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in {
+def V6_vS32b_nt_new_pred_ppu : T_vstore_new_pred_ppu<0, 1>,
+ V6_vS32b_nt_new_pred_ppu_enc;
+def V6_vS32b_nt_new_npred_ppu : T_vstore_new_pred_ppu<1, 1>,
+ V6_vS32b_nt_new_npred_ppu_enc;
+}
+
+let isPseudo = 1, validSubTargets = HasV60SubT in
+class STrivv_template<string mnemonic, Operand ImmOp, RegisterClass RC>:
+ VSTInst<(outs), (ins IntRegs:$addr, ImmOp:$off, RC:$src),
+ mnemonic#"($addr+#$off) = $src", []>;
+
+def STrivv_indexed: STrivv_template<"vvmem", s4_6Imm, VecDblRegs>,
+ Requires<[HasV60T, UseHVXSgl]>;
+def STrivv_indexed_128B: STrivv_template<"vvmem", s4_7Imm, VecDblRegs128B>,
+ Requires<[HasV60T, UseHVXDbl]>;
+
+multiclass STrivv_pats <ValueType VTSgl, ValueType VTDbl> {
+ def : Pat<(store (VTSgl VecDblRegs:$src1), IntRegs:$addr),
+ (STrivv_indexed IntRegs:$addr, #0, (VTSgl VecDblRegs:$src1))>,
+ Requires<[UseHVXSgl]>;
+
+ def : Pat<(store (VTDbl VecDblRegs128B:$src1), IntRegs:$addr),
+ (STrivv_indexed_128B IntRegs:$addr, #0,
+ (VTDbl VecDblRegs128B:$src1))>,
+ Requires<[UseHVXDbl]>;
+}
+
+defm : STrivv_pats <v128i8, v256i8>;
+defm : STrivv_pats <v64i16, v128i16>;
+defm : STrivv_pats <v32i32, v64i32>;
+defm : STrivv_pats <v16i64, v32i64>;
+
+
+multiclass vS32b_ai_pats <ValueType VTSgl, ValueType VTDbl> {
+ // Aligned stores
+ def : Pat<(store (VTSgl VectorRegs:$src1), IntRegs:$addr),
+ (V6_vS32b_ai IntRegs:$addr, #0, (VTSgl VectorRegs:$src1))>,
+ Requires<[UseHVXSgl]>;
+
+ // 128B Aligned stores
+ def : Pat<(store (VTDbl VectorRegs128B:$src1), IntRegs:$addr),
+ (V6_vS32b_ai_128B IntRegs:$addr, #0, (VTDbl VectorRegs128B:$src1))>,
+ Requires<[UseHVXDbl]>;
+
+ // Fold Add R+IFF into vector store.
+ let AddedComplexity = 10 in
+ def : Pat<(store (VTSgl VectorRegs:$src1),
+ (add IntRegs:$src2, s4_6ImmPred:$offset)),
+ (V6_vS32b_ai IntRegs:$src2, s4_6ImmPred:$offset,
+ (VTSgl VectorRegs:$src1))>,
+ Requires<[UseHVXSgl]>;
+
+ // Fold Add R+IFF into vector store 128B.
+ let AddedComplexity = 10 in
+ def : Pat<(store (VTDbl VectorRegs128B:$src1),
+ (add IntRegs:$src2, s4_7ImmPred:$offset)),
+ (V6_vS32b_ai_128B IntRegs:$src2, s4_7ImmPred:$offset,
+ (VTDbl VectorRegs128B:$src1))>,
+ Requires<[UseHVXDbl]>;
+}
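+// The AddedComplexity = 10 patterns fold an (add base, imm) address directly
+// into the store's base+offset form when the immediate fits the scaled
+// operand, instead of materializing the add separately.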
+
+defm : vS32b_ai_pats <v64i8, v128i8>;
+defm : vS32b_ai_pats <v32i16, v64i16>;
+defm : vS32b_ai_pats <v16i32, v32i32>;
+defm : vS32b_ai_pats <v8i64, v16i64>;
+
+let isPseudo = 1, validSubTargets = HasV60SubT in
+class LDrivv_template<string mnemonic, Operand ImmOp, RegisterClass RC>
+ : V6_LDInst <(outs RC:$dst), (ins IntRegs:$addr, ImmOp:$off),
+ "$dst="#mnemonic#"($addr+#$off)",
+ []>;
+
+def LDrivv_indexed: LDrivv_template<"vvmem", s4_6Imm, VecDblRegs>,
+ Requires<[HasV60T, UseHVXSgl]>;
+def LDrivv_indexed_128B: LDrivv_template<"vvmem", s4_7Imm, VecDblRegs128B>,
+ Requires<[HasV60T, UseHVXDbl]>;
+
+multiclass LDrivv_pats <ValueType VTSgl, ValueType VTDbl> {
+ def : Pat < (VTSgl (load IntRegs:$addr)),
+ (LDrivv_indexed IntRegs:$addr, #0) >,
+ Requires<[UseHVXSgl]>;
+
+ def : Pat < (VTDbl (load IntRegs:$addr)),
+ (LDrivv_indexed_128B IntRegs:$addr, #0) >,
+ Requires<[UseHVXDbl]>;
+}
+
+defm : LDrivv_pats <v128i8, v256i8>;
+defm : LDrivv_pats <v64i16, v128i16>;
+defm : LDrivv_pats <v32i32, v64i32>;
+defm : LDrivv_pats <v16i64, v32i64>;
+
+multiclass vL32b_ai_pats <ValueType VTSgl, ValueType VTDbl> {
+ // Aligned loads
+ def : Pat < (VTSgl (load IntRegs:$addr)),
+ (V6_vL32b_ai IntRegs:$addr, #0) >,
+ Requires<[UseHVXSgl]>;
+
+ // 128B Load
+ def : Pat < (VTDbl (load IntRegs:$addr)),
+ (V6_vL32b_ai_128B IntRegs:$addr, #0) >,
+ Requires<[UseHVXDbl]>;
+
+ // Fold Add R+IFF into vector load.
+ let AddedComplexity = 10 in
+ def : Pat<(VTDbl (load (add IntRegs:$src2, s4_7ImmPred:$offset))),
+ (V6_vL32b_ai_128B IntRegs:$src2, s4_7ImmPred:$offset)>,
+ Requires<[UseHVXDbl]>;
+
+ let AddedComplexity = 10 in
+ def : Pat<(VTSgl (load (add IntRegs:$src2, s4_6ImmPred:$offset))),
+ (V6_vL32b_ai IntRegs:$src2, s4_6ImmPred:$offset)>,
+ Requires<[UseHVXSgl]>;
+}
+
+defm : vL32b_ai_pats <v64i8, v128i8>;
+defm : vL32b_ai_pats <v32i16, v64i16>;
+defm : vL32b_ai_pats <v16i32, v32i32>;
+defm : vL32b_ai_pats <v8i64, v16i64>;
+
+// Store vector predicate pseudo.
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
+ isCodeGenOnly = 1, isPseudo = 1, mayStore = 1, hasSideEffects = 0 in {
+def STriq_pred_V6 : STInst<(outs),
+ (ins IntRegs:$base, s32Imm:$offset, VecPredRegs:$src1),
+ ".error \"should not emit\" ",
+ []>,
+ Requires<[HasV60T,UseHVXSgl]>;
+
+def STriq_pred_vec_V6 : STInst<(outs),
+ (ins IntRegs:$base, s32Imm:$offset, VectorRegs:$src1),
+ ".error \"should not emit\" ",
+ []>,
+ Requires<[HasV60T,UseHVXSgl]>;
+
+def STriq_pred_V6_128B : STInst<(outs),
+ (ins IntRegs:$base, s32Imm:$offset, VecPredRegs128B:$src1),
+ ".error \"should not emit\" ",
+ []>,
+ Requires<[HasV60T,UseHVXDbl]>;
+
+def STriq_pred_vec_V6_128B : STInst<(outs),
+ (ins IntRegs:$base, s32Imm:$offset, VectorRegs128B:$src1),
+ ".error \"should not emit\" ",
+ []>,
+ Requires<[HasV60T,UseHVXDbl]>;
+}
+
+// Load vector predicate pseudo.
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
+ opExtentAlign = 2, isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in {
+def LDriq_pred_V6 : LDInst<(outs VecPredRegs:$dst),
+ (ins IntRegs:$base, s32Imm:$offset),
+ ".error \"should not emit\" ",
+ []>,
+ Requires<[HasV60T,UseHVXSgl]>;
+def LDriq_pred_vec_V6 : LDInst<(outs VectorRegs:$dst),
+ (ins IntRegs:$base, s32Imm:$offset),
+ ".error \"should not emit\" ",
+ []>,
+ Requires<[HasV60T,UseHVXSgl]>;
+def LDriq_pred_V6_128B : LDInst<(outs VecPredRegs128B:$dst),
+ (ins IntRegs:$base, s32Imm:$offset),
+ ".error \"should not emit\" ",
+ []>,
+ Requires<[HasV60T,UseHVXDbl]>;
+def LDriq_pred_vec_V6_128B : LDInst<(outs VectorRegs128B:$dst),
+ (ins IntRegs:$base, s32Imm:$offset),
+ ".error \"should not emit\" ",
+ []>,
+ Requires<[HasV60T,UseHVXDbl]>;
+}
+
+// Store vector pseudo.
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
+ isCodeGenOnly = 1, isPseudo = 1, mayStore = 1, hasSideEffects = 0 in {
+def STriv_pseudo_V6 : STInst<(outs),
+ (ins IntRegs:$base, s32Imm:$offset, VectorRegs:$src1),
+ ".error \"should not emit\" ",
+ []>,
+ Requires<[HasV60T,UseHVXSgl]>;
+def STriv_pseudo_V6_128B : STInst<(outs),
+ (ins IntRegs:$base, s32Imm:$offset, VectorRegs128B:$src1),
+ ".error \"should not emit\" ",
+ []>,
+ Requires<[HasV60T,UseHVXDbl]>;
+}
+
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
+ isCodeGenOnly = 1, isPseudo = 1, mayStore = 1, hasSideEffects = 0 in {
+def STrivv_pseudo_V6 : STInst<(outs),
+ (ins IntRegs:$base, s32Imm:$offset, VecDblRegs:$src1),
+ ".error \"should not emit\" ",
+ []>,
+ Requires<[HasV60T,UseHVXSgl]>;
+def STrivv_pseudo_V6_128B : STInst<(outs),
+ (ins IntRegs:$base, s32Imm:$offset, VecDblRegs128B:$src1),
+ ".error \"should not emit\" ",
+ []>,
+ Requires<[HasV60T,UseHVXDbl]>;
+}
+
+// Load vector pseudo.
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
+ opExtentAlign = 2, isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in {
+def LDriv_pseudo_V6 : LDInst<(outs VectorRegs:$dst),
+ (ins IntRegs:$base, s32Imm:$offset),
+ ".error \"should not emit\" ",
+ []>,
+ Requires<[HasV60T,UseHVXSgl]>;
+def LDriv_pseudo_V6_128B : LDInst<(outs VectorRegs128B:$dst),
+ (ins IntRegs:$base, s32Imm:$offset),
+ ".error \"should not emit\" ",
+ []>,
+ Requires<[HasV60T,UseHVXDbl]>;
+}
+
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
+ opExtentAlign = 2, isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in {
+def LDrivv_pseudo_V6 : LDInst<(outs VecDblRegs:$dst),
+ (ins IntRegs:$base, s32Imm:$offset),
+ ".error \"should not emit\" ",
+ []>,
+ Requires<[HasV60T,UseHVXSgl]>;
+def LDrivv_pseudo_V6_128B : LDInst<(outs VecDblRegs128B:$dst),
+ (ins IntRegs:$base, s32Imm:$offset),
+ ".error \"should not emit\" ",
+ []>,
+ Requires<[HasV60T,UseHVXDbl]>;
+}
+
+class VSELInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+ string cstr = "", InstrItinClass itin = CVI_VA_DV,
+ IType type = TypeCVI_VA_DV>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>;
+
+let isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in {
+def VSelectPseudo_V6 : VSELInst<(outs VectorRegs:$dst),
+ (ins PredRegs:$src1, VectorRegs:$src2, VectorRegs:$src3),
+ ".error \"should not emit\" ",
+ []>,
+ Requires<[HasV60T,UseHVXSgl]>;
+def VSelectDblPseudo_V6 : VSELInst<(outs VecDblRegs:$dst),
+ (ins PredRegs:$src1, VecDblRegs:$src2, VecDblRegs:$src3),
+ ".error \"should not emit\" ",
+ []>,
+ Requires<[HasV60T,UseHVXSgl]>;
+}
+
+def : Pat <(v16i32 (selectcc (i32 IntRegs:$lhs), (i32 IntRegs:$rhs),
+ (v16i32 VectorRegs:$tval),
+ (v16i32 VectorRegs:$fval), SETEQ)),
+ (v16i32 (VSelectPseudo_V6 (i32 (C2_cmpeq (i32 IntRegs:$lhs),
+ (i32 IntRegs:$rhs))),
+ (v16i32 VectorRegs:$tval),
+ (v16i32 VectorRegs:$fval)))>;
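+// VSelectPseudo_V6 is marked isPseudo/isCodeGenOnly with an ".error" asm
+// string, so this pattern only routes the vector select through a pseudo that
+// is expected to be expanded before emission.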
+
+
+let hasNewValue = 1 in
+class T_vmpy <string asmString, RegisterClass RCout, RegisterClass RCin>
+ : CVI_VX_DV_Resource1<(outs RCout:$dst), (ins RCin:$src1, IntRegs:$src2),
+ asmString >;
+
+multiclass T_vmpy <string asmString, RegisterClass RCout,
+ RegisterClass RCin> {
+ def NAME : T_vmpy <asmString, RCout, RCin>;
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_vmpy <asmString, !cast<RegisterClass>(RCout#"128B"),
+ !cast<RegisterClass>(RCin#"128B")>;
+}
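+// A common idiom below: each multiclass emits the 64-byte instruction under
+// the defm's own name plus an isCodeGenOnly NAME#_128B variant whose register
+// classes are rewritten to their 128B counterparts via !cast<RegisterClass>.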
+
+multiclass T_vmpy_VV <string asmString>:
+ T_vmpy <asmString, VectorRegs, VectorRegs>;
+
+multiclass T_vmpy_WW <string asmString>:
+ T_vmpy <asmString, VecDblRegs, VecDblRegs>;
+
+multiclass T_vmpy_VW <string asmString>:
+ T_vmpy <asmString, VectorRegs, VecDblRegs>;
+
+multiclass T_vmpy_WV <string asmString>:
+ T_vmpy <asmString, VecDblRegs, VectorRegs>;
+
+defm V6_vtmpyb :T_vmpy_WW<"$dst.h = vtmpy($src1.b,$src2.b)">, V6_vtmpyb_enc;
+defm V6_vtmpybus :T_vmpy_WW<"$dst.h = vtmpy($src1.ub,$src2.b)">, V6_vtmpybus_enc;
+defm V6_vdsaduh :T_vmpy_WW<"$dst.uw = vdsad($src1.uh,$src2.uh)">, V6_vdsaduh_enc;
+defm V6_vmpybus :T_vmpy_WV<"$dst.h = vmpy($src1.ub,$src2.b)">, V6_vmpybus_enc;
+defm V6_vmpabus :T_vmpy_WW<"$dst.h = vmpa($src1.ub,$src2.b)">, V6_vmpabus_enc;
+defm V6_vmpahb :T_vmpy_WW<"$dst.w = vmpa($src1.h,$src2.b)">, V6_vmpahb_enc;
+defm V6_vmpyh :T_vmpy_WV<"$dst.w = vmpy($src1.h,$src2.h)">, V6_vmpyh_enc;
+defm V6_vmpyuh :T_vmpy_WV<"$dst.uw = vmpy($src1.uh,$src2.uh)">, V6_vmpyuh_enc;
+defm V6_vmpyiwh :T_vmpy_VV<"$dst.w = vmpyi($src1.w,$src2.h)">, V6_vmpyiwh_enc;
+defm V6_vtmpyhb :T_vmpy_WW<"$dst.w = vtmpy($src1.h,$src2.b)">, V6_vtmpyhb_enc;
+defm V6_vmpyub :T_vmpy_WV<"$dst.uh = vmpy($src1.ub,$src2.ub)">, V6_vmpyub_enc;
+
+let Itinerary = CVI_VX_LONG, Type = TypeCVI_VX in
+defm V6_vmpyihb :T_vmpy_VV<"$dst.h = vmpyi($src1.h,$src2.b)">, V6_vmpyihb_enc;
+
+defm V6_vdmpybus_dv :
+ T_vmpy_WW <"$dst.h = vdmpy($src1.ub,$src2.b)">, V6_vdmpybus_dv_enc;
+defm V6_vdmpyhsusat :
+ T_vmpy_VV <"$dst.w = vdmpy($src1.h,$src2.uh):sat">, V6_vdmpyhsusat_enc;
+defm V6_vdmpyhsuisat :
+ T_vmpy_VW <"$dst.w = vdmpy($src1.h,$src2.uh,#1):sat">, V6_vdmpyhsuisat_enc;
+defm V6_vdmpyhsat :
+ T_vmpy_VV <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhsat_enc;
+defm V6_vdmpyhisat :
+ T_vmpy_VW <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhisat_enc;
+defm V6_vdmpyhb_dv :
+ T_vmpy_WW <"$dst.w = vdmpy($src1.h,$src2.b)">, V6_vdmpyhb_dv_enc;
+defm V6_vmpyhss :
+ T_vmpy_VV <"$dst.h = vmpy($src1.h,$src2.h):<<1:sat">, V6_vmpyhss_enc;
+defm V6_vmpyhsrs :
+ T_vmpy_VV <"$dst.h = vmpy($src1.h,$src2.h):<<1:rnd:sat">, V6_vmpyhsrs_enc;
+
+let Itinerary = CVI_VP, Type = TypeCVI_VP in
+defm V6_vror : T_vmpy_VV <"$dst = vror($src1,$src2)">, V6_vror_enc;
+
+let Itinerary = CVI_VX, Type = TypeCVI_VX in {
+defm V6_vdmpyhb : T_vmpy_VV<"$dst.w = vdmpy($src1.h,$src2.b)">, V6_vdmpyhb_enc;
+defm V6_vrmpybus : T_vmpy_VV<"$dst.w = vrmpy($src1.ub,$src2.b)">, V6_vrmpybus_enc;
+defm V6_vdmpybus : T_vmpy_VV<"$dst.h = vdmpy($src1.ub,$src2.b)">, V6_vdmpybus_enc;
+defm V6_vmpyiwb : T_vmpy_VV<"$dst.w = vmpyi($src1.w,$src2.b)">, V6_vmpyiwb_enc;
+defm V6_vrmpyub : T_vmpy_VV<"$dst.uw = vrmpy($src1.ub,$src2.ub)">, V6_vrmpyub_enc;
+}
+
+let Itinerary = CVI_VS, Type = TypeCVI_VS in {
+defm V6_vasrw : T_vmpy_VV <"$dst.w = vasr($src1.w,$src2)">, V6_vasrw_enc;
+defm V6_vasrh : T_vmpy_VV <"$dst.h = vasr($src1.h,$src2)">, V6_vasrh_enc;
+defm V6_vaslw : T_vmpy_VV <"$dst.w = vasl($src1.w,$src2)">, V6_vaslw_enc;
+defm V6_vaslh : T_vmpy_VV <"$dst.h = vasl($src1.h,$src2)">, V6_vaslh_enc;
+defm V6_vlsrw : T_vmpy_VV <"$dst.uw = vlsr($src1.uw,$src2)">, V6_vlsrw_enc;
+defm V6_vlsrh : T_vmpy_VV <"$dst.uh = vlsr($src1.uh,$src2)">, V6_vlsrh_enc;
+}
+
+let hasNewValue = 1 in
+class T_HVX_alu <string asmString, InstrItinClass itin,
+ RegisterClass RCout, RegisterClass RCin>
+ : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1, RCin:$src2),
+ asmString >{
+ let Itinerary = itin;
+ let Type = !cast<IType>("Type"#itin);
+}
+
+multiclass T_HVX_alu <string asmString, RegisterClass RCout,
+ RegisterClass RCin, InstrItinClass itin> {
+ def NAME : T_HVX_alu <asmString, itin, RCout, RCin>;
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_alu <asmString, itin,
+ !cast<RegisterClass>(RCout#"128B"),
+ !cast<RegisterClass>(RCin#"128B")>;
+}
+
+multiclass T_HVX_alu_VV <string asmString>:
+ T_HVX_alu <asmString, VectorRegs, VectorRegs, CVI_VA>;
+
+multiclass T_HVX_alu_WW <string asmString>:
+ T_HVX_alu <asmString, VecDblRegs, VecDblRegs, CVI_VA_DV>;
+
+multiclass T_HVX_alu_WV <string asmString>:
+ T_HVX_alu <asmString, VecDblRegs, VectorRegs, CVI_VX_DV>;
+
+
+let Itinerary = CVI_VX, Type = TypeCVI_VX in {
+defm V6_vrmpyubv :
+ T_HVX_alu_VV <"$dst.uw = vrmpy($src1.ub,$src2.ub)">, V6_vrmpyubv_enc;
+defm V6_vrmpybv :
+ T_HVX_alu_VV <"$dst.w = vrmpy($src1.b,$src2.b)">, V6_vrmpybv_enc;
+defm V6_vrmpybusv :
+ T_HVX_alu_VV <"$dst.w = vrmpy($src1.ub,$src2.b)">, V6_vrmpybusv_enc;
+defm V6_vabsdiffub :
+ T_HVX_alu_VV <"$dst.ub = vabsdiff($src1.ub,$src2.ub)">, V6_vabsdiffub_enc;
+defm V6_vabsdiffh :
+ T_HVX_alu_VV <"$dst.uh = vabsdiff($src1.h,$src2.h)">, V6_vabsdiffh_enc;
+defm V6_vabsdiffuh :
+ T_HVX_alu_VV <"$dst.uh = vabsdiff($src1.uh,$src2.uh)">, V6_vabsdiffuh_enc;
+defm V6_vabsdiffw :
+ T_HVX_alu_VV <"$dst.uw = vabsdiff($src1.w,$src2.w)">, V6_vabsdiffw_enc;
+}
+
+let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in {
+defm V6_vdmpyhvsat :
+ T_HVX_alu_VV <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhvsat_enc;
+defm V6_vmpyhvsrs :
+ T_HVX_alu_VV<"$dst.h = vmpy($src1.h,$src2.h):<<1:rnd:sat">, V6_vmpyhvsrs_enc;
+defm V6_vmpyih :
+ T_HVX_alu_VV <"$dst.h = vmpyi($src1.h,$src2.h)">, V6_vmpyih_enc;
+}
+
+defm V6_vand :
+ T_HVX_alu_VV <"$dst = vand($src1,$src2)">, V6_vand_enc;
+defm V6_vor :
+ T_HVX_alu_VV <"$dst = vor($src1,$src2)">, V6_vor_enc;
+defm V6_vxor :
+ T_HVX_alu_VV <"$dst = vxor($src1,$src2)">, V6_vxor_enc;
+defm V6_vaddw :
+ T_HVX_alu_VV <"$dst.w = vadd($src1.w,$src2.w)">, V6_vaddw_enc;
+defm V6_vaddubsat :
+ T_HVX_alu_VV <"$dst.ub = vadd($src1.ub,$src2.ub):sat">, V6_vaddubsat_enc;
+defm V6_vadduhsat :
+ T_HVX_alu_VV <"$dst.uh = vadd($src1.uh,$src2.uh):sat">, V6_vadduhsat_enc;
+defm V6_vaddhsat :
+ T_HVX_alu_VV <"$dst.h = vadd($src1.h,$src2.h):sat">, V6_vaddhsat_enc;
+defm V6_vaddwsat :
+ T_HVX_alu_VV <"$dst.w = vadd($src1.w,$src2.w):sat">, V6_vaddwsat_enc;
+defm V6_vsubb :
+ T_HVX_alu_VV <"$dst.b = vsub($src1.b,$src2.b)">, V6_vsubb_enc;
+defm V6_vsubh :
+ T_HVX_alu_VV <"$dst.h = vsub($src1.h,$src2.h)">, V6_vsubh_enc;
+defm V6_vsubw :
+ T_HVX_alu_VV <"$dst.w = vsub($src1.w,$src2.w)">, V6_vsubw_enc;
+defm V6_vsububsat :
+ T_HVX_alu_VV <"$dst.ub = vsub($src1.ub,$src2.ub):sat">, V6_vsububsat_enc;
+defm V6_vsubuhsat :
+ T_HVX_alu_VV <"$dst.uh = vsub($src1.uh,$src2.uh):sat">, V6_vsubuhsat_enc;
+defm V6_vsubhsat :
+ T_HVX_alu_VV <"$dst.h = vsub($src1.h,$src2.h):sat">, V6_vsubhsat_enc;
+defm V6_vsubwsat :
+ T_HVX_alu_VV <"$dst.w = vsub($src1.w,$src2.w):sat">, V6_vsubwsat_enc;
+defm V6_vavgub :
+ T_HVX_alu_VV <"$dst.ub = vavg($src1.ub,$src2.ub)">, V6_vavgub_enc;
+defm V6_vavguh :
+ T_HVX_alu_VV <"$dst.uh = vavg($src1.uh,$src2.uh)">, V6_vavguh_enc;
+defm V6_vavgh :
+ T_HVX_alu_VV <"$dst.h = vavg($src1.h,$src2.h)">, V6_vavgh_enc;
+defm V6_vavgw :
+ T_HVX_alu_VV <"$dst.w = vavg($src1.w,$src2.w)">, V6_vavgw_enc;
+defm V6_vnavgub :
+ T_HVX_alu_VV <"$dst.b = vnavg($src1.ub,$src2.ub)">, V6_vnavgub_enc;
+defm V6_vnavgh :
+ T_HVX_alu_VV <"$dst.h = vnavg($src1.h,$src2.h)">, V6_vnavgh_enc;
+defm V6_vnavgw :
+ T_HVX_alu_VV <"$dst.w = vnavg($src1.w,$src2.w)">, V6_vnavgw_enc;
+defm V6_vavgubrnd :
+ T_HVX_alu_VV <"$dst.ub = vavg($src1.ub,$src2.ub):rnd">, V6_vavgubrnd_enc;
+defm V6_vavguhrnd :
+ T_HVX_alu_VV <"$dst.uh = vavg($src1.uh,$src2.uh):rnd">, V6_vavguhrnd_enc;
+defm V6_vavghrnd :
+ T_HVX_alu_VV <"$dst.h = vavg($src1.h,$src2.h):rnd">, V6_vavghrnd_enc;
+defm V6_vavgwrnd :
+ T_HVX_alu_VV <"$dst.w = vavg($src1.w,$src2.w):rnd">, V6_vavgwrnd_enc;
+
+defm V6_vmpybv :
+ T_HVX_alu_WV <"$dst.h = vmpy($src1.b,$src2.b)">, V6_vmpybv_enc;
+defm V6_vmpyubv :
+ T_HVX_alu_WV <"$dst.uh = vmpy($src1.ub,$src2.ub)">, V6_vmpyubv_enc;
+defm V6_vmpybusv :
+ T_HVX_alu_WV <"$dst.h = vmpy($src1.ub,$src2.b)">, V6_vmpybusv_enc;
+defm V6_vmpyhv :
+ T_HVX_alu_WV <"$dst.w = vmpy($src1.h,$src2.h)">, V6_vmpyhv_enc;
+defm V6_vmpyuhv :
+ T_HVX_alu_WV <"$dst.uw = vmpy($src1.uh,$src2.uh)">, V6_vmpyuhv_enc;
+defm V6_vmpyhus :
+ T_HVX_alu_WV <"$dst.w = vmpy($src1.h,$src2.uh)">, V6_vmpyhus_enc;
+defm V6_vaddubh :
+ T_HVX_alu_WV <"$dst.h = vadd($src1.ub,$src2.ub)">, V6_vaddubh_enc;
+defm V6_vadduhw :
+ T_HVX_alu_WV <"$dst.w = vadd($src1.uh,$src2.uh)">, V6_vadduhw_enc;
+defm V6_vaddhw :
+ T_HVX_alu_WV <"$dst.w = vadd($src1.h,$src2.h)">, V6_vaddhw_enc;
+defm V6_vsububh :
+ T_HVX_alu_WV <"$dst.h = vsub($src1.ub,$src2.ub)">, V6_vsububh_enc;
+defm V6_vsubuhw :
+ T_HVX_alu_WV <"$dst.w = vsub($src1.uh,$src2.uh)">, V6_vsubuhw_enc;
+defm V6_vsubhw :
+ T_HVX_alu_WV <"$dst.w = vsub($src1.h,$src2.h)">, V6_vsubhw_enc;
+
+defm V6_vaddb_dv :
+ T_HVX_alu_WW <"$dst.b = vadd($src1.b,$src2.b)">, V6_vaddb_dv_enc;
+defm V6_vaddh_dv :
+ T_HVX_alu_WW <"$dst.h = vadd($src1.h,$src2.h)">, V6_vaddh_dv_enc;
+defm V6_vaddw_dv :
+ T_HVX_alu_WW <"$dst.w = vadd($src1.w,$src2.w)">, V6_vaddw_dv_enc;
+defm V6_vaddubsat_dv :
+ T_HVX_alu_WW <"$dst.ub = vadd($src1.ub,$src2.ub):sat">, V6_vaddubsat_dv_enc;
+defm V6_vadduhsat_dv :
+ T_HVX_alu_WW <"$dst.uh = vadd($src1.uh,$src2.uh):sat">, V6_vadduhsat_dv_enc;
+defm V6_vaddhsat_dv :
+ T_HVX_alu_WW <"$dst.h = vadd($src1.h,$src2.h):sat">, V6_vaddhsat_dv_enc;
+defm V6_vaddwsat_dv :
+ T_HVX_alu_WW <"$dst.w = vadd($src1.w,$src2.w):sat">, V6_vaddwsat_dv_enc;
+defm V6_vsubb_dv :
+ T_HVX_alu_WW <"$dst.b = vsub($src1.b,$src2.b)">, V6_vsubb_dv_enc;
+defm V6_vsubh_dv :
+ T_HVX_alu_WW <"$dst.h = vsub($src1.h,$src2.h)">, V6_vsubh_dv_enc;
+defm V6_vsubw_dv :
+ T_HVX_alu_WW <"$dst.w = vsub($src1.w,$src2.w)">, V6_vsubw_dv_enc;
+defm V6_vsububsat_dv :
+ T_HVX_alu_WW <"$dst.ub = vsub($src1.ub,$src2.ub):sat">, V6_vsububsat_dv_enc;
+defm V6_vsubuhsat_dv :
+ T_HVX_alu_WW <"$dst.uh = vsub($src1.uh,$src2.uh):sat">, V6_vsubuhsat_dv_enc;
+defm V6_vsubhsat_dv :
+ T_HVX_alu_WW <"$dst.h = vsub($src1.h,$src2.h):sat">, V6_vsubhsat_dv_enc;
+defm V6_vsubwsat_dv :
+ T_HVX_alu_WW <"$dst.w = vsub($src1.w,$src2.w):sat">, V6_vsubwsat_dv_enc;
+
+let Itinerary = CVI_VX_DV_LONG, Type = TypeCVI_VX_DV in {
+defm V6_vmpabusv :
+ T_HVX_alu_WW <"$dst.h = vmpa($src1.ub,$src2.b)">, V6_vmpabusv_enc;
+defm V6_vmpabuuv :
+ T_HVX_alu_WW <"$dst.h = vmpa($src1.ub,$src2.ub)">, V6_vmpabuuv_enc;
+}
+
+let isAccumulator = 1, hasNewValue = 1 in
+class T_HVX_vmpyacc <string asmString, InstrItinClass itin, RegisterClass RCout,
+ RegisterClass RCin1, RegisterClass RCin2>
+ : CVI_VA_Resource1 <(outs RCout:$dst),
+ (ins RCout:$_src_, RCin1:$src1, RCin2:$src2), asmString,
+ [], "$dst = $_src_" > {
+ let Itinerary = itin;
+ let Type = !cast<IType>("Type"#itin);
+}
+
+multiclass T_HVX_vmpyacc_both <string asmString, RegisterClass RCout,
+ RegisterClass RCin1, RegisterClass RCin2, InstrItinClass itin > {
+ def NAME : T_HVX_vmpyacc <asmString, itin, RCout, RCin1, RCin2>;
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_vmpyacc <asmString, itin,
+ !cast<RegisterClass>(RCout#"128B"),
+ !cast<RegisterClass>(RCin1#"128B"),
+ !cast<RegisterClass>(RCin2#
+ !if(!eq (!cast<string>(RCin2), "IntRegs"), "", "128B"))>;
+}
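+// The !if(!eq(..., "IntRegs"), "", "128B") guard keeps scalar IntRegs
+// operands unchanged when building the 128B variant; only vector register
+// classes get the "128B" suffix appended.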
+
+multiclass T_HVX_vmpyacc_VVR <string asmString>:
+ T_HVX_vmpyacc_both <asmString, VectorRegs, VectorRegs, IntRegs, CVI_VX>;
+
+multiclass T_HVX_vmpyacc_VWR <string asmString>:
+ T_HVX_vmpyacc_both <asmString, VectorRegs, VecDblRegs, IntRegs, CVI_VX_DV>;
+
+multiclass T_HVX_vmpyacc_WVR <string asmString>:
+ T_HVX_vmpyacc_both <asmString, VecDblRegs, VectorRegs, IntRegs, CVI_VX_DV>;
+
+multiclass T_HVX_vmpyacc_WWR <string asmString>:
+ T_HVX_vmpyacc_both <asmString, VecDblRegs, VecDblRegs, IntRegs, CVI_VX_DV>;
+
+multiclass T_HVX_vmpyacc_VVV <string asmString>:
+ T_HVX_vmpyacc_both <asmString, VectorRegs, VectorRegs, VectorRegs, CVI_VX_DV>;
+
+multiclass T_HVX_vmpyacc_WVV <string asmString>:
+ T_HVX_vmpyacc_both <asmString, VecDblRegs, VectorRegs, VectorRegs, CVI_VX_DV>;
+
+
+defm V6_vtmpyb_acc :
+ T_HVX_vmpyacc_WWR <"$dst.h += vtmpy($src1.b,$src2.b)">,
+ V6_vtmpyb_acc_enc;
+defm V6_vtmpybus_acc :
+ T_HVX_vmpyacc_WWR <"$dst.h += vtmpy($src1.ub,$src2.b)">,
+ V6_vtmpybus_acc_enc;
+defm V6_vtmpyhb_acc :
+ T_HVX_vmpyacc_WWR <"$dst.w += vtmpy($src1.h,$src2.b)">,
+ V6_vtmpyhb_acc_enc;
+defm V6_vdmpyhb_acc :
+ T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.b)">,
+ V6_vdmpyhb_acc_enc;
+defm V6_vrmpyub_acc :
+ T_HVX_vmpyacc_VVR <"$dst.uw += vrmpy($src1.ub,$src2.ub)">,
+ V6_vrmpyub_acc_enc;
+defm V6_vrmpybus_acc :
+ T_HVX_vmpyacc_VVR <"$dst.w += vrmpy($src1.ub,$src2.b)">,
+ V6_vrmpybus_acc_enc;
+defm V6_vdmpybus_acc :
+ T_HVX_vmpyacc_VVR <"$dst.h += vdmpy($src1.ub,$src2.b)">,
+ V6_vdmpybus_acc_enc;
+defm V6_vdmpybus_dv_acc :
+ T_HVX_vmpyacc_WWR <"$dst.h += vdmpy($src1.ub,$src2.b)">,
+ V6_vdmpybus_dv_acc_enc;
+defm V6_vdmpyhsuisat_acc :
+ T_HVX_vmpyacc_VWR <"$dst.w += vdmpy($src1.h,$src2.uh,#1):sat">,
+ V6_vdmpyhsuisat_acc_enc;
+defm V6_vdmpyhisat_acc :
+ T_HVX_vmpyacc_VWR <"$dst.w += vdmpy($src1.h,$src2.h):sat">,
+ V6_vdmpyhisat_acc_enc;
+defm V6_vdmpyhb_dv_acc :
+ T_HVX_vmpyacc_WWR <"$dst.w += vdmpy($src1.h,$src2.b)">,
+ V6_vdmpyhb_dv_acc_enc;
+defm V6_vmpybus_acc :
+ T_HVX_vmpyacc_WVR <"$dst.h += vmpy($src1.ub,$src2.b)">,
+ V6_vmpybus_acc_enc;
+defm V6_vmpabus_acc :
+ T_HVX_vmpyacc_WWR <"$dst.h += vmpa($src1.ub,$src2.b)">,
+ V6_vmpabus_acc_enc;
+defm V6_vmpahb_acc :
+ T_HVX_vmpyacc_WWR <"$dst.w += vmpa($src1.h,$src2.b)">,
+ V6_vmpahb_acc_enc;
+defm V6_vmpyhsat_acc :
+ T_HVX_vmpyacc_WVR <"$dst.w += vmpy($src1.h,$src2.h):sat">,
+ V6_vmpyhsat_acc_enc;
+defm V6_vmpyuh_acc :
+ T_HVX_vmpyacc_WVR <"$dst.uw += vmpy($src1.uh,$src2.uh)">,
+ V6_vmpyuh_acc_enc;
+defm V6_vmpyiwb_acc :
+ T_HVX_vmpyacc_VVR <"$dst.w += vmpyi($src1.w,$src2.b)">,
+ V6_vmpyiwb_acc_enc;
+defm V6_vdsaduh_acc :
+ T_HVX_vmpyacc_WWR <"$dst.uw += vdsad($src1.uh,$src2.uh)">,
+ V6_vdsaduh_acc_enc;
+defm V6_vmpyihb_acc :
+ T_HVX_vmpyacc_VVR <"$dst.h += vmpyi($src1.h,$src2.b)">,
+ V6_vmpyihb_acc_enc;
+defm V6_vmpyub_acc :
+ T_HVX_vmpyacc_WVR <"$dst.uh += vmpy($src1.ub,$src2.ub)">,
+ V6_vmpyub_acc_enc;
+
+let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in {
+defm V6_vdmpyhsusat_acc :
+ T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.uh):sat">,
+ V6_vdmpyhsusat_acc_enc;
+defm V6_vdmpyhsat_acc :
+ T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.h):sat">,
+ V6_vdmpyhsat_acc_enc;
+defm V6_vmpyiwh_acc : T_HVX_vmpyacc_VVR
+ <"$dst.w += vmpyi($src1.w,$src2.h)">, V6_vmpyiwh_acc_enc;
+}
+
+let Itinerary = CVI_VS, Type = TypeCVI_VS in {
+defm V6_vaslw_acc :
+ T_HVX_vmpyacc_VVR <"$dst.w += vasl($src1.w,$src2)">, V6_vaslw_acc_enc;
+defm V6_vasrw_acc :
+ T_HVX_vmpyacc_VVR <"$dst.w += vasr($src1.w,$src2)">, V6_vasrw_acc_enc;
+}
+
+defm V6_vdmpyhvsat_acc :
+ T_HVX_vmpyacc_VVV <"$dst.w += vdmpy($src1.h,$src2.h):sat">,
+ V6_vdmpyhvsat_acc_enc;
+defm V6_vmpybusv_acc :
+ T_HVX_vmpyacc_WVV <"$dst.h += vmpy($src1.ub,$src2.b)">,
+ V6_vmpybusv_acc_enc;
+defm V6_vmpybv_acc :
+ T_HVX_vmpyacc_WVV <"$dst.h += vmpy($src1.b,$src2.b)">, V6_vmpybv_acc_enc;
+defm V6_vmpyhus_acc :
+ T_HVX_vmpyacc_WVV <"$dst.w += vmpy($src1.h,$src2.uh)">, V6_vmpyhus_acc_enc;
+defm V6_vmpyhv_acc :
+ T_HVX_vmpyacc_WVV <"$dst.w += vmpy($src1.h,$src2.h)">, V6_vmpyhv_acc_enc;
+defm V6_vmpyiewh_acc :
+ T_HVX_vmpyacc_VVV <"$dst.w += vmpyie($src1.w,$src2.h)">,
+ V6_vmpyiewh_acc_enc;
+defm V6_vmpyiewuh_acc :
+ T_HVX_vmpyacc_VVV <"$dst.w += vmpyie($src1.w,$src2.uh)">,
+ V6_vmpyiewuh_acc_enc;
+defm V6_vmpyih_acc :
+ T_HVX_vmpyacc_VVV <"$dst.h += vmpyi($src1.h,$src2.h)">, V6_vmpyih_acc_enc;
+defm V6_vmpyowh_rnd_sacc :
+ T_HVX_vmpyacc_VVV <"$dst.w += vmpyo($src1.w,$src2.h):<<1:rnd:sat:shift">,
+ V6_vmpyowh_rnd_sacc_enc;
+defm V6_vmpyowh_sacc :
+ T_HVX_vmpyacc_VVV <"$dst.w += vmpyo($src1.w,$src2.h):<<1:sat:shift">,
+ V6_vmpyowh_sacc_enc;
+defm V6_vmpyubv_acc :
+ T_HVX_vmpyacc_WVV <"$dst.uh += vmpy($src1.ub,$src2.ub)">,
+ V6_vmpyubv_acc_enc;
+defm V6_vmpyuhv_acc :
+ T_HVX_vmpyacc_WVV <"$dst.uw += vmpy($src1.uh,$src2.uh)">,
+ V6_vmpyuhv_acc_enc;
+defm V6_vrmpybusv_acc :
+ T_HVX_vmpyacc_VVV <"$dst.w += vrmpy($src1.ub,$src2.b)">,
+ V6_vrmpybusv_acc_enc;
+defm V6_vrmpybv_acc :
+ T_HVX_vmpyacc_VVV <"$dst.w += vrmpy($src1.b,$src2.b)">, V6_vrmpybv_acc_enc;
+defm V6_vrmpyubv_acc :
+ T_HVX_vmpyacc_VVV <"$dst.uw += vrmpy($src1.ub,$src2.ub)">,
+ V6_vrmpyubv_acc_enc;
+
+
+class T_HVX_vcmp <string asmString, RegisterClass RCout, RegisterClass RCin>
+ : CVI_VA_Resource1 <(outs RCout:$dst),
+ (ins RCout:$_src_, RCin:$src1, RCin:$src2), asmString,
+ [], "$dst = $_src_" > {
+ let Itinerary = CVI_VA;
+ let Type = TypeCVI_VA;
+}
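+// These compare-accumulate forms read the vector predicate destination as
+// $_src_ and combine the new comparison into it (&=, |=, ^= in the asm
+// strings), hence the "$dst = $_src_" tie.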
+
+multiclass T_HVX_vcmp <string asmString> {
+ def NAME : T_HVX_vcmp <asmString, VecPredRegs, VectorRegs>;
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_vcmp <asmString, VecPredRegs128B, VectorRegs128B>;
+}
+
+defm V6_veqb_and :
+ T_HVX_vcmp <"$dst &= vcmp.eq($src1.b,$src2.b)">, V6_veqb_and_enc;
+defm V6_veqh_and :
+ T_HVX_vcmp <"$dst &= vcmp.eq($src1.h,$src2.h)">, V6_veqh_and_enc;
+defm V6_veqw_and :
+ T_HVX_vcmp <"$dst &= vcmp.eq($src1.w,$src2.w)">, V6_veqw_and_enc;
+defm V6_vgtb_and :
+ T_HVX_vcmp <"$dst &= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_and_enc;
+defm V6_vgth_and :
+ T_HVX_vcmp <"$dst &= vcmp.gt($src1.h,$src2.h)">, V6_vgth_and_enc;
+defm V6_vgtw_and :
+ T_HVX_vcmp <"$dst &= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_and_enc;
+defm V6_vgtub_and :
+ T_HVX_vcmp <"$dst &= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_and_enc;
+defm V6_vgtuh_and :
+ T_HVX_vcmp <"$dst &= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_and_enc;
+defm V6_vgtuw_and :
+ T_HVX_vcmp <"$dst &= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_and_enc;
+defm V6_veqb_or :
+ T_HVX_vcmp <"$dst |= vcmp.eq($src1.b,$src2.b)">, V6_veqb_or_enc;
+defm V6_veqh_or :
+ T_HVX_vcmp <"$dst |= vcmp.eq($src1.h,$src2.h)">, V6_veqh_or_enc;
+defm V6_veqw_or :
+ T_HVX_vcmp <"$dst |= vcmp.eq($src1.w,$src2.w)">, V6_veqw_or_enc;
+defm V6_vgtb_or :
+ T_HVX_vcmp <"$dst |= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_or_enc;
+defm V6_vgth_or :
+ T_HVX_vcmp <"$dst |= vcmp.gt($src1.h,$src2.h)">, V6_vgth_or_enc;
+defm V6_vgtw_or :
+ T_HVX_vcmp <"$dst |= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_or_enc;
+defm V6_vgtub_or :
+ T_HVX_vcmp <"$dst |= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_or_enc;
+defm V6_vgtuh_or :
+ T_HVX_vcmp <"$dst |= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_or_enc;
+defm V6_vgtuw_or :
+ T_HVX_vcmp <"$dst |= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_or_enc;
+defm V6_veqb_xor :
+ T_HVX_vcmp <"$dst ^= vcmp.eq($src1.b,$src2.b)">, V6_veqb_xor_enc;
+defm V6_veqh_xor :
+ T_HVX_vcmp <"$dst ^= vcmp.eq($src1.h,$src2.h)">, V6_veqh_xor_enc;
+defm V6_veqw_xor :
+ T_HVX_vcmp <"$dst ^= vcmp.eq($src1.w,$src2.w)">, V6_veqw_xor_enc;
+defm V6_vgtb_xor :
+ T_HVX_vcmp <"$dst ^= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_xor_enc;
+defm V6_vgth_xor :
+ T_HVX_vcmp <"$dst ^= vcmp.gt($src1.h,$src2.h)">, V6_vgth_xor_enc;
+defm V6_vgtw_xor :
+ T_HVX_vcmp <"$dst ^= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_xor_enc;
+defm V6_vgtub_xor :
+ T_HVX_vcmp <"$dst ^= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_xor_enc;
+defm V6_vgtuh_xor :
+ T_HVX_vcmp <"$dst ^= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_xor_enc;
+defm V6_vgtuw_xor :
+ T_HVX_vcmp <"$dst ^= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_xor_enc;
+
+defm V6_vminub :
+ T_HVX_alu_VV <"$dst.ub = vmin($src1.ub,$src2.ub)">, V6_vminub_enc;
+defm V6_vminuh :
+ T_HVX_alu_VV <"$dst.uh = vmin($src1.uh,$src2.uh)">, V6_vminuh_enc;
+defm V6_vminh :
+ T_HVX_alu_VV <"$dst.h = vmin($src1.h,$src2.h)">, V6_vminh_enc;
+defm V6_vminw :
+ T_HVX_alu_VV <"$dst.w = vmin($src1.w,$src2.w)">, V6_vminw_enc;
+defm V6_vmaxub :
+ T_HVX_alu_VV <"$dst.ub = vmax($src1.ub,$src2.ub)">, V6_vmaxub_enc;
+defm V6_vmaxuh :
+ T_HVX_alu_VV <"$dst.uh = vmax($src1.uh,$src2.uh)">, V6_vmaxuh_enc;
+defm V6_vmaxh :
+ T_HVX_alu_VV <"$dst.h = vmax($src1.h,$src2.h)">, V6_vmaxh_enc;
+defm V6_vmaxw :
+ T_HVX_alu_VV <"$dst.w = vmax($src1.w,$src2.w)">, V6_vmaxw_enc;
+defm V6_vshuffeb :
+ T_HVX_alu_VV <"$dst.b = vshuffe($src1.b,$src2.b)">, V6_vshuffeb_enc;
+defm V6_vshuffob :
+ T_HVX_alu_VV <"$dst.b = vshuffo($src1.b,$src2.b)">, V6_vshuffob_enc;
+defm V6_vshufeh :
+ T_HVX_alu_VV <"$dst.h = vshuffe($src1.h,$src2.h)">, V6_vshufeh_enc;
+defm V6_vshufoh :
+ T_HVX_alu_VV <"$dst.h = vshuffo($src1.h,$src2.h)">, V6_vshufoh_enc;
+
+let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in {
+defm V6_vmpyowh_rnd :
+ T_HVX_alu_VV <"$dst.w = vmpyo($src1.w,$src2.h):<<1:rnd:sat">,
+ V6_vmpyowh_rnd_enc;
+defm V6_vmpyiewuh :
+ T_HVX_alu_VV <"$dst.w = vmpyie($src1.w,$src2.uh)">, V6_vmpyiewuh_enc;
+defm V6_vmpyewuh :
+ T_HVX_alu_VV <"$dst.w = vmpye($src1.w,$src2.uh)">, V6_vmpyewuh_enc;
+defm V6_vmpyowh :
+ T_HVX_alu_VV <"$dst.w = vmpyo($src1.w,$src2.h):<<1:sat">, V6_vmpyowh_enc;
+defm V6_vmpyiowh :
+ T_HVX_alu_VV <"$dst.w = vmpyio($src1.w,$src2.h)">, V6_vmpyiowh_enc;
+}
+let Itinerary = CVI_VX, Type = TypeCVI_VX in
+defm V6_vmpyieoh :
+ T_HVX_alu_VV <"$dst.w = vmpyieo($src1.h,$src2.h)">, V6_vmpyieoh_enc;
+
+let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in {
+defm V6_vshufoeh :
+ T_HVX_alu_WV <"$dst.h = vshuffoe($src1.h,$src2.h)">, V6_vshufoeh_enc;
+defm V6_vshufoeb :
+ T_HVX_alu_WV <"$dst.b = vshuffoe($src1.b,$src2.b)">, V6_vshufoeb_enc;
+}
+
+let isRegSequence = 1, Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in
+defm V6_vcombine :
+ T_HVX_alu_WV <"$dst = vcombine($src1,$src2)">, V6_vcombine_enc;
+
+def SDTHexagonVCOMBINE: SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>,
+ SDTCisSubVecOfVec<1, 0>]>;
+
+def HexagonVCOMBINE: SDNode<"HexagonISD::VCOMBINE", SDTHexagonVCOMBINE>;
+
+def: Pat<(v32i32 (HexagonVCOMBINE (v16i32 VectorRegs:$Vs),
+ (v16i32 VectorRegs:$Vt))),
+ (V6_vcombine VectorRegs:$Vs, VectorRegs:$Vt)>,
+ Requires<[UseHVXSgl]>;
+def: Pat<(v64i32 (HexagonVCOMBINE (v32i32 VecDblRegs:$Vs),
+ (v32i32 VecDblRegs:$Vt))),
+ (V6_vcombine_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>,
+ Requires<[UseHVXDbl]>;
+
+let Itinerary = CVI_VINLANESAT, Type = TypeCVI_VINLANESAT in {
+defm V6_vsathub :
+ T_HVX_alu_VV <"$dst.ub = vsat($src1.h,$src2.h)">, V6_vsathub_enc;
+defm V6_vsatwh :
+ T_HVX_alu_VV <"$dst.h = vsat($src1.w,$src2.w)">, V6_vsatwh_enc;
+}
+
+let Itinerary = CVI_VS, Type = TypeCVI_VS in {
+defm V6_vroundwh :
+ T_HVX_alu_VV <"$dst.h = vround($src1.w,$src2.w):sat">, V6_vroundwh_enc;
+defm V6_vroundwuh :
+ T_HVX_alu_VV <"$dst.uh = vround($src1.w,$src2.w):sat">, V6_vroundwuh_enc;
+defm V6_vroundhb :
+ T_HVX_alu_VV <"$dst.b = vround($src1.h,$src2.h):sat">, V6_vroundhb_enc;
+defm V6_vroundhub :
+ T_HVX_alu_VV <"$dst.ub = vround($src1.h,$src2.h):sat">, V6_vroundhub_enc;
+defm V6_vasrwv :
+ T_HVX_alu_VV <"$dst.w = vasr($src1.w,$src2.w)">, V6_vasrwv_enc;
+defm V6_vlsrwv :
+ T_HVX_alu_VV <"$dst.w = vlsr($src1.w,$src2.w)">, V6_vlsrwv_enc;
+defm V6_vlsrhv :
+ T_HVX_alu_VV <"$dst.h = vlsr($src1.h,$src2.h)">, V6_vlsrhv_enc;
+defm V6_vasrhv :
+ T_HVX_alu_VV <"$dst.h = vasr($src1.h,$src2.h)">, V6_vasrhv_enc;
+defm V6_vaslwv :
+ T_HVX_alu_VV <"$dst.w = vasl($src1.w,$src2.w)">, V6_vaslwv_enc;
+defm V6_vaslhv :
+ T_HVX_alu_VV <"$dst.h = vasl($src1.h,$src2.h)">, V6_vaslhv_enc;
+}
+
+defm V6_vaddb :
+ T_HVX_alu_VV <"$dst.b = vadd($src1.b,$src2.b)">, V6_vaddb_enc;
+defm V6_vaddh :
+ T_HVX_alu_VV <"$dst.h = vadd($src1.h,$src2.h)">, V6_vaddh_enc;
+
+let Itinerary = CVI_VP, Type = TypeCVI_VP in {
+defm V6_vdelta :
+ T_HVX_alu_VV <"$dst = vdelta($src1,$src2)">, V6_vdelta_enc;
+defm V6_vrdelta :
+ T_HVX_alu_VV <"$dst = vrdelta($src1,$src2)">, V6_vrdelta_enc;
+defm V6_vdealb4w :
+ T_HVX_alu_VV <"$dst.b = vdeale($src1.b,$src2.b)">, V6_vdealb4w_enc;
+defm V6_vpackeb :
+ T_HVX_alu_VV <"$dst.b = vpacke($src1.h,$src2.h)">, V6_vpackeb_enc;
+defm V6_vpackeh :
+ T_HVX_alu_VV <"$dst.h = vpacke($src1.w,$src2.w)">, V6_vpackeh_enc;
+defm V6_vpackhub_sat :
+ T_HVX_alu_VV <"$dst.ub = vpack($src1.h,$src2.h):sat">, V6_vpackhub_sat_enc;
+defm V6_vpackhb_sat :
+ T_HVX_alu_VV <"$dst.b = vpack($src1.h,$src2.h):sat">, V6_vpackhb_sat_enc;
+defm V6_vpackwuh_sat :
+ T_HVX_alu_VV <"$dst.uh = vpack($src1.w,$src2.w):sat">, V6_vpackwuh_sat_enc;
+defm V6_vpackwh_sat :
+ T_HVX_alu_VV <"$dst.h = vpack($src1.w,$src2.w):sat">, V6_vpackwh_sat_enc;
+defm V6_vpackob :
+ T_HVX_alu_VV <"$dst.b = vpacko($src1.h,$src2.h)">, V6_vpackob_enc;
+defm V6_vpackoh :
+ T_HVX_alu_VV <"$dst.h = vpacko($src1.w,$src2.w)">, V6_vpackoh_enc;
+}
+
+let hasNewValue = 1, hasSideEffects = 0 in
+class T_HVX_condALU <string asmString, RegisterClass RC1, RegisterClass RC2>
+ : CVI_VA_Resource1 <(outs RC2:$dst),
+ (ins RC1:$src1, RC2:$_src_, RC2:$src2), asmString,
+ [], "$dst = $_src_" > {
+ let Itinerary = CVI_VA;
+ let Type = TypeCVI_VA;
+}
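+// Here $src1 is a vector predicate, so the add/subtract in the asm string is
+// applied per lane under the predicate (or under its complement for the "nq"
+// variants below).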
+
+multiclass T_HVX_condALU <string asmString> {
+ def NAME : T_HVX_condALU <asmString, VecPredRegs, VectorRegs>;
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_condALU <asmString, VecPredRegs128B, VectorRegs128B>;
+}
+
+defm V6_vaddbq : T_HVX_condALU <"if ($src1) $dst.b += $src2.b">,
+ V6_vaddbq_enc;
+defm V6_vaddhq : T_HVX_condALU <"if ($src1) $dst.h += $src2.h">,
+ V6_vaddhq_enc;
+defm V6_vaddwq : T_HVX_condALU <"if ($src1) $dst.w += $src2.w">,
+ V6_vaddwq_enc;
+defm V6_vsubbq : T_HVX_condALU <"if ($src1) $dst.b -= $src2.b">,
+ V6_vsubbq_enc;
+defm V6_vsubhq : T_HVX_condALU <"if ($src1) $dst.h -= $src2.h">,
+ V6_vsubhq_enc;
+defm V6_vsubwq : T_HVX_condALU <"if ($src1) $dst.w -= $src2.w">,
+ V6_vsubwq_enc;
+defm V6_vaddbnq : T_HVX_condALU <"if (!$src1) $dst.b += $src2.b">,
+ V6_vaddbnq_enc;
+defm V6_vaddhnq : T_HVX_condALU <"if (!$src1) $dst.h += $src2.h">,
+ V6_vaddhnq_enc;
+defm V6_vaddwnq : T_HVX_condALU <"if (!$src1) $dst.w += $src2.w">,
+ V6_vaddwnq_enc;
+defm V6_vsubbnq : T_HVX_condALU <"if (!$src1) $dst.b -= $src2.b">,
+ V6_vsubbnq_enc;
+defm V6_vsubhnq : T_HVX_condALU <"if (!$src1) $dst.h -= $src2.h">,
+ V6_vsubhnq_enc;
+defm V6_vsubwnq : T_HVX_condALU <"if (!$src1) $dst.w -= $src2.w">,
+ V6_vsubwnq_enc;
+
+let hasNewValue = 1 in
+class T_HVX_alu_2op <string asmString, InstrItinClass itin,
+ RegisterClass RCout, RegisterClass RCin>
+ : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1),
+ asmString >{
+ let Itinerary = itin;
+ let Type = !cast<IType>("Type"#itin);
+}
+
+multiclass T_HVX_alu_2op <string asmString, RegisterClass RCout,
+ RegisterClass RCin, InstrItinClass itin> {
+ def NAME : T_HVX_alu_2op <asmString, itin, RCout, RCin>;
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_alu_2op <asmString, itin,
+ !cast<RegisterClass>(RCout#"128B"),
+ !cast<RegisterClass>(RCin#"128B")>;
+}
+
+let hasNewValue = 1 in
+multiclass T_HVX_alu_2op_VV <string asmString>:
+ T_HVX_alu_2op <asmString, VectorRegs, VectorRegs, CVI_VA>;
+
+multiclass T_HVX_alu_2op_WV <string asmString>:
+ T_HVX_alu_2op <asmString, VecDblRegs, VectorRegs, CVI_VA_DV>;
+
+
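+// For illustration, a defm such as V6_vabsh below roughly expands into a
+// V6_vabsh instruction over VectorRegs and a codegen-only V6_vabsh_128B
+// instruction over VectorRegs128B, both using the same assembly string.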
+defm V6_vabsh : T_HVX_alu_2op_VV <"$dst.h = vabs($src1.h)">,
+ V6_vabsh_enc;
+defm V6_vabsw : T_HVX_alu_2op_VV <"$dst.w = vabs($src1.w)">,
+ V6_vabsw_enc;
+defm V6_vabsh_sat : T_HVX_alu_2op_VV <"$dst.h = vabs($src1.h):sat">,
+ V6_vabsh_sat_enc;
+defm V6_vabsw_sat : T_HVX_alu_2op_VV <"$dst.w = vabs($src1.w):sat">,
+ V6_vabsw_sat_enc;
+defm V6_vnot : T_HVX_alu_2op_VV <"$dst = vnot($src1)">,
+ V6_vnot_enc;
+defm V6_vassign : T_HVX_alu_2op_VV <"$dst = $src1">,
+ V6_vassign_enc;
+
+defm V6_vzb : T_HVX_alu_2op_WV <"$dst.uh = vzxt($src1.ub)">,
+ V6_vzb_enc;
+defm V6_vzh : T_HVX_alu_2op_WV <"$dst.uw = vzxt($src1.uh)">,
+ V6_vzh_enc;
+defm V6_vsb : T_HVX_alu_2op_WV <"$dst.h = vsxt($src1.b)">,
+ V6_vsb_enc;
+defm V6_vsh : T_HVX_alu_2op_WV <"$dst.w = vsxt($src1.h)">,
+ V6_vsh_enc;
+
+let Itinerary = CVI_VP, Type = TypeCVI_VP in {
+defm V6_vdealh : T_HVX_alu_2op_VV <"$dst.h = vdeal($src1.h)">,
+ V6_vdealh_enc;
+defm V6_vdealb : T_HVX_alu_2op_VV <"$dst.b = vdeal($src1.b)">,
+ V6_vdealb_enc;
+defm V6_vshuffh : T_HVX_alu_2op_VV <"$dst.h = vshuff($src1.h)">,
+ V6_vshuffh_enc;
+defm V6_vshuffb : T_HVX_alu_2op_VV <"$dst.b = vshuff($src1.b)">,
+ V6_vshuffb_enc;
+}
+
+let Itinerary = CVI_VP_VS, Type = TypeCVI_VP_VS in {
+defm V6_vunpackub : T_HVX_alu_2op_WV <"$dst.uh = vunpack($src1.ub)">,
+ V6_vunpackub_enc;
+defm V6_vunpackuh : T_HVX_alu_2op_WV <"$dst.uw = vunpack($src1.uh)">,
+ V6_vunpackuh_enc;
+defm V6_vunpackb : T_HVX_alu_2op_WV <"$dst.h = vunpack($src1.b)">,
+ V6_vunpackb_enc;
+defm V6_vunpackh : T_HVX_alu_2op_WV <"$dst.w = vunpack($src1.h)">,
+ V6_vunpackh_enc;
+}
+
+let Itinerary = CVI_VS, Type = TypeCVI_VS in {
+defm V6_vcl0w : T_HVX_alu_2op_VV <"$dst.uw = vcl0($src1.uw)">,
+ V6_vcl0w_enc;
+defm V6_vcl0h : T_HVX_alu_2op_VV <"$dst.uh = vcl0($src1.uh)">,
+ V6_vcl0h_enc;
+defm V6_vnormamtw : T_HVX_alu_2op_VV <"$dst.w = vnormamt($src1.w)">,
+ V6_vnormamtw_enc;
+defm V6_vnormamth : T_HVX_alu_2op_VV <"$dst.h = vnormamt($src1.h)">,
+ V6_vnormamth_enc;
+defm V6_vpopcounth : T_HVX_alu_2op_VV <"$dst.h = vpopcount($src1.h)">,
+ V6_vpopcounth_enc;
+}
+
+let isAccumulator = 1, hasNewValue = 1, Itinerary = CVI_VX_DV_LONG,
+ Type = TypeCVI_VX_DV in
+class T_HVX_vmpyacc2 <string asmString, RegisterClass RC>
+ : CVI_VA_Resource1 <(outs RC:$dst),
+ (ins RC:$_src_, RC:$src1, IntRegs:$src2, u1Imm:$src3),
+ asmString, [], "$dst = $_src_" > ;
+
+
+multiclass T_HVX_vmpyacc2 <string asmString> {
+ def NAME : T_HVX_vmpyacc2 <asmString, VecDblRegs>;
+
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_vmpyacc2 <asmString, VecDblRegs128B>;
+}
+
+defm V6_vrmpybusi_acc :
+ T_HVX_vmpyacc2<"$dst.w += vrmpy($src1.ub,$src2.b,#$src3)">,
+ V6_vrmpybusi_acc_enc;
+defm V6_vrsadubi_acc :
+ T_HVX_vmpyacc2<"$dst.uw += vrsad($src1.ub,$src2.ub,#$src3)">,
+ V6_vrsadubi_acc_enc;
+defm V6_vrmpyubi_acc :
+ T_HVX_vmpyacc2<"$dst.uw += vrmpy($src1.ub,$src2.ub,#$src3)">,
+ V6_vrmpyubi_acc_enc;
+
+
+let Itinerary = CVI_VX_DV_LONG, Type = TypeCVI_VX_DV, hasNewValue = 1 in
+class T_HVX_vmpy2 <string asmString, RegisterClass RC>
+ : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, IntRegs:$src2, u1Imm:$src3),
+ asmString>;
+
+
+multiclass T_HVX_vmpy2 <string asmString> {
+ def NAME : T_HVX_vmpy2 <asmString, VecDblRegs>;
+
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_vmpy2 <asmString, VecDblRegs128B>;
+}
+
+defm V6_vrmpybusi :
+ T_HVX_vmpy2 <"$dst.w = vrmpy($src1.ub,$src2.b,#$src3)">, V6_vrmpybusi_enc;
+defm V6_vrsadubi :
+ T_HVX_vmpy2 <"$dst.uw = vrsad($src1.ub,$src2.ub,#$src3)">, V6_vrsadubi_enc;
+defm V6_vrmpyubi :
+ T_HVX_vmpy2 <"$dst.uw = vrmpy($src1.ub,$src2.ub,#$src3)">, V6_vrmpyubi_enc;
+
+
+let Itinerary = CVI_VP_VS_LONG_EARLY, Type = TypeCVI_VP_VS,
+ hasSideEffects = 0, hasNewValue2 = 1, opNewValue2 = 1 in
+class T_HVX_perm <string asmString, RegisterClass RC>
+ : CVI_VA_Resource1 <(outs RC:$_dst1_, RC:$_dst2_),
+ (ins RC:$src1, RC:$src2, IntRegs:$src3),
+ asmString, [], "$_dst1_ = $src1, $_dst2_ = $src2" >;
+
+multiclass T_HVX_perm <string asmString> {
+ def NAME : T_HVX_perm <asmString, VectorRegs>;
+
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_perm <asmString, VectorRegs128B>;
+}
+
+let hasNewValue = 1, opNewValue = 0, hasNewValue2 = 1, opNewValue2 = 1 in {
+ defm V6_vshuff : T_HVX_perm <"vshuff($src1,$src2,$src3)">, V6_vshuff_enc;
+ defm V6_vdeal : T_HVX_perm <"vdeal($src1,$src2,$src3)">, V6_vdeal_enc;
+}
+
+// Conditional vector move.
+let isPredicated = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+class T_HVX_cmov <bit isPredNot, RegisterClass RC>
+ : CVI_VA_Resource1 <(outs RC:$dst), (ins PredRegs:$src1, RC:$src2),
+ "if ("#!if(isPredNot, "!", "")#"$src1) $dst = $src2"> {
+ let isPredicatedFalse = isPredNot;
+}
+
+multiclass T_HVX_cmov <bit isPredNot = 0> {
+ def NAME : T_HVX_cmov <isPredNot, VectorRegs>;
+
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_cmov <isPredNot, VectorRegs128B>;
+}
+
+defm V6_vcmov : T_HVX_cmov, V6_vcmov_enc;
+defm V6_vncmov : T_HVX_cmov<1>, V6_vncmov_enc;
+
+// Conditional vector combine.
+let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV, isPredicated = 1,
+ hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+class T_HVX_ccombine <bit isPredNot, RegisterClass RCout, RegisterClass RCin>
+ : CVI_VA_Resource1 < (outs RCout:$dst),
+ (ins PredRegs:$src1, RCin:$src2, RCin:$src3),
+ "if ("#!if(isPredNot, "!", "")#"$src1) $dst = vcombine($src2,$src3)"> {
+ let isPredicatedFalse = isPredNot;
+}
+
+multiclass T_HVX_ccombine <bit isPredNot = 0> {
+ def NAME : T_HVX_ccombine <isPredNot, VecDblRegs, VectorRegs>;
+
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_ccombine <isPredNot, VecDblRegs128B, VectorRegs128B>;
+}
+
+defm V6_vccombine : T_HVX_ccombine, V6_vccombine_enc;
+defm V6_vnccombine : T_HVX_ccombine<1>, V6_vnccombine_enc;
+
+let hasNewValue = 1 in
+class T_HVX_shift <string asmString, RegisterClass RCout, RegisterClass RCin>
+ : CVI_VX_DV_Resource1<(outs RCout:$dst),
+ (ins RCin:$src1, RCin:$src2, IntRegsLow8:$src3),
+ asmString >;
+
+multiclass T_HVX_shift <string asmString, RegisterClass RCout,
+ RegisterClass RCin> {
+ def NAME : T_HVX_shift <asmString, RCout, RCin>;
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_shift <asmString, !cast<RegisterClass>(RCout#"128B"),
+ !cast<RegisterClass>(RCin#"128B")>;
+}
+
+multiclass T_HVX_shift_VV <string asmString>:
+ T_HVX_shift <asmString, VectorRegs, VectorRegs>;
+
+multiclass T_HVX_shift_WV <string asmString>:
+ T_HVX_shift <asmString, VecDblRegs, VectorRegs>;
+
+let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP in {
+defm V6_valignb :
+ T_HVX_shift_VV <"$dst = valign($src1,$src2,$src3)">, V6_valignb_enc;
+defm V6_vlalignb :
+ T_HVX_shift_VV <"$dst = vlalign($src1,$src2,$src3)">, V6_vlalignb_enc;
+}
+
+let Itinerary = CVI_VS, Type = TypeCVI_VS in {
+defm V6_vasrwh :
+ T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3)">, V6_vasrwh_enc;
+defm V6_vasrwhsat :
+ T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3):sat">,
+ V6_vasrwhsat_enc;
+defm V6_vasrwhrndsat :
+ T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3):rnd:sat">,
+ V6_vasrwhrndsat_enc;
+defm V6_vasrwuhsat :
+ T_HVX_shift_VV <"$dst.uh = vasr($src1.w,$src2.w,$src3):sat">,
+ V6_vasrwuhsat_enc;
+defm V6_vasrhubsat :
+ T_HVX_shift_VV <"$dst.ub = vasr($src1.h,$src2.h,$src3):sat">,
+ V6_vasrhubsat_enc;
+defm V6_vasrhubrndsat :
+ T_HVX_shift_VV <"$dst.ub = vasr($src1.h,$src2.h,$src3):rnd:sat">,
+ V6_vasrhubrndsat_enc;
+defm V6_vasrhbrndsat :
+ T_HVX_shift_VV <"$dst.b = vasr($src1.h,$src2.h,$src3):rnd:sat">,
+ V6_vasrhbrndsat_enc;
+}
+
+// Assembler mapped; possibly better handled as an alias:
+//defm V6_vtran2x2vdd : T_HVX_shift_VV <"">, V6_vtran2x2vdd_enc;
+let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS in {
+defm V6_vshuffvdd :
+ T_HVX_shift_WV <"$dst = vshuff($src1,$src2,$src3)">, V6_vshuffvdd_enc;
+defm V6_vdealvdd :
+ T_HVX_shift_WV <"$dst = vdeal($src1,$src2,$src3)">, V6_vdealvdd_enc;
+}
+
+let hasNewValue = 1, Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS in
+class T_HVX_unpack <string asmString, RegisterClass RCout, RegisterClass RCin>
+ : CVI_VX_DV_Resource1<(outs RCout:$dst), (ins RCout:$_src_, RCin:$src1),
+ asmString, [], "$dst = $_src_">;
+
+multiclass T_HVX_unpack <string asmString> {
+ def NAME : T_HVX_unpack <asmString, VecDblRegs, VectorRegs>;
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_unpack <asmString, VecDblRegs128B, VectorRegs128B>;
+}
+
+defm V6_vunpackob : T_HVX_unpack <"$dst.h |= vunpacko($src1.b)">, V6_vunpackob_enc;
+defm V6_vunpackoh : T_HVX_unpack <"$dst.w |= vunpacko($src1.h)">, V6_vunpackoh_enc;
+
+let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP, hasNewValue = 1,
+ hasSideEffects = 0 in
+class T_HVX_valign <string asmString, RegisterClass RC>
+ : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, RC:$src2, u3Imm:$src3),
+ asmString>;
+
+multiclass T_HVX_valign <string asmString> {
+ def NAME : T_HVX_valign <asmString, VectorRegs>;
+
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_valign <asmString, VectorRegs128B>;
+}
+
+defm V6_valignbi :
+ T_HVX_valign <"$dst = valign($src1,$src2,#$src3)">, V6_valignbi_enc;
+defm V6_vlalignbi :
+ T_HVX_valign <"$dst = vlalign($src1,$src2,#$src3)">, V6_vlalignbi_enc;
+
+let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in
+class T_HVX_predAlu <string asmString, RegisterClass RC>
+ : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, RC:$src2),
+ asmString>;
+
+multiclass T_HVX_predAlu <string asmString> {
+ def NAME : T_HVX_predAlu <asmString, VecPredRegs>;
+
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_predAlu <asmString, VecPredRegs128B>;
+}
+
+defm V6_pred_and : T_HVX_predAlu <"$dst = and($src1,$src2)">, V6_pred_and_enc;
+defm V6_pred_or : T_HVX_predAlu <"$dst = or($src1,$src2)">, V6_pred_or_enc;
+defm V6_pred_xor : T_HVX_predAlu <"$dst = xor($src1,$src2)">, V6_pred_xor_enc;
+defm V6_pred_or_n : T_HVX_predAlu <"$dst = or($src1,!$src2)">, V6_pred_or_n_enc;
+defm V6_pred_and_n :
+ T_HVX_predAlu <"$dst = and($src1,!$src2)">, V6_pred_and_n_enc;
+
+let Itinerary = CVI_VA, Type = TypeCVI_VA in
+class T_HVX_prednot <RegisterClass RC>
+ : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1),
+ "$dst = not($src1)">, V6_pred_not_enc;
+
+def V6_pred_not : T_HVX_prednot <VecPredRegs>;
+let isCodeGenOnly = 1 in
+def V6_pred_not_128B : T_HVX_prednot <VecPredRegs128B>;
+
+let Itinerary = CVI_VA, Type = TypeCVI_VA in
+class T_HVX_vcmp2 <string asmString, RegisterClass RCout, RegisterClass RCin>
+ : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1, RCin:$src2),
+ asmString >;
+
+multiclass T_HVX_vcmp2 <string asmString> {
+ def NAME : T_HVX_vcmp2 <asmString, VecPredRegs, VectorRegs>;
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_vcmp2 <asmString, VecPredRegs128B, VectorRegs128B>;
+}
+
+defm V6_veqb : T_HVX_vcmp2 <"$dst = vcmp.eq($src1.b,$src2.b)">, V6_veqb_enc;
+defm V6_veqh : T_HVX_vcmp2 <"$dst = vcmp.eq($src1.h,$src2.h)">, V6_veqh_enc;
+defm V6_veqw : T_HVX_vcmp2 <"$dst = vcmp.eq($src1.w,$src2.w)">, V6_veqw_enc;
+defm V6_vgtb : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.b,$src2.b)">, V6_vgtb_enc;
+defm V6_vgth : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.h,$src2.h)">, V6_vgth_enc;
+defm V6_vgtw : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.w,$src2.w)">, V6_vgtw_enc;
+defm V6_vgtub : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_enc;
+defm V6_vgtuh : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_enc;
+defm V6_vgtuw : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_enc;
+
+let isAccumulator = 1, hasNewValue = 1, hasSideEffects = 0 in
+class T_V6_vandqrt_acc <RegisterClass RCout, RegisterClass RCin>
+ : CVI_VX_Resource_late<(outs RCout:$dst),
+ (ins RCout:$_src_, RCin:$src1, IntRegs:$src2),
+ "$dst |= vand($src1,$src2)", [], "$dst = $_src_">, V6_vandqrt_acc_enc;
+
+def V6_vandqrt_acc : T_V6_vandqrt_acc <VectorRegs, VecPredRegs>;
+let isCodeGenOnly = 1 in
+def V6_vandqrt_acc_128B : T_V6_vandqrt_acc <VectorRegs128B, VecPredRegs128B>;
+
+let isAccumulator = 1 in
+class T_V6_vandvrt_acc <RegisterClass RCout, RegisterClass RCin>
+ : CVI_VX_Resource_late<(outs RCout:$dst),
+ (ins RCout:$_src_, RCin:$src1, IntRegs:$src2),
+ "$dst |= vand($src1,$src2)", [], "$dst = $_src_">, V6_vandvrt_acc_enc;
+
+def V6_vandvrt_acc : T_V6_vandvrt_acc <VecPredRegs, VectorRegs>;
+let isCodeGenOnly = 1 in
+def V6_vandvrt_acc_128B : T_V6_vandvrt_acc <VecPredRegs128B, VectorRegs128B>;
+
+let hasNewValue = 1, hasSideEffects = 0 in
+class T_V6_vandqrt <RegisterClass RCout, RegisterClass RCin>
+ : CVI_VX_Resource_late<(outs RCout:$dst),
+ (ins RCin:$src1, IntRegs:$src2),
+ "$dst = vand($src1,$src2)" >, V6_vandqrt_enc;
+
+def V6_vandqrt : T_V6_vandqrt <VectorRegs, VecPredRegs>;
+let isCodeGenOnly = 1 in
+def V6_vandqrt_128B : T_V6_vandqrt <VectorRegs128B, VecPredRegs128B>;
+
+let hasNewValue = 1, hasSideEffects = 0 in
+class T_V6_lvsplatw <RegisterClass RC>
+ : CVI_VX_Resource_late<(outs RC:$dst), (ins IntRegs:$src1),
+ "$dst = vsplat($src1)" >, V6_lvsplatw_enc;
+
+def V6_lvsplatw : T_V6_lvsplatw <VectorRegs>;
+let isCodeGenOnly = 1 in
+def V6_lvsplatw_128B : T_V6_lvsplatw <VectorRegs128B>;
+
+
+let hasNewValue = 1 in
+class T_V6_vinsertwr <RegisterClass RC>
+ : CVI_VX_Resource_late<(outs RC:$dst), (ins RC:$_src_, IntRegs:$src1),
+ "$dst.w = vinsert($src1)", [], "$dst = $_src_">,
+ V6_vinsertwr_enc;
+
+def V6_vinsertwr : T_V6_vinsertwr <VectorRegs>;
+let isCodeGenOnly = 1 in
+def V6_vinsertwr_128B : T_V6_vinsertwr <VectorRegs128B>;
+
+
+let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP in
+class T_V6_pred_scalar2 <RegisterClass RC>
+ : CVI_VA_Resource1<(outs RC:$dst), (ins IntRegs:$src1),
+ "$dst = vsetq($src1)">, V6_pred_scalar2_enc;
+
+def V6_pred_scalar2 : T_V6_pred_scalar2 <VecPredRegs>;
+let isCodeGenOnly = 1 in
+def V6_pred_scalar2_128B : T_V6_pred_scalar2 <VecPredRegs128B>;
+
+class T_V6_vandvrt <RegisterClass RCout, RegisterClass RCin>
+ : CVI_VX_Resource_late<(outs RCout:$dst), (ins RCin:$src1, IntRegs:$src2),
+ "$dst = vand($src1,$src2)">, V6_vandvrt_enc;
+
+def V6_vandvrt : T_V6_vandvrt <VecPredRegs, VectorRegs>;
+let isCodeGenOnly = 1 in
+def V6_vandvrt_128B : T_V6_vandvrt <VecPredRegs128B, VectorRegs128B>;
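+
+// vandqrt materializes a vector predicate into an ordinary HVX vector by
+// anding each lane with a splatted scalar, and vandvrt performs the reverse
+// conversion; the predicate bitconvert and spill patterns are built on this
+// pair of instructions.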
+
+let validSubTargets = HasV60SubT in
+class T_HVX_rol <string asmString, RegisterClass RC, Operand ImmOp >
+ : SInst2 <(outs RC:$dst), (ins RC:$src1, ImmOp:$src2), asmString>;
+
+class T_HVX_rol_R <string asmString>
+ : T_HVX_rol <asmString, IntRegs, u5Imm>;
+class T_HVX_rol_P <string asmString>
+ : T_HVX_rol <asmString, DoubleRegs, u6Imm>;
+
+def S6_rol_i_p : T_HVX_rol_P <"$dst = rol($src1,#$src2)">, S6_rol_i_p_enc;
+let hasNewValue = 1, opNewValue = 0 in
+def S6_rol_i_r : T_HVX_rol_R <"$dst = rol($src1,#$src2)">, S6_rol_i_r_enc;
+
+let validSubTargets = HasV60SubT in
+class T_HVX_rol_acc <string asmString, RegisterClass RC, Operand ImmOp>
+ : SInst2 <(outs RC:$dst), (ins RC:$_src_, RC:$src1, ImmOp:$src2),
+ asmString, [], "$dst = $_src_" >;
+
+class T_HVX_rol_acc_P <string asmString>
+ : T_HVX_rol_acc <asmString, DoubleRegs, u6Imm>;
+
+class T_HVX_rol_acc_R <string asmString>
+ : T_HVX_rol_acc <asmString, IntRegs, u5Imm>;
+
+def S6_rol_i_p_nac :
+ T_HVX_rol_acc_P <"$dst -= rol($src1,#$src2)">, S6_rol_i_p_nac_enc;
+def S6_rol_i_p_acc :
+ T_HVX_rol_acc_P <"$dst += rol($src1,#$src2)">, S6_rol_i_p_acc_enc;
+def S6_rol_i_p_and :
+ T_HVX_rol_acc_P <"$dst &= rol($src1,#$src2)">, S6_rol_i_p_and_enc;
+def S6_rol_i_p_or :
+ T_HVX_rol_acc_P <"$dst |= rol($src1,#$src2)">, S6_rol_i_p_or_enc;
+def S6_rol_i_p_xacc :
+ T_HVX_rol_acc_P<"$dst ^= rol($src1,#$src2)">, S6_rol_i_p_xacc_enc;
+
+let hasNewValue = 1, opNewValue = 0 in {
+def S6_rol_i_r_nac :
+ T_HVX_rol_acc_R <"$dst -= rol($src1,#$src2)">, S6_rol_i_r_nac_enc;
+def S6_rol_i_r_acc :
+ T_HVX_rol_acc_R <"$dst += rol($src1,#$src2)">, S6_rol_i_r_acc_enc;
+def S6_rol_i_r_and :
+ T_HVX_rol_acc_R <"$dst &= rol($src1,#$src2)">, S6_rol_i_r_and_enc;
+def S6_rol_i_r_or :
+ T_HVX_rol_acc_R <"$dst |= rol($src1,#$src2)">, S6_rol_i_r_or_enc;
+def S6_rol_i_r_xacc :
+ T_HVX_rol_acc_R <"$dst ^= rol($src1,#$src2)">, S6_rol_i_r_xacc_enc;
+}
+
+let isSolo = 1, Itinerary = LD_tc_ld_SLOT0, Type = TypeLD in
+class T_V6_extractw <RegisterClass RC>
+ : LD1Inst <(outs IntRegs:$dst), (ins RC:$src1, IntRegs:$src2),
+ "$dst = vextract($src1,$src2)">, V6_extractw_enc;
+
+def V6_extractw : T_V6_extractw <VectorRegs>;
+let isCodeGenOnly = 1 in
+def V6_extractw_128B : T_V6_extractw <VectorRegs128B>;
+
+let Itinerary = ST_tc_st_SLOT0, validSubTargets = HasV55SubT in
+class T_sys0op <string asmString>
+ : ST1Inst <(outs), (ins), asmString>;
+
+let isSolo = 1, validSubTargets = HasV55SubT in {
+def Y5_l2gunlock : T_sys0op <"l2gunlock">, Y5_l2gunlock_enc;
+def Y5_l2gclean : T_sys0op <"l2gclean">, Y5_l2gclean_enc;
+def Y5_l2gcleaninv : T_sys0op <"l2gcleaninv">, Y5_l2gcleaninv_enc;
+}
+
+class T_sys1op <string asmString, RegisterClass RC>
+ : ST1Inst <(outs), (ins RC:$src1), asmString>;
+
+class T_sys1op_R <string asmString> : T_sys1op <asmString, IntRegs>;
+class T_sys1op_P <string asmString> : T_sys1op <asmString, DoubleRegs>;
+
+let isSoloAX = 1, validSubTargets = HasV55SubT in
+def Y5_l2unlocka : T_sys1op_R <"l2unlocka($src1)">, Y5_l2unlocka_enc;
+
+let isSolo = 1, validSubTargets = HasV60SubT in {
+def Y6_l2gcleanpa : T_sys1op_P <"l2gclean($src1)">, Y6_l2gcleanpa_enc;
+def Y6_l2gcleaninvpa : T_sys1op_P <"l2gcleaninv($src1)">, Y6_l2gcleaninvpa_enc;
+}
+
+let Itinerary = ST_tc_3stall_SLOT0, isPredicateLate = 1, isSoloAX = 1,
+ validSubTargets = HasV55SubT in
+def Y5_l2locka : ST1Inst <(outs PredRegs:$dst), (ins IntRegs:$src1),
+ "$dst = l2locka($src1)">, Y5_l2locka_enc;
+
+// Not defined on the 'etc' side; the reason is unclear.
+// defm S2_cabacencbin : _VV <"Rdd=encbin(Rss,$src2,Pu)">, S2_cabacencbin_enc;
+
+let Defs = [USR_OVF], Itinerary = M_tc_3stall_SLOT23, isPredicateLate = 1,
+ hasSideEffects = 0,
+validSubTargets = HasV55SubT in
+def A5_ACS : MInst2 <(outs DoubleRegs:$dst1, PredRegs:$dst2),
+ (ins DoubleRegs:$_src_, DoubleRegs:$src1, DoubleRegs:$src2),
+ "$dst1,$dst2 = vacsh($src1,$src2)", [],
+ "$dst1 = $_src_" >, Requires<[HasV55T]>, A5_ACS_enc;
+
+let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV, hasNewValue = 1,
+ hasSideEffects = 0 in
+class T_HVX_alu2 <string asmString, RegisterClass RCout, RegisterClass RCin1,
+ RegisterClass RCin2>
+ : CVI_VA_Resource1<(outs RCout:$dst),
+ (ins RCin1:$src1, RCin2:$src2, RCin2:$src3), asmString>;
+
+multiclass T_HVX_alu2 <string asmString, RegisterClass RC > {
+ def NAME : T_HVX_alu2 <asmString, RC, VecPredRegs, VectorRegs>;
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_alu2 <asmString, !cast<RegisterClass>(RC#"128B"),
+ VecPredRegs128B, VectorRegs128B>;
+}
+
+multiclass T_HVX_alu2_V <string asmString> :
+ T_HVX_alu2 <asmString, VectorRegs>;
+
+multiclass T_HVX_alu2_W <string asmString> :
+ T_HVX_alu2 <asmString, VecDblRegs>;
+
+defm V6_vswap : T_HVX_alu2_W <"$dst = vswap($src1,$src2,$src3)">, V6_vswap_enc;
+
+let Itinerary = CVI_VA, Type = TypeCVI_VA, hasNewValue = 1,
+ hasSideEffects = 0 in
+defm V6_vmux : T_HVX_alu2_V <"$dst = vmux($src1,$src2,$src3)">, V6_vmux_enc;
+
+class T_HVX_vlutb <string asmString, RegisterClass RCout, RegisterClass RCin>
+ : CVI_VA_Resource1<(outs RCout:$dst),
+ (ins RCin:$src1, RCin:$src2, IntRegsLow8:$src3), asmString>;
+
+multiclass T_HVX_vlutb <string asmString, RegisterClass RCout,
+ RegisterClass RCin> {
+ def NAME : T_HVX_vlutb <asmString, RCout, RCin>;
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_vlutb <asmString, !cast<RegisterClass>(RCout#"128B"),
+ !cast<RegisterClass>(RCin#"128B")>;
+}
+
+multiclass T_HVX_vlutb_V <string asmString> :
+ T_HVX_vlutb <asmString, VectorRegs, VectorRegs>;
+
+multiclass T_HVX_vlutb_W <string asmString> :
+ T_HVX_vlutb <asmString, VecDblRegs, VectorRegs>;
+
+let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS, isAccumulator = 1 in
+class T_HVX_vlutb_acc <string asmString, RegisterClass RCout,
+ RegisterClass RCin>
+ : CVI_VA_Resource1<(outs RCout:$dst),
+ (ins RCout:$_src_, RCin:$src1, RCin:$src2, IntRegsLow8:$src3),
+ asmString, [], "$dst = $_src_">;
+
+multiclass T_HVX_vlutb_acc <string asmString, RegisterClass RCout,
+ RegisterClass RCin> {
+ def NAME : T_HVX_vlutb_acc <asmString, RCout, RCin>;
+ let isCodeGenOnly = 1 in
+ def NAME#_128B : T_HVX_vlutb_acc<asmString,
+ !cast<RegisterClass>(RCout#"128B"),
+ !cast<RegisterClass>(RCin#"128B")>;
+}
+
+multiclass T_HVX_vlutb_acc_V <string asmString> :
+ T_HVX_vlutb_acc <asmString, VectorRegs, VectorRegs>;
+
+multiclass T_HVX_vlutb_acc_W <string asmString> :
+ T_HVX_vlutb_acc <asmString, VecDblRegs, VectorRegs>;
+
+
+let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP, hasNewValue = 1 in
+defm V6_vlutvvb:
+ T_HVX_vlutb_V <"$dst.b = vlut32($src1.b,$src2.b,$src3)">, V6_vlutvvb_enc;
+
+let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS, hasNewValue = 1 in
+defm V6_vlutvwh:
+ T_HVX_vlutb_W <"$dst.h = vlut16($src1.b,$src2.h,$src3)">, V6_vlutvwh_enc;
+
+let hasNewValue = 1 in {
+ defm V6_vlutvvb_oracc:
+ T_HVX_vlutb_acc_V <"$dst.b |= vlut32($src1.b,$src2.b,$src3)">,
+ V6_vlutvvb_oracc_enc;
+ defm V6_vlutvwh_oracc:
+ T_HVX_vlutb_acc_W <"$dst.h |= vlut16($src1.b,$src2.h,$src3)">,
+ V6_vlutvwh_oracc_enc;
+}
+
+// This appears to be a fake instruction; it is unclear whether it should be defined.
+def S2_cabacencbin
+ : SInst2<(outs DoubleRegs:$dst),
+ (ins DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3),
+ "$dst = encbin($src1,$src2,$src3)">, S2_cabacencbin_enc;
+
+// Vhist instructions
+def V6_vhistq
+ : CVI_HIST_Resource1 <(outs), (ins VecPredRegs:$src1),
+ "vhist($src1)">, V6_vhistq_enc;
+
+def V6_vhist
+ : CVI_HIST_Resource1 <(outs), (ins),
+ "vhist" >, V6_vhist_enc;
diff --git a/lib/Target/Hexagon/HexagonInstrInfoVector.td b/lib/Target/Hexagon/HexagonInstrInfoVector.td
index f4fb946d5bad..96dd5315b87f 100644
--- a/lib/Target/Hexagon/HexagonInstrInfoVector.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoVector.td
@@ -35,6 +35,34 @@ multiclass bitconvert_64<ValueType a, ValueType b> {
(a DoubleRegs:$src)>;
}
+multiclass bitconvert_vec<ValueType a, ValueType b> {
+ def : Pat <(b (bitconvert (a VectorRegs:$src))),
+ (b VectorRegs:$src)>;
+ def : Pat <(a (bitconvert (b VectorRegs:$src))),
+ (a VectorRegs:$src)>;
+}
+
+multiclass bitconvert_dblvec<ValueType a, ValueType b> {
+ def : Pat <(b (bitconvert (a VecDblRegs:$src))),
+ (b VecDblRegs:$src)>;
+ def : Pat <(a (bitconvert (b VecDblRegs:$src))),
+ (a VecDblRegs:$src)>;
+}
+
+multiclass bitconvert_predvec<ValueType a, ValueType b> {
+ def : Pat <(b (bitconvert (a VecPredRegs:$src))),
+ (b VectorRegs:$src)>;
+ def : Pat <(a (bitconvert (b VectorRegs:$src))),
+ (a VecPredRegs:$src)>;
+}
+
+multiclass bitconvert_dblvec128B<ValueType a, ValueType b> {
+ def : Pat <(b (bitconvert (a VecDblRegs128B:$src))),
+ (b VecDblRegs128B:$src)>;
+ def : Pat <(a (bitconvert (b VecDblRegs128B:$src))),
+ (a VecDblRegs128B:$src)>;
+}
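+
+// Each of these multiclasses instantiates a pair of no-op patterns, one per
+// direction of the conversion.  For example,
+// "defm : bitconvert_vec<v64i8, v16i32>;" below roughly expands to:
+//   def : Pat <(v16i32 (bitconvert (v64i8 VectorRegs:$src))),
+//              (v16i32 VectorRegs:$src)>;
+//   def : Pat <(v64i8 (bitconvert (v16i32 VectorRegs:$src))),
+//              (v64i8 VectorRegs:$src)>;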
+
// Bit convert vector types.
defm : bitconvert_32<v4i8, i32>;
defm : bitconvert_32<v2i16, i32>;
@@ -47,6 +75,21 @@ defm : bitconvert_64<v8i8, v4i16>;
defm : bitconvert_64<v8i8, v2i32>;
defm : bitconvert_64<v4i16, v2i32>;
+defm : bitconvert_vec<v64i8, v16i32>;
+defm : bitconvert_vec<v8i64 , v16i32>;
+defm : bitconvert_vec<v32i16, v16i32>;
+
+defm : bitconvert_dblvec<v16i64, v128i8>;
+defm : bitconvert_dblvec<v32i32, v128i8>;
+defm : bitconvert_dblvec<v64i16, v128i8>;
+
+defm : bitconvert_dblvec128B<v64i32, v128i16>;
+defm : bitconvert_dblvec128B<v256i8, v128i16>;
+defm : bitconvert_dblvec128B<v32i64, v128i16>;
+
+defm : bitconvert_dblvec128B<v64i32, v256i8>;
+defm : bitconvert_dblvec128B<v32i64, v256i8>;
+defm : bitconvert_dblvec128B<v128i16, v256i8>;
// Vector shift support. Vector shifting in Hexagon is rather different
// from the internal representation used in LLVM.
diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td
index 1d0d015f798b..b207aaf392f4 100644
--- a/lib/Target/Hexagon/HexagonIntrinsics.td
+++ b/lib/Target/Hexagon/HexagonIntrinsics.td
@@ -691,15 +691,15 @@ def: T_RR_pat<A2_combine_hl, int_hexagon_A2_combine_hl>;
def: T_RR_pat<A2_combine_lh, int_hexagon_A2_combine_lh>;
def: T_RR_pat<A2_combine_ll, int_hexagon_A2_combine_ll>;
-def: T_II_pat<A2_combineii, int_hexagon_A2_combineii, s8ExtPred, s8ImmPred>;
+def: T_II_pat<A2_combineii, int_hexagon_A2_combineii, s32ImmPred, s8ImmPred>;
def: Pat<(i32 (int_hexagon_C2_mux (I32:$Rp), (I32:$Rs), (I32:$Rt))),
(i32 (C2_mux (C2_tfrrp IntRegs:$Rp), IntRegs:$Rs, IntRegs:$Rt))>;
// Mux
-def : T_QRI_pat<C2_muxir, int_hexagon_C2_muxir, s8ExtPred>;
-def : T_QIR_pat<C2_muxri, int_hexagon_C2_muxri, s8ExtPred>;
-def : T_QII_pat<C2_muxii, int_hexagon_C2_muxii, s8ExtPred, s8ImmPred>;
+def : T_QRI_pat<C2_muxir, int_hexagon_C2_muxir, s32ImmPred>;
+def : T_QIR_pat<C2_muxri, int_hexagon_C2_muxri, s32ImmPred>;
+def : T_QII_pat<C2_muxii, int_hexagon_C2_muxii, s32ImmPred, s8ImmPred>;
// Shift halfword
def : T_R_pat<A2_aslh, int_hexagon_A2_aslh>;
@@ -720,17 +720,17 @@ def : T_RR_pat<C2_cmpeq, int_hexagon_C2_cmpeq>;
def : T_RR_pat<C2_cmpgt, int_hexagon_C2_cmpgt>;
def : T_RR_pat<C2_cmpgtu, int_hexagon_C2_cmpgtu>;
-def : T_RI_pat<C2_cmpeqi, int_hexagon_C2_cmpeqi, s10ExtPred>;
-def : T_RI_pat<C2_cmpgti, int_hexagon_C2_cmpgti, s10ExtPred>;
-def : T_RI_pat<C2_cmpgtui, int_hexagon_C2_cmpgtui, u9ExtPred>;
+def : T_RI_pat<C2_cmpeqi, int_hexagon_C2_cmpeqi, s32ImmPred>;
+def : T_RI_pat<C2_cmpgti, int_hexagon_C2_cmpgti, s32ImmPred>;
+def : T_RI_pat<C2_cmpgtui, int_hexagon_C2_cmpgtui, u32ImmPred>;
-def : Pat <(i32 (int_hexagon_C2_cmpgei (I32:$src1), s8ExtPred:$src2)),
+def : Pat <(i32 (int_hexagon_C2_cmpgei (I32:$src1), s32ImmPred:$src2)),
(i32 (C2_cmpgti (I32:$src1),
- (DEC_CONST_SIGNED s8ExtPred:$src2)))>;
+ (DEC_CONST_SIGNED s32ImmPred:$src2)))>;
-def : Pat <(i32 (int_hexagon_C2_cmpgeui (I32:$src1), u8ExtPred:$src2)),
+def : Pat <(i32 (int_hexagon_C2_cmpgeui (I32:$src1), u32ImmPred:$src2)),
(i32 (C2_cmpgtui (I32:$src1),
- (DEC_CONST_UNSIGNED u8ExtPred:$src2)))>;
+ (DEC_CONST_UNSIGNED u32ImmPred:$src2)))>;
// The instruction, Pd=cmp.geu(Rs, #u8) -> Pd=cmp.eq(Rs,Rs) when #u8 == 0.
def : Pat <(i32 (int_hexagon_C2_cmpgeui (I32:$src1), 0)),
@@ -1289,3 +1289,5 @@ def: T_stc_pat<S2_storerf_pci_pseudo, int_hexagon_circ_sthhi, s4_1ImmPred, I32>;
include "HexagonIntrinsicsV3.td"
include "HexagonIntrinsicsV4.td"
include "HexagonIntrinsicsV5.td"
+include "HexagonIntrinsicsV60.td"
+
diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV60.td b/lib/Target/Hexagon/HexagonIntrinsicsV60.td
new file mode 100644
index 000000000000..24a3e4d36de9
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonIntrinsicsV60.td
@@ -0,0 +1,836 @@
+//=- HexagonIntrinsicsV60.td - Target Description for Hexagon -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V60 Compiler Intrinsics in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+let isCodeGenOnly = 1 in {
+def HEXAGON_V6_vd0_pseudo : CVI_VA_Resource<(outs VectorRegs:$dst),
+ (ins ),
+ "$dst=#0",
+ [(set VectorRegs:$dst, (int_hexagon_V6_vd0 ))]>;
+
+def HEXAGON_V6_vd0_pseudo_128B : CVI_VA_Resource<(outs VectorRegs128B:$dst),
+ (ins ),
+ "$dst=#0",
+ [(set VectorRegs128B:$dst, (int_hexagon_V6_vd0_128B ))]>;
+}
+let isPseudo = 1 in
+def HEXAGON_V6_vassignp : CVI_VA_Resource<(outs VecDblRegs:$dst),
+ (ins VecDblRegs:$src1),
+ "$dst=vassignp_W($src1)",
+ [(set VecDblRegs:$dst, (int_hexagon_V6_vassignp VecDblRegs:$src1))]>;
+
+let isPseudo = 1 in
+def HEXAGON_V6_vassignp_128B : CVI_VA_Resource<(outs VecDblRegs128B:$dst),
+ (ins VecDblRegs128B:$src1),
+ "$dst=vassignp_W_128B($src1)",
+ [(set VecDblRegs128B:$dst, (int_hexagon_V6_vassignp_128B
+ VecDblRegs128B:$src1))]>;
+
+let isPseudo = 1 in
+def HEXAGON_V6_lo : CVI_VA_Resource<(outs VectorRegs:$dst),
+ (ins VecDblRegs:$src1),
+ "$dst=lo_W($src1)",
+ [(set VectorRegs:$dst, (int_hexagon_V6_lo VecDblRegs:$src1))]>;
+
+let isPseudo = 1 in
+def HEXAGON_V6_hi : CVI_VA_Resource<(outs VectorRegs:$dst),
+ (ins VecDblRegs:$src1),
+ "$dst=hi_W($src1)",
+ [(set VectorRegs:$dst, (int_hexagon_V6_hi VecDblRegs:$src1))]>;
+
+let isPseudo = 1 in
+def HEXAGON_V6_lo_128B : CVI_VA_Resource<(outs VectorRegs128B:$dst),
+ (ins VecDblRegs128B:$src1),
+ "$dst=lo_W($src1)",
+ [(set VectorRegs128B:$dst, (int_hexagon_V6_lo_128B VecDblRegs128B:$src1))]>;
+
+let isPseudo = 1 in
+def HEXAGON_V6_hi_128B : CVI_VA_Resource<(outs VectorRegs128B:$dst),
+ (ins VecDblRegs128B:$src1),
+ "$dst=hi_W($src1)",
+ [(set VectorRegs128B:$dst, (int_hexagon_V6_hi_128B VecDblRegs128B:$src1))]>;
+
+let AddedComplexity = 100 in {
+def : Pat < (v16i32 (int_hexagon_V6_lo (v32i32 VecDblRegs:$src1))),
+ (v16i32 (EXTRACT_SUBREG (v32i32 VecDblRegs:$src1), subreg_loreg)) >,
+ Requires<[UseHVXSgl]>;
+
+def : Pat < (v16i32 (int_hexagon_V6_hi (v32i32 VecDblRegs:$src1))),
+ (v16i32 (EXTRACT_SUBREG (v32i32 VecDblRegs:$src1), subreg_hireg)) >,
+ Requires<[UseHVXSgl]>;
+
+def : Pat < (v32i32 (int_hexagon_V6_lo_128B (v64i32 VecDblRegs128B:$src1))),
+ (v32i32 (EXTRACT_SUBREG (v64i32 VecDblRegs128B:$src1),
+ subreg_loreg)) >,
+ Requires<[UseHVXDbl]>;
+
+def : Pat < (v32i32 (int_hexagon_V6_hi_128B (v64i32 VecDblRegs128B:$src1))),
+ (v32i32 (EXTRACT_SUBREG (v64i32 VecDblRegs128B:$src1),
+ subreg_hireg)) >,
+ Requires<[UseHVXDbl]>;
+}
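+
+// With the raised pattern complexity above, int_hexagon_V6_lo/hi (and their
+// _128B forms) are matched directly to subregister extracts of the vector
+// pair, so no separate copy instruction is needed for them.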
+
+def : Pat <(v512i1 (bitconvert (v16i32 VectorRegs:$src1))),
+ (v512i1 (V6_vandvrt(v16i32 VectorRegs:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXSgl]>;
+
+def : Pat <(v512i1 (bitconvert (v32i16 VectorRegs:$src1))),
+ (v512i1 (V6_vandvrt(v32i16 VectorRegs:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXSgl]>;
+
+def : Pat <(v512i1 (bitconvert (v64i8 VectorRegs:$src1))),
+ (v512i1 (V6_vandvrt(v64i8 VectorRegs:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXSgl]>;
+
+def : Pat <(v512i1 (bitconvert (v8i64 VectorRegs:$src1))),
+ (v512i1 (V6_vandvrt(v8i64 VectorRegs:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXSgl]>;
+
+def : Pat <(v16i32 (bitconvert (v512i1 VecPredRegs:$src1))),
+ (v16i32 (V6_vandqrt(v512i1 VecPredRegs:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXSgl]>;
+
+def : Pat <(v32i16 (bitconvert (v512i1 VecPredRegs:$src1))),
+ (v32i16 (V6_vandqrt(v512i1 VecPredRegs:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXSgl]>;
+
+def : Pat <(v64i8 (bitconvert (v512i1 VecPredRegs:$src1))),
+ (v64i8 (V6_vandqrt(v512i1 VecPredRegs:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXSgl]>;
+
+def : Pat <(v8i64 (bitconvert (v512i1 VecPredRegs:$src1))),
+ (v8i64 (V6_vandqrt(v512i1 VecPredRegs:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXSgl]>;
+
+def : Pat <(v1024i1 (bitconvert (v32i32 VectorRegs128B:$src1))),
+ (v1024i1 (V6_vandvrt_128B(v32i32 VectorRegs128B:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXDbl]>;
+
+def : Pat <(v1024i1 (bitconvert (v64i16 VectorRegs128B:$src1))),
+ (v1024i1 (V6_vandvrt_128B(v64i16 VectorRegs128B:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXDbl]>;
+
+def : Pat <(v1024i1 (bitconvert (v128i8 VectorRegs128B:$src1))),
+ (v1024i1 (V6_vandvrt_128B(v128i8 VectorRegs128B:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXDbl]>;
+
+def : Pat <(v1024i1 (bitconvert (v16i64 VectorRegs128B:$src1))),
+ (v1024i1 (V6_vandvrt_128B(v16i64 VectorRegs128B:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXDbl]>;
+
+def : Pat <(v32i32 (bitconvert (v1024i1 VecPredRegs128B:$src1))),
+ (v32i32 (V6_vandqrt_128B(v1024i1 VecPredRegs128B:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXDbl]>;
+
+def : Pat <(v64i16 (bitconvert (v1024i1 VecPredRegs128B:$src1))),
+ (v64i16 (V6_vandqrt_128B(v1024i1 VecPredRegs128B:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXDbl]>;
+
+def : Pat <(v128i8 (bitconvert (v1024i1 VecPredRegs128B:$src1))),
+ (v128i8 (V6_vandqrt_128B(v1024i1 VecPredRegs128B:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXDbl]>;
+
+def : Pat <(v16i64 (bitconvert (v1024i1 VecPredRegs128B:$src1))),
+ (v16i64 (V6_vandqrt_128B(v1024i1 VecPredRegs128B:$src1),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXDbl]>;
+
+let AddedComplexity = 140 in {
+def : Pat <(store (v512i1 VecPredRegs:$src1), (i32 IntRegs:$addr)),
+ (V6_vS32b_ai IntRegs:$addr, 0,
+ (v16i32 (V6_vandqrt (v512i1 VecPredRegs:$src1),
+ (A2_tfrsi 0x01010101))))>,
+ Requires<[UseHVXSgl]>;
+
+def : Pat <(v512i1 (load (i32 IntRegs:$addr))),
+ (v512i1 (V6_vandvrt
+ (v16i32 (V6_vL32b_ai IntRegs:$addr, 0)), (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXSgl]>;
+
+def : Pat <(store (v1024i1 VecPredRegs128B:$src1), (i32 IntRegs:$addr)),
+ (V6_vS32b_ai_128B IntRegs:$addr, 0,
+ (v32i32 (V6_vandqrt_128B (v1024i1 VecPredRegs128B:$src1),
+ (A2_tfrsi 0x01010101))))>,
+ Requires<[UseHVXDbl]>;
+
+def : Pat <(v1024i1 (load (i32 IntRegs:$addr))),
+ (v1024i1 (V6_vandvrt_128B
+ (v32i32 (V6_vL32b_ai_128B IntRegs:$addr, 0)),
+ (A2_tfrsi 0x01010101)))>,
+ Requires<[UseHVXDbl]>;
+}
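+
+// v512i1/v1024i1 predicate values have no memory form of their own: a store
+// first expands the predicate into a regular HVX vector with vandqrt, and a
+// load rebuilds the predicate from the loaded vector with vandvrt, using the
+// splatted constant 0x01010101 as the vand operand in both directions.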
+
+multiclass T_R_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID IntRegs:$src1), (MI IntRegs:$src1)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") IntRegs:$src1),
+ (!cast<InstHexagon>(MI#"_128B") IntRegs:$src1)>,
+ Requires<[UseHVXDbl]>;
+}
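+
+// All of the T_*_pat multiclasses below follow the same scheme: the first
+// pattern maps the 64-byte intrinsic to its instruction under UseHVXSgl,
+// while the second !casts "_128B" onto both the intrinsic and the
+// instruction name to cover the 128-byte mode under UseHVXDbl.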
+
+multiclass T_V_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VectorRegs:$src1),
+ (MI VectorRegs:$src1)>,
+ Requires<[UseHVXSgl]>;
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_Q_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecPredRegs:$src1),
+ (MI VecPredRegs:$src1)>,
+ Requires<[UseHVXSgl]>;
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1),
+ (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WR_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecDblRegs:$src1, IntRegs:$src2),
+ (MI VecDblRegs:$src1, IntRegs:$src2)>,
+ Requires<[UseHVXSgl]>;
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B")VecDblRegs128B:$src1, IntRegs:$src2),
+ (!cast<InstHexagon>(MI#"_128B")VecDblRegs128B:$src1, IntRegs:$src2)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VR_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VectorRegs:$src1, IntRegs:$src2),
+ (MI VectorRegs:$src1, IntRegs:$src2)>,
+ Requires<[UseHVXSgl]>;
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B")VectorRegs128B:$src1, IntRegs:$src2),
+ (!cast<InstHexagon>(MI#"_128B")VectorRegs128B:$src1, IntRegs:$src2)>,
+ Requires<[UseHVXDbl]>;
+}
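+
+// For example, "defm : T_VR_pat <V6_vasrw, int_hexagon_V6_vasrw>;" further
+// below roughly yields:
+//   def: Pat<(int_hexagon_V6_vasrw VectorRegs:$src1, IntRegs:$src2),
+//            (V6_vasrw VectorRegs:$src1, IntRegs:$src2)>;
+//   def: Pat<(int_hexagon_V6_vasrw_128B VectorRegs128B:$src1, IntRegs:$src2),
+//            (V6_vasrw_128B VectorRegs128B:$src1, IntRegs:$src2)>;
+// guarded by UseHVXSgl and UseHVXDbl respectively.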
+
+multiclass T_WV_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2),
+ (MI VecDblRegs:$src1, VectorRegs:$src2)>,
+ Requires<[UseHVXSgl]>;
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1,
+ VectorRegs128B:$src2),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1,
+ VectorRegs128B:$src2)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WW_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecDblRegs:$src1, VecDblRegs:$src2),
+ (MI VecDblRegs:$src1, VecDblRegs:$src2)>,
+ Requires<[UseHVXSgl]>;
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1,
+ VecDblRegs128B:$src2),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1,
+ VecDblRegs128B:$src2)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VV_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2),
+ (MI VectorRegs:$src1, VectorRegs:$src2)>,
+ Requires<[UseHVXSgl]>;
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1,
+ VectorRegs128B:$src2),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1,
+ VectorRegs128B:$src2)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_QR_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecPredRegs:$src1, IntRegs:$src2),
+ (MI VecPredRegs:$src1, IntRegs:$src2)>,
+ Requires<[UseHVXSgl]>;
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1,
+ IntRegs:$src2),
+ (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1,
+ IntRegs:$src2)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_QQ_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecPredRegs:$src1, VecPredRegs:$src2),
+ (MI VecPredRegs:$src1, VecPredRegs:$src2)>,
+ Requires<[UseHVXSgl]>;
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1,
+ VecPredRegs128B:$src2),
+ (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1,
+ VecPredRegs128B:$src2)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WWR_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3),
+ (MI VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3)>,
+ Requires<[UseHVXSgl]>;
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1,
+ VecDblRegs128B:$src2,
+ IntRegs:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1,
+ VecDblRegs128B:$src2,
+ IntRegs:$src3)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VVR_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, IntRegs:$src3),
+ (MI VectorRegs:$src1, VectorRegs:$src2, IntRegs:$src3)>,
+ Requires<[UseHVXSgl]>;
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1,
+ VectorRegs128B:$src2,
+ IntRegs:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1,
+ VectorRegs128B:$src2,
+ IntRegs:$src3)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WVR_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2, IntRegs:$src3),
+ (MI VecDblRegs:$src1, VectorRegs:$src2, IntRegs:$src3)>,
+ Requires<[UseHVXSgl]>;
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1,
+ VectorRegs128B:$src2,
+ IntRegs:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1,
+ VectorRegs128B:$src2,
+ IntRegs:$src3)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VWR_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VectorRegs:$src1, VecDblRegs:$src2, IntRegs:$src3),
+ (MI VectorRegs:$src1, VecDblRegs:$src2, IntRegs:$src3)>,
+ Requires<[UseHVXSgl]>;
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1,
+ VecDblRegs128B:$src2,
+ IntRegs:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1,
+ VecDblRegs128B:$src2,
+ IntRegs:$src3)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VVV_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3),
+ (MI VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3)>,
+ Requires<[UseHVXSgl]>;
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1,
+ VectorRegs128B:$src2,
+ VectorRegs128B:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1,
+ VectorRegs128B:$src2,
+ VectorRegs128B:$src3)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WVV_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3),
+ (MI VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3)>,
+ Requires<[UseHVXSgl]>;
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1,
+ VectorRegs128B:$src2,
+ VectorRegs128B:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1,
+ VectorRegs128B:$src2,
+ VectorRegs128B:$src3)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_QVV_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecPredRegs:$src1, VectorRegs:$src2, VectorRegs:$src3),
+ (MI VecPredRegs:$src1, VectorRegs:$src2, VectorRegs:$src3)>,
+ Requires<[UseHVXSgl]>;
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1,
+ VectorRegs128B:$src2,
+ VectorRegs128B:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1,
+ VectorRegs128B:$src2,
+ VectorRegs128B:$src3)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VQR_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VectorRegs:$src1, VecPredRegs:$src2, IntRegs:$src3),
+ (MI VectorRegs:$src1, VecPredRegs:$src2, IntRegs:$src3)>,
+ Requires<[UseHVXSgl]>;
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1,
+ VecPredRegs128B:$src2,
+ IntRegs:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1,
+ VecPredRegs128B:$src2,
+ IntRegs:$src3)>,
+ Requires<[UseHVXDbl]>;
+}
+
+
+multiclass T_QVR_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecPredRegs:$src1, VectorRegs:$src2, IntRegs:$src3),
+ (MI VecPredRegs:$src1, VectorRegs:$src2, IntRegs:$src3)>,
+ Requires<[UseHVXSgl]>;
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1,
+ VectorRegs128B:$src2,
+ IntRegs:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1,
+ VectorRegs128B:$src2,
+ IntRegs:$src3)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VVI_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, imm:$src3),
+ (MI VectorRegs:$src1, VectorRegs:$src2, imm:$src3)>,
+ Requires<[UseHVXSgl]>;
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1,
+ VectorRegs128B:$src2, imm:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1,
+ VectorRegs128B:$src2, imm:$src3)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WRI_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecDblRegs:$src1, IntRegs:$src2, imm:$src3),
+ (MI VecDblRegs:$src1, IntRegs:$src2, imm:$src3)>,
+ Requires<[UseHVXSgl]>;
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1,
+ IntRegs:$src2, imm:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1,
+ IntRegs:$src2, imm:$src3)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WWRI_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3, imm:$src4),
+ (MI VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3, imm:$src4)>,
+ Requires<[UseHVXSgl]>;
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1,
+ VecDblRegs128B:$src2,
+ IntRegs:$src3, imm:$src4),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1,
+ VecDblRegs128B:$src2,
+ IntRegs:$src3, imm:$src4)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VVVR_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3,
+ IntRegs:$src4),
+ (MI VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3,
+ IntRegs:$src4)>,
+ Requires<[UseHVXSgl]>;
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1,
+ VectorRegs128B:$src2,
+ VectorRegs128B:$src3,
+ IntRegs:$src4),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1,
+ VectorRegs128B:$src2,
+ VectorRegs128B:$src3,
+ IntRegs:$src4)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WVVR_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3,
+ IntRegs:$src4),
+ (MI VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3,
+ IntRegs:$src4)>,
+ Requires<[UseHVXSgl]>;
+
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1,
+ VectorRegs128B:$src2,
+ VectorRegs128B:$src3,
+ IntRegs:$src4),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1,
+ VectorRegs128B:$src2,
+ VectorRegs128B:$src3,
+ IntRegs:$src4)>,
+ Requires<[UseHVXDbl]>;
+}
+
+defm : T_WR_pat<V6_vtmpyb, int_hexagon_V6_vtmpyb>;
+defm : T_WR_pat <V6_vtmpybus, int_hexagon_V6_vtmpybus>;
+defm : T_VR_pat <V6_vdmpyhb, int_hexagon_V6_vdmpyhb>;
+defm : T_VR_pat <V6_vrmpyub, int_hexagon_V6_vrmpyub>;
+defm : T_VR_pat <V6_vrmpybus, int_hexagon_V6_vrmpybus>;
+defm : T_WR_pat <V6_vdsaduh, int_hexagon_V6_vdsaduh>;
+defm : T_VR_pat <V6_vdmpybus, int_hexagon_V6_vdmpybus>;
+defm : T_WR_pat <V6_vdmpybus_dv, int_hexagon_V6_vdmpybus_dv>;
+defm : T_VR_pat <V6_vdmpyhsusat, int_hexagon_V6_vdmpyhsusat>;
+defm : T_WR_pat <V6_vdmpyhsuisat, int_hexagon_V6_vdmpyhsuisat>;
+defm : T_VR_pat <V6_vdmpyhsat, int_hexagon_V6_vdmpyhsat>;
+defm : T_WR_pat <V6_vdmpyhisat, int_hexagon_V6_vdmpyhisat>;
+defm : T_WR_pat <V6_vdmpyhb_dv, int_hexagon_V6_vdmpyhb_dv>;
+defm : T_VR_pat <V6_vmpybus, int_hexagon_V6_vmpybus>;
+defm : T_WR_pat <V6_vmpabus, int_hexagon_V6_vmpabus>;
+defm : T_WR_pat <V6_vmpahb, int_hexagon_V6_vmpahb>;
+defm : T_VR_pat <V6_vmpyh, int_hexagon_V6_vmpyh>;
+defm : T_VR_pat <V6_vmpyhss, int_hexagon_V6_vmpyhss>;
+defm : T_VR_pat <V6_vmpyhsrs, int_hexagon_V6_vmpyhsrs>;
+defm : T_VR_pat <V6_vmpyuh, int_hexagon_V6_vmpyuh>;
+defm : T_VR_pat <V6_vmpyihb, int_hexagon_V6_vmpyihb>;
+defm : T_VR_pat <V6_vror, int_hexagon_V6_vror>;
+defm : T_VR_pat <V6_vasrw, int_hexagon_V6_vasrw>;
+defm : T_VR_pat <V6_vasrh, int_hexagon_V6_vasrh>;
+defm : T_VR_pat <V6_vaslw, int_hexagon_V6_vaslw>;
+defm : T_VR_pat <V6_vaslh, int_hexagon_V6_vaslh>;
+defm : T_VR_pat <V6_vlsrw, int_hexagon_V6_vlsrw>;
+defm : T_VR_pat <V6_vlsrh, int_hexagon_V6_vlsrh>;
+defm : T_VR_pat <V6_vmpyiwh, int_hexagon_V6_vmpyiwh>;
+defm : T_VR_pat <V6_vmpyiwb, int_hexagon_V6_vmpyiwb>;
+defm : T_WR_pat <V6_vtmpyhb, int_hexagon_V6_vtmpyhb>;
+defm : T_VR_pat <V6_vmpyub, int_hexagon_V6_vmpyub>;
+
+defm : T_VV_pat <V6_vrmpyubv, int_hexagon_V6_vrmpyubv>;
+defm : T_VV_pat <V6_vrmpybv, int_hexagon_V6_vrmpybv>;
+defm : T_VV_pat <V6_vrmpybusv, int_hexagon_V6_vrmpybusv>;
+defm : T_VV_pat <V6_vdmpyhvsat, int_hexagon_V6_vdmpyhvsat>;
+defm : T_VV_pat <V6_vmpybv, int_hexagon_V6_vmpybv>;
+defm : T_VV_pat <V6_vmpyubv, int_hexagon_V6_vmpyubv>;
+defm : T_VV_pat <V6_vmpybusv, int_hexagon_V6_vmpybusv>;
+defm : T_VV_pat <V6_vmpyhv, int_hexagon_V6_vmpyhv>;
+defm : T_VV_pat <V6_vmpyuhv, int_hexagon_V6_vmpyuhv>;
+defm : T_VV_pat <V6_vmpyhvsrs, int_hexagon_V6_vmpyhvsrs>;
+defm : T_VV_pat <V6_vmpyhus, int_hexagon_V6_vmpyhus>;
+defm : T_WW_pat <V6_vmpabusv, int_hexagon_V6_vmpabusv>;
+defm : T_VV_pat <V6_vmpyih, int_hexagon_V6_vmpyih>;
+defm : T_VV_pat <V6_vand, int_hexagon_V6_vand>;
+defm : T_VV_pat <V6_vor, int_hexagon_V6_vor>;
+defm : T_VV_pat <V6_vxor, int_hexagon_V6_vxor>;
+defm : T_VV_pat <V6_vaddw, int_hexagon_V6_vaddw>;
+defm : T_VV_pat <V6_vaddubsat, int_hexagon_V6_vaddubsat>;
+defm : T_VV_pat <V6_vadduhsat, int_hexagon_V6_vadduhsat>;
+defm : T_VV_pat <V6_vaddhsat, int_hexagon_V6_vaddhsat>;
+defm : T_VV_pat <V6_vaddwsat, int_hexagon_V6_vaddwsat>;
+defm : T_VV_pat <V6_vsubb, int_hexagon_V6_vsubb>;
+defm : T_VV_pat <V6_vsubh, int_hexagon_V6_vsubh>;
+defm : T_VV_pat <V6_vsubw, int_hexagon_V6_vsubw>;
+defm : T_VV_pat <V6_vsububsat, int_hexagon_V6_vsububsat>;
+defm : T_VV_pat <V6_vsubuhsat, int_hexagon_V6_vsubuhsat>;
+defm : T_VV_pat <V6_vsubhsat, int_hexagon_V6_vsubhsat>;
+defm : T_VV_pat <V6_vsubwsat, int_hexagon_V6_vsubwsat>;
+defm : T_WW_pat <V6_vaddb_dv, int_hexagon_V6_vaddb_dv>;
+defm : T_WW_pat <V6_vaddh_dv, int_hexagon_V6_vaddh_dv>;
+defm : T_WW_pat <V6_vaddw_dv, int_hexagon_V6_vaddw_dv>;
+defm : T_WW_pat <V6_vaddubsat_dv, int_hexagon_V6_vaddubsat_dv>;
+defm : T_WW_pat <V6_vadduhsat_dv, int_hexagon_V6_vadduhsat_dv>;
+defm : T_WW_pat <V6_vaddhsat_dv, int_hexagon_V6_vaddhsat_dv>;
+defm : T_WW_pat <V6_vaddwsat_dv, int_hexagon_V6_vaddwsat_dv>;
+defm : T_WW_pat <V6_vsubb_dv, int_hexagon_V6_vsubb_dv>;
+defm : T_WW_pat <V6_vsubh_dv, int_hexagon_V6_vsubh_dv>;
+defm : T_WW_pat <V6_vsubw_dv, int_hexagon_V6_vsubw_dv>;
+defm : T_WW_pat <V6_vsububsat_dv, int_hexagon_V6_vsububsat_dv>;
+defm : T_WW_pat <V6_vsubuhsat_dv, int_hexagon_V6_vsubuhsat_dv>;
+defm : T_WW_pat <V6_vsubhsat_dv, int_hexagon_V6_vsubhsat_dv>;
+defm : T_WW_pat <V6_vsubwsat_dv, int_hexagon_V6_vsubwsat_dv>;
+defm : T_VV_pat <V6_vaddubh, int_hexagon_V6_vaddubh>;
+defm : T_VV_pat <V6_vadduhw, int_hexagon_V6_vadduhw>;
+defm : T_VV_pat <V6_vaddhw, int_hexagon_V6_vaddhw>;
+defm : T_VV_pat <V6_vsububh, int_hexagon_V6_vsububh>;
+defm : T_VV_pat <V6_vsubuhw, int_hexagon_V6_vsubuhw>;
+defm : T_VV_pat <V6_vsubhw, int_hexagon_V6_vsubhw>;
+defm : T_VV_pat <V6_vabsdiffub, int_hexagon_V6_vabsdiffub>;
+defm : T_VV_pat <V6_vabsdiffh, int_hexagon_V6_vabsdiffh>;
+defm : T_VV_pat <V6_vabsdiffuh, int_hexagon_V6_vabsdiffuh>;
+defm : T_VV_pat <V6_vabsdiffw, int_hexagon_V6_vabsdiffw>;
+defm : T_VV_pat <V6_vavgub, int_hexagon_V6_vavgub>;
+defm : T_VV_pat <V6_vavguh, int_hexagon_V6_vavguh>;
+defm : T_VV_pat <V6_vavgh, int_hexagon_V6_vavgh>;
+defm : T_VV_pat <V6_vavgw, int_hexagon_V6_vavgw>;
+defm : T_VV_pat <V6_vnavgub, int_hexagon_V6_vnavgub>;
+defm : T_VV_pat <V6_vnavgh, int_hexagon_V6_vnavgh>;
+defm : T_VV_pat <V6_vnavgw, int_hexagon_V6_vnavgw>;
+defm : T_VV_pat <V6_vavgubrnd, int_hexagon_V6_vavgubrnd>;
+defm : T_VV_pat <V6_vavguhrnd, int_hexagon_V6_vavguhrnd>;
+defm : T_VV_pat <V6_vavghrnd, int_hexagon_V6_vavghrnd>;
+defm : T_VV_pat <V6_vavgwrnd, int_hexagon_V6_vavgwrnd>;
+defm : T_WW_pat <V6_vmpabuuv, int_hexagon_V6_vmpabuuv>;
+
+defm : T_VVR_pat <V6_vdmpyhb_acc, int_hexagon_V6_vdmpyhb_acc>;
+defm : T_VVR_pat <V6_vrmpyub_acc, int_hexagon_V6_vrmpyub_acc>;
+defm : T_VVR_pat <V6_vrmpybus_acc, int_hexagon_V6_vrmpybus_acc>;
+defm : T_VVR_pat <V6_vdmpybus_acc, int_hexagon_V6_vdmpybus_acc>;
+defm : T_VVR_pat <V6_vdmpyhsusat_acc, int_hexagon_V6_vdmpyhsusat_acc>;
+defm : T_VVR_pat <V6_vdmpyhsat_acc, int_hexagon_V6_vdmpyhsat_acc>;
+defm : T_VVR_pat <V6_vmpyiwb_acc, int_hexagon_V6_vmpyiwb_acc>;
+defm : T_VVR_pat <V6_vmpyiwh_acc, int_hexagon_V6_vmpyiwh_acc>;
+defm : T_VVR_pat <V6_vmpyihb_acc, int_hexagon_V6_vmpyihb_acc>;
+defm : T_VVR_pat <V6_vaslw_acc, int_hexagon_V6_vaslw_acc>;
+defm : T_VVR_pat <V6_vasrw_acc, int_hexagon_V6_vasrw_acc>;
+
+defm : T_VWR_pat <V6_vdmpyhsuisat_acc, int_hexagon_V6_vdmpyhsuisat_acc>;
+defm : T_VWR_pat <V6_vdmpyhisat_acc, int_hexagon_V6_vdmpyhisat_acc>;
+
+defm : T_WVR_pat <V6_vmpybus_acc, int_hexagon_V6_vmpybus_acc>;
+defm : T_WVR_pat <V6_vmpyhsat_acc, int_hexagon_V6_vmpyhsat_acc>;
+defm : T_WVR_pat <V6_vmpyuh_acc, int_hexagon_V6_vmpyuh_acc>;
+defm : T_WVR_pat <V6_vmpyub_acc, int_hexagon_V6_vmpyub_acc>;
+
+defm : T_WWR_pat <V6_vtmpyb_acc, int_hexagon_V6_vtmpyb_acc>;
+defm : T_WWR_pat <V6_vtmpybus_acc, int_hexagon_V6_vtmpybus_acc>;
+defm : T_WWR_pat <V6_vtmpyhb_acc, int_hexagon_V6_vtmpyhb_acc>;
+defm : T_WWR_pat <V6_vdmpybus_dv_acc, int_hexagon_V6_vdmpybus_dv_acc>;
+defm : T_WWR_pat <V6_vdmpyhb_dv_acc, int_hexagon_V6_vdmpyhb_dv_acc>;
+defm : T_WWR_pat <V6_vmpabus_acc, int_hexagon_V6_vmpabus_acc>;
+defm : T_WWR_pat <V6_vmpahb_acc, int_hexagon_V6_vmpahb_acc>;
+defm : T_WWR_pat <V6_vdsaduh_acc, int_hexagon_V6_vdsaduh_acc>;
+
+defm : T_VVV_pat <V6_vdmpyhvsat_acc, int_hexagon_V6_vdmpyhvsat_acc>;
+defm : T_WVV_pat <V6_vmpybusv_acc, int_hexagon_V6_vmpybusv_acc>;
+defm : T_WVV_pat <V6_vmpybv_acc, int_hexagon_V6_vmpybv_acc>;
+defm : T_WVV_pat <V6_vmpyhus_acc, int_hexagon_V6_vmpyhus_acc>;
+defm : T_WVV_pat <V6_vmpyhv_acc, int_hexagon_V6_vmpyhv_acc>;
+defm : T_VVV_pat <V6_vmpyiewh_acc, int_hexagon_V6_vmpyiewh_acc>;
+defm : T_VVV_pat <V6_vmpyiewuh_acc, int_hexagon_V6_vmpyiewuh_acc>;
+defm : T_VVV_pat <V6_vmpyih_acc, int_hexagon_V6_vmpyih_acc>;
+defm : T_VVV_pat <V6_vmpyowh_rnd_sacc, int_hexagon_V6_vmpyowh_rnd_sacc>;
+defm : T_VVV_pat <V6_vmpyowh_sacc, int_hexagon_V6_vmpyowh_sacc>;
+defm : T_WVV_pat <V6_vmpyubv_acc, int_hexagon_V6_vmpyubv_acc>;
+defm : T_WVV_pat <V6_vmpyuhv_acc, int_hexagon_V6_vmpyuhv_acc>;
+defm : T_VVV_pat <V6_vrmpybusv_acc, int_hexagon_V6_vrmpybusv_acc>;
+defm : T_VVV_pat <V6_vrmpybv_acc, int_hexagon_V6_vrmpybv_acc>;
+defm : T_VVV_pat <V6_vrmpyubv_acc, int_hexagon_V6_vrmpyubv_acc>;
+
+// Compare instructions
+defm : T_QVV_pat <V6_veqb_and, int_hexagon_V6_veqb_and>;
+defm : T_QVV_pat <V6_veqh_and, int_hexagon_V6_veqh_and>;
+defm : T_QVV_pat <V6_veqw_and, int_hexagon_V6_veqw_and>;
+defm : T_QVV_pat <V6_vgtb_and, int_hexagon_V6_vgtb_and>;
+defm : T_QVV_pat <V6_vgth_and, int_hexagon_V6_vgth_and>;
+defm : T_QVV_pat <V6_vgtw_and, int_hexagon_V6_vgtw_and>;
+defm : T_QVV_pat <V6_vgtub_and, int_hexagon_V6_vgtub_and>;
+defm : T_QVV_pat <V6_vgtuh_and, int_hexagon_V6_vgtuh_and>;
+defm : T_QVV_pat <V6_vgtuw_and, int_hexagon_V6_vgtuw_and>;
+defm : T_QVV_pat <V6_veqb_or, int_hexagon_V6_veqb_or>;
+defm : T_QVV_pat <V6_veqh_or, int_hexagon_V6_veqh_or>;
+defm : T_QVV_pat <V6_veqw_or, int_hexagon_V6_veqw_or>;
+defm : T_QVV_pat <V6_vgtb_or, int_hexagon_V6_vgtb_or>;
+defm : T_QVV_pat <V6_vgth_or, int_hexagon_V6_vgth_or>;
+defm : T_QVV_pat <V6_vgtw_or, int_hexagon_V6_vgtw_or>;
+defm : T_QVV_pat <V6_vgtub_or, int_hexagon_V6_vgtub_or>;
+defm : T_QVV_pat <V6_vgtuh_or, int_hexagon_V6_vgtuh_or>;
+defm : T_QVV_pat <V6_vgtuw_or, int_hexagon_V6_vgtuw_or>;
+defm : T_QVV_pat <V6_veqb_xor, int_hexagon_V6_veqb_xor>;
+defm : T_QVV_pat <V6_veqh_xor, int_hexagon_V6_veqh_xor>;
+defm : T_QVV_pat <V6_veqw_xor, int_hexagon_V6_veqw_xor>;
+defm : T_QVV_pat <V6_vgtb_xor, int_hexagon_V6_vgtb_xor>;
+defm : T_QVV_pat <V6_vgth_xor, int_hexagon_V6_vgth_xor>;
+defm : T_QVV_pat <V6_vgtw_xor, int_hexagon_V6_vgtw_xor>;
+defm : T_QVV_pat <V6_vgtub_xor, int_hexagon_V6_vgtub_xor>;
+defm : T_QVV_pat <V6_vgtuh_xor, int_hexagon_V6_vgtuh_xor>;
+defm : T_QVV_pat <V6_vgtuw_xor, int_hexagon_V6_vgtuw_xor>;
+
+defm : T_VV_pat <V6_vminub, int_hexagon_V6_vminub>;
+defm : T_VV_pat <V6_vminuh, int_hexagon_V6_vminuh>;
+defm : T_VV_pat <V6_vminh, int_hexagon_V6_vminh>;
+defm : T_VV_pat <V6_vminw, int_hexagon_V6_vminw>;
+defm : T_VV_pat <V6_vmaxub, int_hexagon_V6_vmaxub>;
+defm : T_VV_pat <V6_vmaxuh, int_hexagon_V6_vmaxuh>;
+defm : T_VV_pat <V6_vmaxh, int_hexagon_V6_vmaxh>;
+defm : T_VV_pat <V6_vmaxw, int_hexagon_V6_vmaxw>;
+defm : T_VV_pat <V6_vdelta, int_hexagon_V6_vdelta>;
+defm : T_VV_pat <V6_vrdelta, int_hexagon_V6_vrdelta>;
+defm : T_VV_pat <V6_vdealb4w, int_hexagon_V6_vdealb4w>;
+defm : T_VV_pat <V6_vmpyowh_rnd, int_hexagon_V6_vmpyowh_rnd>;
+defm : T_VV_pat <V6_vshuffeb, int_hexagon_V6_vshuffeb>;
+defm : T_VV_pat <V6_vshuffob, int_hexagon_V6_vshuffob>;
+defm : T_VV_pat <V6_vshufeh, int_hexagon_V6_vshufeh>;
+defm : T_VV_pat <V6_vshufoh, int_hexagon_V6_vshufoh>;
+defm : T_VV_pat <V6_vshufoeh, int_hexagon_V6_vshufoeh>;
+defm : T_VV_pat <V6_vshufoeb, int_hexagon_V6_vshufoeb>;
+defm : T_VV_pat <V6_vcombine, int_hexagon_V6_vcombine>;
+defm : T_VV_pat <V6_vmpyieoh, int_hexagon_V6_vmpyieoh>;
+defm : T_VV_pat <V6_vsathub, int_hexagon_V6_vsathub>;
+defm : T_VV_pat <V6_vsatwh, int_hexagon_V6_vsatwh>;
+defm : T_VV_pat <V6_vroundwh, int_hexagon_V6_vroundwh>;
+defm : T_VV_pat <V6_vroundwuh, int_hexagon_V6_vroundwuh>;
+defm : T_VV_pat <V6_vroundhb, int_hexagon_V6_vroundhb>;
+defm : T_VV_pat <V6_vroundhub, int_hexagon_V6_vroundhub>;
+defm : T_VV_pat <V6_vasrwv, int_hexagon_V6_vasrwv>;
+defm : T_VV_pat <V6_vlsrwv, int_hexagon_V6_vlsrwv>;
+defm : T_VV_pat <V6_vlsrhv, int_hexagon_V6_vlsrhv>;
+defm : T_VV_pat <V6_vasrhv, int_hexagon_V6_vasrhv>;
+defm : T_VV_pat <V6_vaslwv, int_hexagon_V6_vaslwv>;
+defm : T_VV_pat <V6_vaslhv, int_hexagon_V6_vaslhv>;
+defm : T_VV_pat <V6_vaddb, int_hexagon_V6_vaddb>;
+defm : T_VV_pat <V6_vaddh, int_hexagon_V6_vaddh>;
+defm : T_VV_pat <V6_vmpyiewuh, int_hexagon_V6_vmpyiewuh>;
+defm : T_VV_pat <V6_vmpyiowh, int_hexagon_V6_vmpyiowh>;
+defm : T_VV_pat <V6_vpackeb, int_hexagon_V6_vpackeb>;
+defm : T_VV_pat <V6_vpackeh, int_hexagon_V6_vpackeh>;
+defm : T_VV_pat <V6_vpackhub_sat, int_hexagon_V6_vpackhub_sat>;
+defm : T_VV_pat <V6_vpackhb_sat, int_hexagon_V6_vpackhb_sat>;
+defm : T_VV_pat <V6_vpackwuh_sat, int_hexagon_V6_vpackwuh_sat>;
+defm : T_VV_pat <V6_vpackwh_sat, int_hexagon_V6_vpackwh_sat>;
+defm : T_VV_pat <V6_vpackob, int_hexagon_V6_vpackob>;
+defm : T_VV_pat <V6_vpackoh, int_hexagon_V6_vpackoh>;
+defm : T_VV_pat <V6_vmpyewuh, int_hexagon_V6_vmpyewuh>;
+defm : T_VV_pat <V6_vmpyowh, int_hexagon_V6_vmpyowh>;
+
+defm : T_QVV_pat <V6_vaddbq, int_hexagon_V6_vaddbq>;
+defm : T_QVV_pat <V6_vaddhq, int_hexagon_V6_vaddhq>;
+defm : T_QVV_pat <V6_vaddwq, int_hexagon_V6_vaddwq>;
+defm : T_QVV_pat <V6_vaddbnq, int_hexagon_V6_vaddbnq>;
+defm : T_QVV_pat <V6_vaddhnq, int_hexagon_V6_vaddhnq>;
+defm : T_QVV_pat <V6_vaddwnq, int_hexagon_V6_vaddwnq>;
+defm : T_QVV_pat <V6_vsubbq, int_hexagon_V6_vsubbq>;
+defm : T_QVV_pat <V6_vsubhq, int_hexagon_V6_vsubhq>;
+defm : T_QVV_pat <V6_vsubwq, int_hexagon_V6_vsubwq>;
+defm : T_QVV_pat <V6_vsubbnq, int_hexagon_V6_vsubbnq>;
+defm : T_QVV_pat <V6_vsubhnq, int_hexagon_V6_vsubhnq>;
+defm : T_QVV_pat <V6_vsubwnq, int_hexagon_V6_vsubwnq>;
+
+defm : T_V_pat <V6_vabsh, int_hexagon_V6_vabsh>;
+defm : T_V_pat <V6_vabsw, int_hexagon_V6_vabsw>;
+defm : T_V_pat <V6_vabsw_sat, int_hexagon_V6_vabsw_sat>;
+defm : T_V_pat <V6_vabsh_sat, int_hexagon_V6_vabsh_sat>;
+defm : T_V_pat <V6_vnot, int_hexagon_V6_vnot>;
+defm : T_V_pat <V6_vassign, int_hexagon_V6_vassign>;
+defm : T_V_pat <V6_vzb, int_hexagon_V6_vzb>;
+defm : T_V_pat <V6_vzh, int_hexagon_V6_vzh>;
+defm : T_V_pat <V6_vsb, int_hexagon_V6_vsb>;
+defm : T_V_pat <V6_vsh, int_hexagon_V6_vsh>;
+defm : T_V_pat <V6_vdealh, int_hexagon_V6_vdealh>;
+defm : T_V_pat <V6_vdealb, int_hexagon_V6_vdealb>;
+defm : T_V_pat <V6_vunpackub, int_hexagon_V6_vunpackub>;
+defm : T_V_pat <V6_vunpackuh, int_hexagon_V6_vunpackuh>;
+defm : T_V_pat <V6_vunpackb, int_hexagon_V6_vunpackb>;
+defm : T_V_pat <V6_vunpackh, int_hexagon_V6_vunpackh>;
+defm : T_V_pat <V6_vshuffh, int_hexagon_V6_vshuffh>;
+defm : T_V_pat <V6_vshuffb, int_hexagon_V6_vshuffb>;
+defm : T_V_pat <V6_vcl0w, int_hexagon_V6_vcl0w>;
+defm : T_V_pat <V6_vpopcounth, int_hexagon_V6_vpopcounth>;
+defm : T_V_pat <V6_vcl0h, int_hexagon_V6_vcl0h>;
+defm : T_V_pat <V6_vnormamtw, int_hexagon_V6_vnormamtw>;
+defm : T_V_pat <V6_vnormamth, int_hexagon_V6_vnormamth>;
+
+defm : T_WRI_pat <V6_vrmpybusi, int_hexagon_V6_vrmpybusi>;
+defm : T_WRI_pat <V6_vrsadubi, int_hexagon_V6_vrsadubi>;
+defm : T_WRI_pat <V6_vrmpyubi, int_hexagon_V6_vrmpyubi>;
+
+defm : T_WWRI_pat <V6_vrmpybusi_acc, int_hexagon_V6_vrmpybusi_acc>;
+defm : T_WWRI_pat <V6_vrsadubi_acc, int_hexagon_V6_vrsadubi_acc>;
+defm : T_WWRI_pat <V6_vrmpyubi_acc, int_hexagon_V6_vrmpyubi_acc>;
+
+// Assembler mapped.
+//defm : T_V_pat <V6_vtran2x2, int_hexagon_V6_vtran2x2>;
+// Not present earlier; the corresponding intrinsic still needs to be added.
+defm : T_VVR_pat <V6_valignb, int_hexagon_V6_valignb>;
+defm : T_VVR_pat <V6_vlalignb, int_hexagon_V6_vlalignb>;
+defm : T_VVR_pat <V6_vasrwh, int_hexagon_V6_vasrwh>;
+defm : T_VVR_pat <V6_vasrwhsat, int_hexagon_V6_vasrwhsat>;
+defm : T_VVR_pat <V6_vasrwhrndsat, int_hexagon_V6_vasrwhrndsat>;
+defm : T_VVR_pat <V6_vasrwuhsat, int_hexagon_V6_vasrwuhsat>;
+defm : T_VVR_pat <V6_vasrhubsat, int_hexagon_V6_vasrhubsat>;
+defm : T_VVR_pat <V6_vasrhubrndsat, int_hexagon_V6_vasrhubrndsat>;
+defm : T_VVR_pat <V6_vasrhbrndsat, int_hexagon_V6_vasrhbrndsat>;
+
+defm : T_VVR_pat <V6_vshuffvdd, int_hexagon_V6_vshuffvdd>;
+defm : T_VVR_pat <V6_vdealvdd, int_hexagon_V6_vdealvdd>;
+
+defm : T_WV_pat <V6_vunpackob, int_hexagon_V6_vunpackob>;
+defm : T_WV_pat <V6_vunpackoh, int_hexagon_V6_vunpackoh>;
+defm : T_VVI_pat <V6_valignbi, int_hexagon_V6_valignbi>;
+defm : T_VVI_pat <V6_vlalignbi, int_hexagon_V6_vlalignbi>;
+
+defm : T_QVV_pat <V6_vswap, int_hexagon_V6_vswap>;
+defm : T_QVV_pat <V6_vmux, int_hexagon_V6_vmux>;
+defm : T_QQ_pat <V6_pred_and, int_hexagon_V6_pred_and>;
+defm : T_QQ_pat <V6_pred_or, int_hexagon_V6_pred_or>;
+defm : T_Q_pat <V6_pred_not, int_hexagon_V6_pred_not>;
+defm : T_QQ_pat <V6_pred_xor, int_hexagon_V6_pred_xor>;
+defm : T_QQ_pat <V6_pred_or_n, int_hexagon_V6_pred_or_n>;
+defm : T_QQ_pat <V6_pred_and_n, int_hexagon_V6_pred_and_n>;
+defm : T_VV_pat <V6_veqb, int_hexagon_V6_veqb>;
+defm : T_VV_pat <V6_veqh, int_hexagon_V6_veqh>;
+defm : T_VV_pat <V6_veqw, int_hexagon_V6_veqw>;
+defm : T_VV_pat <V6_vgtb, int_hexagon_V6_vgtb>;
+defm : T_VV_pat <V6_vgth, int_hexagon_V6_vgth>;
+defm : T_VV_pat <V6_vgtw, int_hexagon_V6_vgtw>;
+defm : T_VV_pat <V6_vgtub, int_hexagon_V6_vgtub>;
+defm : T_VV_pat <V6_vgtuh, int_hexagon_V6_vgtuh>;
+defm : T_VV_pat <V6_vgtuw, int_hexagon_V6_vgtuw>;
+
+defm : T_VQR_pat <V6_vandqrt_acc, int_hexagon_V6_vandqrt_acc>;
+defm : T_QVR_pat <V6_vandvrt_acc, int_hexagon_V6_vandvrt_acc>;
+defm : T_QR_pat <V6_vandqrt, int_hexagon_V6_vandqrt>;
+defm : T_R_pat <V6_lvsplatw, int_hexagon_V6_lvsplatw>;
+defm : T_R_pat <V6_pred_scalar2, int_hexagon_V6_pred_scalar2>;
+defm : T_VR_pat <V6_vandvrt, int_hexagon_V6_vandvrt>;
+
+defm : T_VVR_pat <V6_vlutvvb, int_hexagon_V6_vlutvvb>;
+defm : T_VVR_pat <V6_vlutvwh, int_hexagon_V6_vlutvwh>;
+defm : T_VVVR_pat <V6_vlutvvb_oracc, int_hexagon_V6_vlutvvb_oracc>;
+defm : T_WVVR_pat <V6_vlutvwh_oracc, int_hexagon_V6_vlutvwh_oracc>;
+
+defm : T_QVR_pat <V6_vandvrt_acc, int_hexagon_V6_vandvrt_acc>;
+def : T_PI_pat <S6_rol_i_p, int_hexagon_S6_rol_i_p>;
+def : T_RI_pat <S6_rol_i_r, int_hexagon_S6_rol_i_r>;
+def : T_PPI_pat <S6_rol_i_p_nac, int_hexagon_S6_rol_i_p_nac>;
+def : T_PPI_pat <S6_rol_i_p_acc, int_hexagon_S6_rol_i_p_acc>;
+def : T_PPI_pat <S6_rol_i_p_and, int_hexagon_S6_rol_i_p_and>;
+def : T_PPI_pat <S6_rol_i_p_or, int_hexagon_S6_rol_i_p_or>;
+def : T_PPI_pat <S6_rol_i_p_xacc, int_hexagon_S6_rol_i_p_xacc>;
+def : T_RRI_pat <S6_rol_i_r_nac, int_hexagon_S6_rol_i_r_nac>;
+def : T_RRI_pat <S6_rol_i_r_acc, int_hexagon_S6_rol_i_r_acc>;
+def : T_RRI_pat <S6_rol_i_r_and, int_hexagon_S6_rol_i_r_and>;
+def : T_RRI_pat <S6_rol_i_r_or, int_hexagon_S6_rol_i_r_or>;
+def : T_RRI_pat <S6_rol_i_r_xacc, int_hexagon_S6_rol_i_r_xacc>;
+
+defm : T_VR_pat <V6_extractw, int_hexagon_V6_extractw>;
+defm : T_VR_pat <V6_vinsertwr, int_hexagon_V6_vinsertwr>;
+
+def : T_PPQ_pat <S2_cabacencbin, int_hexagon_S2_cabacencbin>;
+
+def: Pat<(v64i16 (trunc v64i32:$Vdd)),
+ (v64i16 (V6_vpackwh_sat_128B
+ (v32i32 (HEXAGON_V6_hi_128B VecDblRegs128B:$Vdd)),
+ (v32i32 (HEXAGON_V6_lo_128B VecDblRegs128B:$Vdd))))>,
+ Requires<[UseHVXDbl]>;
+
+
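Each defm line above instantiates a pair of selection patterns (64-byte and 128-byte HVX modes) tying one int_hexagon_V6_* intrinsic to its machine instruction. As a hedged illustration only, the builtin name and vector typedef below follow the usual __builtin_HEXAGON_* conventions and are assumptions, not part of this patch; code that ultimately exercises one of these patterns looks roughly like:

// Hedged sketch (assumed names): a single-mode HVX vector is 64 bytes wide,
// and the Clang builtin lowers to int_hexagon_V6_vaddh, which the T_VV_pat
// entry above selects to the V6_vaddh instruction.
typedef int HVX_Vector __attribute__((__vector_size__(64)));

HVX_Vector add_halfwords(HVX_Vector a, HVX_Vector b) {
  return __builtin_HEXAGON_V6_vaddh(a, b);
}
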
diff --git a/lib/Target/Hexagon/HexagonMCInstLower.cpp b/lib/Target/Hexagon/HexagonMCInstLower.cpp
index 75189b696ea2..624c0f6cf49d 100644
--- a/lib/Target/Hexagon/HexagonMCInstLower.cpp
+++ b/lib/Target/Hexagon/HexagonMCInstLower.cpp
@@ -26,39 +26,71 @@
using namespace llvm;
-static MCOperand GetSymbolRef(const MachineOperand& MO, const MCSymbol* Symbol,
- HexagonAsmPrinter& Printer) {
+namespace llvm {
+ void HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI,
+ MCInst &MCB, HexagonAsmPrinter &AP);
+}
+
+static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
+ HexagonAsmPrinter &Printer) {
MCContext &MC = Printer.OutContext;
const MCExpr *ME;
- ME = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, MC);
+ // Populate the relocation type based on Hexagon target flags
+ // set on an operand
+ MCSymbolRefExpr::VariantKind RelocationType;
+ switch (MO.getTargetFlags()) {
+ default:
+ RelocationType = MCSymbolRefExpr::VK_None;
+ break;
+ case HexagonII::MO_PCREL:
+ RelocationType = MCSymbolRefExpr::VK_Hexagon_PCREL;
+ break;
+ case HexagonII::MO_GOT:
+ RelocationType = MCSymbolRefExpr::VK_GOT;
+ break;
+ case HexagonII::MO_LO16:
+ RelocationType = MCSymbolRefExpr::VK_Hexagon_LO16;
+ break;
+ case HexagonII::MO_HI16:
+ RelocationType = MCSymbolRefExpr::VK_Hexagon_HI16;
+ break;
+ case HexagonII::MO_GPREL:
+ RelocationType = MCSymbolRefExpr::VK_Hexagon_GPREL;
+ break;
+ }
+
+ ME = MCSymbolRefExpr::create(Symbol, RelocationType, MC);
if (!MO.isJTI() && MO.getOffset())
ME = MCBinaryExpr::createAdd(ME, MCConstantExpr::create(MO.getOffset(), MC),
MC);
- return (MCOperand::createExpr(ME));
+ return MCOperand::createExpr(ME);
}
// Create an MCInst from a MachineInstr
-void llvm::HexagonLowerToMC(MachineInstr const* MI, MCInst& MCB,
- HexagonAsmPrinter& AP) {
- if(MI->getOpcode() == Hexagon::ENDLOOP0){
+void llvm::HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI,
+ MCInst &MCB, HexagonAsmPrinter &AP) {
+ if (MI->getOpcode() == Hexagon::ENDLOOP0) {
HexagonMCInstrInfo::setInnerLoop(MCB);
return;
}
- if(MI->getOpcode() == Hexagon::ENDLOOP1){
+ if (MI->getOpcode() == Hexagon::ENDLOOP1) {
HexagonMCInstrInfo::setOuterLoop(MCB);
return;
}
- MCInst* MCI = new (AP.OutContext) MCInst;
+ MCInst *MCI = new (AP.OutContext) MCInst;
MCI->setOpcode(MI->getOpcode());
assert(MCI->getOpcode() == static_cast<unsigned>(MI->getOpcode()) &&
"MCI opcode should have been set on construction");
+ bool MustExtend = false;
for (unsigned i = 0, e = MI->getNumOperands(); i < e; i++) {
const MachineOperand &MO = MI->getOperand(i);
MCOperand MCO;
+ if (MO.getTargetFlags() & HexagonII::HMOTF_ConstExtended)
+ MustExtend = true;
switch (MO.getType()) {
default:
@@ -73,11 +105,14 @@ void llvm::HexagonLowerToMC(MachineInstr const* MI, MCInst& MCB,
APFloat Val = MO.getFPImm()->getValueAPF();
// FP immediates are used only when setting GPRs, so they may be dealt
// with like regular immediates from this point on.
- MCO = MCOperand::createImm(*Val.bitcastToAPInt().getRawData());
+ MCO = MCOperand::createExpr(
+ MCConstantExpr::create(*Val.bitcastToAPInt().getRawData(),
+ AP.OutContext));
break;
}
case MachineOperand::MO_Immediate:
- MCO = MCOperand::createImm(MO.getImm());
+ MCO = MCOperand::createExpr(
+ MCConstantExpr::create(MO.getImm(), AP.OutContext));
break;
case MachineOperand::MO_MachineBasicBlock:
MCO = MCOperand::createExpr
@@ -104,5 +139,8 @@ void llvm::HexagonLowerToMC(MachineInstr const* MI, MCInst& MCB,
MCI->addOperand(MCO);
}
+ AP.HexagonProcessInstruction(*MCI, *MI);
+ HexagonMCInstrInfo::extendIfNeeded(AP.OutContext, MCII, MCB, *MCI,
+ MustExtend);
MCB.addOperand(MCOperand::createInst(MCI));
}
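HexagonLowerToMC now takes the MCInstrInfo so that constant extenders can be emitted as part of lowering, and the asm-printer hook is invoked on the lowered MCInst. A minimal caller sketch; the header and function names used here are assumptions and are not shown in this patch:

#include "HexagonAsmPrinter.h"              // assumed to declare HexagonLowerToMC
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
using namespace llvm;

// Hedged sketch of a caller of the new three-argument interface.
static void lowerOneInstr(HexagonAsmPrinter &AP, const MCInstrInfo &MCII,
                          const MachineInstr *MI) {
  MCInst Bundle;                            // instruction packet under construction
  HexagonLowerToMC(MCII, MI, Bundle, AP);   // lower MI, extending constants if needed
  // ...the bundle is later checked, shuffled, and handed to the MCStreamer...
}
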
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index 35f732cd6207..7a52d6874c33 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -179,7 +179,11 @@ void VLIWMachineScheduler::schedule() {
initQueues(TopRoots, BotRoots);
bool IsTopNode = false;
- while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) {
+ while (true) {
+ DEBUG(dbgs() << "** VLIWMachineScheduler::schedule picking next node\n");
+ SUnit *SU = SchedImpl->pickNode(IsTopNode);
+ if (!SU) break;
+
if (!checkSchedLimit())
break;
diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp
index 707bfdbb6ab6..20c4ab112b5f 100644
--- a/lib/Target/Hexagon/HexagonNewValueJump.cpp
+++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -92,6 +92,7 @@ namespace {
/// \brief A handle to the branch probability pass.
const MachineBranchProbabilityInfo *MBPI;
+ bool isNewValueJumpCandidate(const MachineInstr *MI) const;
};
} // end of anonymous namespace
@@ -280,9 +281,9 @@ static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII,
return true;
}
-// Given a compare operator, return a matching New Value Jump
-// compare operator. Make sure that MI here is included in
-// HexagonInstrInfo.cpp::isNewValueJumpCandidate
+
+// Given a compare operator, return a matching New Value Jump compare operator.
+// Make sure that MI here is included in isNewValueJumpCandidate.
static unsigned getNewValueJumpOpcode(MachineInstr *MI, int reg,
bool secondRegNewified,
MachineBasicBlock *jmpTarget,
@@ -341,6 +342,24 @@ static unsigned getNewValueJumpOpcode(MachineInstr *MI, int reg,
return taken ? Hexagon::J4_cmpgtui_t_jumpnv_t
: Hexagon::J4_cmpgtui_t_jumpnv_nt;
+ case Hexagon::C4_cmpneq:
+ return taken ? Hexagon::J4_cmpeq_f_jumpnv_t
+ : Hexagon::J4_cmpeq_f_jumpnv_nt;
+
+ case Hexagon::C4_cmplte:
+ if (secondRegNewified)
+ return taken ? Hexagon::J4_cmplt_f_jumpnv_t
+ : Hexagon::J4_cmplt_f_jumpnv_nt;
+ return taken ? Hexagon::J4_cmpgt_f_jumpnv_t
+ : Hexagon::J4_cmpgt_f_jumpnv_nt;
+
+ case Hexagon::C4_cmplteu:
+ if (secondRegNewified)
+ return taken ? Hexagon::J4_cmpltu_f_jumpnv_t
+ : Hexagon::J4_cmpltu_f_jumpnv_nt;
+ return taken ? Hexagon::J4_cmpgtu_f_jumpnv_t
+ : Hexagon::J4_cmpgtu_f_jumpnv_nt;
+
default:
llvm_unreachable("Could not find matching New Value Jump instruction.");
}
@@ -348,6 +367,26 @@ static unsigned getNewValueJumpOpcode(MachineInstr *MI, int reg,
return 0;
}
+bool HexagonNewValueJump::isNewValueJumpCandidate(const MachineInstr *MI)
+ const {
+ switch (MI->getOpcode()) {
+ case Hexagon::C2_cmpeq:
+ case Hexagon::C2_cmpeqi:
+ case Hexagon::C2_cmpgt:
+ case Hexagon::C2_cmpgti:
+ case Hexagon::C2_cmpgtu:
+ case Hexagon::C2_cmpgtui:
+ case Hexagon::C4_cmpneq:
+ case Hexagon::C4_cmplte:
+ case Hexagon::C4_cmplteu:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+
bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "********** Hexagon New Value Jump **********\n"
@@ -372,7 +411,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
// Loop through all the bb's of the function
for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end();
MBBb != MBBe; ++MBBb) {
- MachineBasicBlock* MBB = MBBb;
+ MachineBasicBlock *MBB = &*MBBb;
DEBUG(dbgs() << "** dumping bb ** "
<< MBB->getNumber() << "\n");
@@ -468,7 +507,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
MI->getOperand(0).getReg() == predReg) {
// Not all compares can be new value compare. Arch Spec: 7.6.1.1
- if (QII->isNewValueJumpCandidate(MI)) {
+ if (isNewValueJumpCandidate(MI)) {
assert((MI->getDesc().isCompare()) &&
"Only compare instruction can be collapsed into New Value Jump");
@@ -591,8 +630,8 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
DebugLoc dl = MI->getDebugLoc();
MachineInstr *NewMI;
- assert((QII->isNewValueJumpCandidate(cmpInstr)) &&
- "This compare is not a New Value Jump candidate.");
+ assert((isNewValueJumpCandidate(cmpInstr)) &&
+ "This compare is not a New Value Jump candidate.");
unsigned opc = getNewValueJumpOpcode(cmpInstr, cmpOp2,
isSecondOpNewified,
jmpTarget, MBPI);
diff --git a/lib/Target/Hexagon/HexagonOperands.td b/lib/Target/Hexagon/HexagonOperands.td
index 2bece8f42f53..fbd29cd4d6d1 100644
--- a/lib/Target/Hexagon/HexagonOperands.td
+++ b/lib/Target/Hexagon/HexagonOperands.td
@@ -1,4 +1,4 @@
-//===- HexagonOperands.td - Hexagon immediate processing -*- tablegen -*-===//
+//===- HexagonImmediates.td - Hexagon immediate processing -*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,59 +7,114 @@
//
//===----------------------------------------------------------------------===//
+def s32ImmOperand : AsmOperandClass { let Name = "s32Imm"; }
+def s8ImmOperand : AsmOperandClass { let Name = "s8Imm"; }
+def s8Imm64Operand : AsmOperandClass { let Name = "s8Imm64"; }
+def s6ImmOperand : AsmOperandClass { let Name = "s6Imm"; }
+def s4ImmOperand : AsmOperandClass { let Name = "s4Imm"; }
def s4_0ImmOperand : AsmOperandClass { let Name = "s4_0Imm"; }
def s4_1ImmOperand : AsmOperandClass { let Name = "s4_1Imm"; }
def s4_2ImmOperand : AsmOperandClass { let Name = "s4_2Imm"; }
def s4_3ImmOperand : AsmOperandClass { let Name = "s4_3Imm"; }
-
+def s4_6ImmOperand : AsmOperandClass { let Name = "s4_6Imm"; }
+def s3_6ImmOperand : AsmOperandClass { let Name = "s3_6Imm"; }
+def u64ImmOperand : AsmOperandClass { let Name = "u64Imm"; }
+def u32ImmOperand : AsmOperandClass { let Name = "u32Imm"; }
+def u26_6ImmOperand : AsmOperandClass { let Name = "u26_6Imm"; }
+def u16ImmOperand : AsmOperandClass { let Name = "u16Imm"; }
+def u16_0ImmOperand : AsmOperandClass { let Name = "u16_0Imm"; }
+def u16_1ImmOperand : AsmOperandClass { let Name = "u16_1Imm"; }
+def u16_2ImmOperand : AsmOperandClass { let Name = "u16_2Imm"; }
+def u16_3ImmOperand : AsmOperandClass { let Name = "u16_3Imm"; }
+def u11_3ImmOperand : AsmOperandClass { let Name = "u11_3Imm"; }
+def u10ImmOperand : AsmOperandClass { let Name = "u10Imm"; }
+def u9ImmOperand : AsmOperandClass { let Name = "u9Imm"; }
+def u8ImmOperand : AsmOperandClass { let Name = "u8Imm"; }
+def u7ImmOperand : AsmOperandClass { let Name = "u7Imm"; }
+def u6ImmOperand : AsmOperandClass { let Name = "u6Imm"; }
+def u6_0ImmOperand : AsmOperandClass { let Name = "u6_0Imm"; }
+def u6_1ImmOperand : AsmOperandClass { let Name = "u6_1Imm"; }
+def u6_2ImmOperand : AsmOperandClass { let Name = "u6_2Imm"; }
+def u6_3ImmOperand : AsmOperandClass { let Name = "u6_3Imm"; }
+def u5ImmOperand : AsmOperandClass { let Name = "u5Imm"; }
+def u4ImmOperand : AsmOperandClass { let Name = "u4Imm"; }
+def u3ImmOperand : AsmOperandClass { let Name = "u3Imm"; }
+def u2ImmOperand : AsmOperandClass { let Name = "u2Imm"; }
+def u1ImmOperand : AsmOperandClass { let Name = "u1Imm"; }
+def n8ImmOperand : AsmOperandClass { let Name = "n8Imm"; }
// Immediate operands.
-let PrintMethod = "printImmOperand" in {
- def s32Imm : Operand<i32>;
- def s8Imm : Operand<i32>;
- def s8Imm64 : Operand<i64>;
- def s6Imm : Operand<i32>;
+let OperandType = "OPERAND_IMMEDIATE",
+ DecoderMethod = "unsignedImmDecoder" in {
+ def s32Imm : Operand<i32> { let ParserMatchClass = s32ImmOperand;
+ let DecoderMethod = "s32ImmDecoder"; }
+ def s8Imm : Operand<i32> { let ParserMatchClass = s8ImmOperand;
+ let DecoderMethod = "s8ImmDecoder"; }
+ def s8Imm64 : Operand<i64> { let ParserMatchClass = s8Imm64Operand;
+ let DecoderMethod = "s8ImmDecoder"; }
+ def s6Imm : Operand<i32> { let ParserMatchClass = s6ImmOperand;
+ let DecoderMethod = "s6_0ImmDecoder"; }
def s6_3Imm : Operand<i32>;
- def s4Imm : Operand<i32>;
- def s4_0Imm : Operand<i32> { let DecoderMethod = "s4_0ImmDecoder"; }
- def s4_1Imm : Operand<i32> { let DecoderMethod = "s4_1ImmDecoder"; }
- def s4_2Imm : Operand<i32> { let DecoderMethod = "s4_2ImmDecoder"; }
- def s4_3Imm : Operand<i32> { let DecoderMethod = "s4_3ImmDecoder"; }
- def u64Imm : Operand<i64>;
- def u32Imm : Operand<i32>;
- def u26_6Imm : Operand<i32>;
- def u16Imm : Operand<i32>;
- def u16_0Imm : Operand<i32>;
- def u16_1Imm : Operand<i32>;
- def u16_2Imm : Operand<i32>;
- def u16_3Imm : Operand<i32>;
- def u11_3Imm : Operand<i32>;
- def u10Imm : Operand<i32>;
- def u9Imm : Operand<i32>;
- def u8Imm : Operand<i32>;
- def u7Imm : Operand<i32>;
- def u6Imm : Operand<i32>;
- def u6_0Imm : Operand<i32>;
- def u6_1Imm : Operand<i32>;
- def u6_2Imm : Operand<i32>;
- def u6_3Imm : Operand<i32>;
- def u5Imm : Operand<i32>;
+ def s4Imm : Operand<i32> { let ParserMatchClass = s4ImmOperand;
+ let DecoderMethod = "s4_0ImmDecoder"; }
+ def s4_0Imm : Operand<i32> { let ParserMatchClass = s4_0ImmOperand;
+ let DecoderMethod = "s4_0ImmDecoder"; }
+ def s4_1Imm : Operand<i32> { let ParserMatchClass = s4_1ImmOperand;
+ let DecoderMethod = "s4_1ImmDecoder"; }
+ def s4_2Imm : Operand<i32> { let ParserMatchClass = s4_2ImmOperand;
+ let DecoderMethod = "s4_2ImmDecoder"; }
+ def s4_3Imm : Operand<i32> { let ParserMatchClass = s4_3ImmOperand;
+ let DecoderMethod = "s4_3ImmDecoder"; }
+ def u64Imm : Operand<i64> { let ParserMatchClass = u64ImmOperand; }
+ def u32Imm : Operand<i32> { let ParserMatchClass = u32ImmOperand; }
+ def u26_6Imm : Operand<i32> { let ParserMatchClass = u26_6ImmOperand; }
+ def u16Imm : Operand<i32> { let ParserMatchClass = u16ImmOperand; }
+ def u16_0Imm : Operand<i32> { let ParserMatchClass = u16_0ImmOperand; }
+ def u16_1Imm : Operand<i32> { let ParserMatchClass = u16_1ImmOperand; }
+ def u16_2Imm : Operand<i32> { let ParserMatchClass = u16_2ImmOperand; }
+ def u16_3Imm : Operand<i32> { let ParserMatchClass = u16_3ImmOperand; }
+ def u11_3Imm : Operand<i32> { let ParserMatchClass = u11_3ImmOperand; }
+ def u10Imm : Operand<i32> { let ParserMatchClass = u10ImmOperand; }
+ def u9Imm : Operand<i32> { let ParserMatchClass = u9ImmOperand; }
+ def u8Imm : Operand<i32> { let ParserMatchClass = u8ImmOperand; }
+ def u7Imm : Operand<i32> { let ParserMatchClass = u7ImmOperand; }
+ def u6Imm : Operand<i32> { let ParserMatchClass = u6ImmOperand; }
+ def u6_0Imm : Operand<i32> { let ParserMatchClass = u6_0ImmOperand; }
+ def u6_1Imm : Operand<i32> { let ParserMatchClass = u6_1ImmOperand; }
+ def u6_2Imm : Operand<i32> { let ParserMatchClass = u6_2ImmOperand; }
+ def u6_3Imm : Operand<i32> { let ParserMatchClass = u6_3ImmOperand; }
+ def u5Imm : Operand<i32> { let ParserMatchClass = u5ImmOperand; }
+ def u5_0Imm : Operand<i32>;
+ def u5_1Imm : Operand<i32>;
def u5_2Imm : Operand<i32>;
def u5_3Imm : Operand<i32>;
- def u4Imm : Operand<i32>;
+ def u4Imm : Operand<i32> { let ParserMatchClass = u4ImmOperand; }
def u4_0Imm : Operand<i32>;
+ def u4_1Imm : Operand<i32>;
def u4_2Imm : Operand<i32>;
- def u3Imm : Operand<i32>;
+ def u4_3Imm : Operand<i32>;
+ def u3Imm : Operand<i32> { let ParserMatchClass = u3ImmOperand; }
def u3_0Imm : Operand<i32>;
def u3_1Imm : Operand<i32>;
- def u2Imm : Operand<i32>;
- def u1Imm : Operand<i32>;
- def n8Imm : Operand<i32>;
- def m6Imm : Operand<i32>;
+ def u3_2Imm : Operand<i32>;
+ def u3_3Imm : Operand<i32>;
+ def u2Imm : Operand<i32> { let ParserMatchClass = u2ImmOperand; }
+ def u1Imm : Operand<i32> { let ParserMatchClass = u1ImmOperand; }
+ def n8Imm : Operand<i32> { let ParserMatchClass = n8ImmOperand; }
}
-let PrintMethod = "printNOneImmOperand" in
-def nOneImm : Operand<i32>;
+let OperandType = "OPERAND_IMMEDIATE" in {
+ def s4_6Imm : Operand<i32> { let ParserMatchClass = s4_6ImmOperand;
+ let PrintMethod = "prints4_6ImmOperand";
+ let DecoderMethod = "s4_6ImmDecoder";}
+ def s4_7Imm : Operand<i32> { let PrintMethod = "prints4_7ImmOperand";
+ let DecoderMethod = "s4_6ImmDecoder";}
+ def s3_6Imm : Operand<i32> { let ParserMatchClass = s3_6ImmOperand;
+ let PrintMethod = "prints3_6ImmOperand";
+ let DecoderMethod = "s3_6ImmDecoder";}
+ def s3_7Imm : Operand<i32> { let PrintMethod = "prints3_7ImmOperand";
+ let DecoderMethod = "s3_6ImmDecoder";}
+}
//
// Immediate predicates
@@ -81,32 +136,12 @@ def s31_1ImmPred : PatLeaf<(i32 imm), [{
def s30_2ImmPred : PatLeaf<(i32 imm), [{
int64_t v = (int64_t)N->getSExtValue();
- return isShiftedInt<31,1>(v);
+ return isShiftedInt<30,2>(v);
}]>;
def s29_3ImmPred : PatLeaf<(i32 imm), [{
int64_t v = (int64_t)N->getSExtValue();
- return isShiftedInt<31,1>(v);
-}]>;
-
-def s22_10ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isShiftedInt<22,10>(v);
-}]>;
-
-def s8_24ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isShiftedInt<8,24>(v);
-}]>;
-
-def s16_16ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isShiftedInt<16,16>(v);
-}]>;
-
-def s26_6ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isShiftedInt<26,6>(v);
+ return isShiftedInt<29,3>(v);
}]>;
def s16ImmPred : PatLeaf<(i32 imm), [{
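The two corrections above matter because isShiftedInt<N, S>(v), from llvm/Support/MathExtras.h, is true only when v is a multiple of 2^S whose scaled value fits in an N-bit signed field, so a #s30_2 operand really needs <30, 2> rather than the copy-pasted <31, 1>. A small self-contained illustration, not part of the patch:

#include "llvm/Support/MathExtras.h"
#include <cassert>

static void checkShiftedIntSemantics() {
  // isShiftedInt<N, S>(v): v is a multiple of 2^S and (v >> S) fits in N signed bits.
  assert(llvm::isShiftedInt<30, 2>(8));    // 8 == 2 << 2, a valid s30_2 value
  assert(!llvm::isShiftedInt<30, 2>(6));   // 6 is not a multiple of 4
  assert(llvm::isShiftedInt<31, 1>(6));    // the old check accepted any even value
}
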
@@ -114,16 +149,6 @@ def s16ImmPred : PatLeaf<(i32 imm), [{
return isInt<16>(v);
}]>;
-def s13ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isInt<13>(v);
-}]>;
-
-def s12ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isInt<12>(v);
-}]>;
-
def s11_0ImmPred : PatLeaf<(i32 imm), [{
int64_t v = (int64_t)N->getSExtValue();
return isInt<11>(v);
@@ -149,16 +174,6 @@ def s10ImmPred : PatLeaf<(i32 imm), [{
return isInt<10>(v);
}]>;
-def s9ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isInt<9>(v);
-}]>;
-
-def m9ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isInt<9>(v) && (v != -256);
-}]>;
-
def s8ImmPred : PatLeaf<(i32 imm), [{
int64_t v = (int64_t)N->getSExtValue();
return isInt<8>(v);
@@ -194,7 +209,6 @@ def s4_3ImmPred : PatLeaf<(i32 imm), [{
return isShiftedInt<4,3>(v);
}]>;
-
def u64ImmPred : PatLeaf<(i64 imm), [{
// Adding "N ||" to suppress gcc unused warning.
return (N || true);
@@ -230,19 +244,19 @@ def u26_6ImmPred : PatLeaf<(i32 imm), [{
return isShiftedUInt<26,6>(v);
}]>;
-def u16ImmPred : PatLeaf<(i32 imm), [{
+def u16_0ImmPred : PatLeaf<(i32 imm), [{
int64_t v = (int64_t)N->getSExtValue();
return isUInt<16>(v);
}]>;
-def u16_s8ImmPred : PatLeaf<(i32 imm), [{
+def u16_1ImmPred : PatLeaf<(i32 imm), [{
int64_t v = (int64_t)N->getSExtValue();
- return isShiftedUInt<16,8>(v);
+ return isShiftedUInt<16,1>(v);
}]>;
-def u16_0ImmPred : PatLeaf<(i32 imm), [{
+def u16_2ImmPred : PatLeaf<(i32 imm), [{
int64_t v = (int64_t)N->getSExtValue();
- return isUInt<16>(v);
+ return isShiftedUInt<16,2>(v);
}]>;
def u11_3ImmPred : PatLeaf<(i32 imm), [{
@@ -250,6 +264,11 @@ def u11_3ImmPred : PatLeaf<(i32 imm), [{
return isShiftedUInt<11,3>(v);
}]>;
+def u10ImmPred : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isUInt<10>(v);
+}]>;
+
def u9ImmPred : PatLeaf<(i32 imm), [{
int64_t v = (int64_t)N->getSExtValue();
return isUInt<9>(v);
@@ -321,6 +340,11 @@ def u1ImmPred : PatLeaf<(i1 imm), [{
return isUInt<1>(v);
}]>;
+def u1ImmPred32 : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ return isUInt<1>(v);
+}]>;
+
def m5BImmPred : PatLeaf<(i32 imm), [{
// m5BImmPred predicate - True if the (char) number is in range -1 .. -31
// and will fit in a 5 bit field when made positive, for use in memops.
@@ -379,7 +403,7 @@ def Clr5ImmPred : PatLeaf<(i32 imm), [{
}]>;
def SetClr5ImmPred : PatLeaf<(i32 imm), [{
- // SetClr5ImmPred predicate - True if the immediate is in range 0..31.
+ // True if the immediate is in range 0..31.
int32_t v = (int32_t)N->getSExtValue();
return (v >= 0 && v <= 31);
}]>;
@@ -404,14 +428,13 @@ def Clr4ImmPred : PatLeaf<(i32 imm), [{
}]>;
def SetClr4ImmPred : PatLeaf<(i32 imm), [{
- // SetClr4ImmPred predicate - True if the immediate is in the range 0..15.
+ // True if the immediate is in the range 0..15.
int16_t v = (int16_t)N->getSExtValue();
return (v >= 0 && v <= 15);
}]>;
def Set3ImmPred : PatLeaf<(i32 imm), [{
- // Set3ImmPred predicate - True if the number is in the series of values:
- // [ 2^0, 2^1, ... 2^7 ].
+ // True if the number is in the series of values: [ 2^0, 2^1, ... 2^7 ].
// For use in setbit immediate.
uint8_t v = (int8_t)N->getSExtValue();
// Constrain to 8 bits, and then check for single bit.
@@ -419,9 +442,7 @@ def Set3ImmPred : PatLeaf<(i32 imm), [{
}]>;
def Clr3ImmPred : PatLeaf<(i32 imm), [{
- // Clr3ImmPred predicate - True if the number is in the series of
- // bit negated values:
- // [ 2^0, 2^1, ... 2^7 ].
+  // True if the number is in the series of bit negated values:
+  // [ 2^0, 2^1, ... 2^7 ].
// For use in setbit and clrbit immediate.
uint8_t v = ~ (int8_t)N->getSExtValue();
// Constrain to 8 bits, and then check for single bit.
@@ -429,76 +450,109 @@ def Clr3ImmPred : PatLeaf<(i32 imm), [{
}]>;
def SetClr3ImmPred : PatLeaf<(i32 imm), [{
- // SetClr3ImmPred predicate - True if the immediate is in the range 0..7.
+ // True if the immediate is in the range 0..7.
int8_t v = (int8_t)N->getSExtValue();
return (v >= 0 && v <= 7);
}]>;
// Extendable immediate operands.
-
-let PrintMethod = "printExtOperand" in {
- def f32Ext : Operand<f32>;
- def s16Ext : Operand<i32> { let DecoderMethod = "s16ImmDecoder"; }
- def s12Ext : Operand<i32> { let DecoderMethod = "s12ImmDecoder"; }
- def s11_0Ext : Operand<i32> { let DecoderMethod = "s11_0ImmDecoder"; }
- def s11_1Ext : Operand<i32> { let DecoderMethod = "s11_1ImmDecoder"; }
- def s11_2Ext : Operand<i32> { let DecoderMethod = "s11_2ImmDecoder"; }
- def s11_3Ext : Operand<i32> { let DecoderMethod = "s11_3ImmDecoder"; }
- def s10Ext : Operand<i32> { let DecoderMethod = "s10ImmDecoder"; }
- def s9Ext : Operand<i32> { let DecoderMethod = "s90ImmDecoder"; }
- def s8Ext : Operand<i32> { let DecoderMethod = "s8ImmDecoder"; }
- def s7Ext : Operand<i32>;
- def s6Ext : Operand<i32> { let DecoderMethod = "s6_0ImmDecoder"; }
- def u6Ext : Operand<i32>;
- def u7Ext : Operand<i32>;
- def u8Ext : Operand<i32>;
- def u9Ext : Operand<i32>;
- def u10Ext : Operand<i32>;
- def u6_0Ext : Operand<i32>;
- def u6_1Ext : Operand<i32>;
- def u6_2Ext : Operand<i32>;
- def u6_3Ext : Operand<i32>;
+def f32ExtOperand : AsmOperandClass { let Name = "f32Ext"; }
+def s16ExtOperand : AsmOperandClass { let Name = "s16Ext"; }
+def s12ExtOperand : AsmOperandClass { let Name = "s12Ext"; }
+def s10ExtOperand : AsmOperandClass { let Name = "s10Ext"; }
+def s9ExtOperand : AsmOperandClass { let Name = "s9Ext"; }
+def s8ExtOperand : AsmOperandClass { let Name = "s8Ext"; }
+def s7ExtOperand : AsmOperandClass { let Name = "s7Ext"; }
+def s6ExtOperand : AsmOperandClass { let Name = "s6Ext"; }
+def s11_0ExtOperand : AsmOperandClass { let Name = "s11_0Ext"; }
+def s11_1ExtOperand : AsmOperandClass { let Name = "s11_1Ext"; }
+def s11_2ExtOperand : AsmOperandClass { let Name = "s11_2Ext"; }
+def s11_3ExtOperand : AsmOperandClass { let Name = "s11_3Ext"; }
+def u6ExtOperand : AsmOperandClass { let Name = "u6Ext"; }
+def u7ExtOperand : AsmOperandClass { let Name = "u7Ext"; }
+def u8ExtOperand : AsmOperandClass { let Name = "u8Ext"; }
+def u9ExtOperand : AsmOperandClass { let Name = "u9Ext"; }
+def u10ExtOperand : AsmOperandClass { let Name = "u10Ext"; }
+def u6_0ExtOperand : AsmOperandClass { let Name = "u6_0Ext"; }
+def u6_1ExtOperand : AsmOperandClass { let Name = "u6_1Ext"; }
+def u6_2ExtOperand : AsmOperandClass { let Name = "u6_2Ext"; }
+def u6_3ExtOperand : AsmOperandClass { let Name = "u6_3Ext"; }
+def u32MustExtOperand : AsmOperandClass { let Name = "u32MustExt"; }
+
+
+
+let OperandType = "OPERAND_IMMEDIATE", PrintMethod = "printExtOperand",
+ DecoderMethod = "unsignedImmDecoder" in {
+ def f32Ext : Operand<f32> { let ParserMatchClass = f32ExtOperand; }
+ def s16Ext : Operand<i32> { let ParserMatchClass = s16ExtOperand;
+ let DecoderMethod = "s16ImmDecoder"; }
+ def s12Ext : Operand<i32> { let ParserMatchClass = s12ExtOperand;
+ let DecoderMethod = "s12ImmDecoder"; }
+ def s11_0Ext : Operand<i32> { let ParserMatchClass = s11_0ExtOperand;
+ let DecoderMethod = "s11_0ImmDecoder"; }
+ def s11_1Ext : Operand<i32> { let ParserMatchClass = s11_1ExtOperand;
+ let DecoderMethod = "s11_1ImmDecoder"; }
+ def s11_2Ext : Operand<i32> { let ParserMatchClass = s11_2ExtOperand;
+ let DecoderMethod = "s11_2ImmDecoder"; }
+ def s11_3Ext : Operand<i32> { let ParserMatchClass = s11_3ExtOperand;
+ let DecoderMethod = "s11_3ImmDecoder"; }
+ def s10Ext : Operand<i32> { let ParserMatchClass = s10ExtOperand;
+ let DecoderMethod = "s10ImmDecoder"; }
+ def s9Ext : Operand<i32> { let ParserMatchClass = s9ExtOperand;
+ let DecoderMethod = "s90ImmDecoder"; }
+ def s8Ext : Operand<i32> { let ParserMatchClass = s8ExtOperand;
+ let DecoderMethod = "s8ImmDecoder"; }
+ def s7Ext : Operand<i32> { let ParserMatchClass = s7ExtOperand; }
+ def s6Ext : Operand<i32> { let ParserMatchClass = s6ExtOperand;
+ let DecoderMethod = "s6_0ImmDecoder"; }
+ def u6Ext : Operand<i32> { let ParserMatchClass = u6ExtOperand; }
+ def u7Ext : Operand<i32> { let ParserMatchClass = u7ExtOperand; }
+ def u8Ext : Operand<i32> { let ParserMatchClass = u8ExtOperand; }
+ def u9Ext : Operand<i32> { let ParserMatchClass = u9ExtOperand; }
+ def u10Ext : Operand<i32> { let ParserMatchClass = u10ExtOperand; }
+ def u6_0Ext : Operand<i32> { let ParserMatchClass = u6_0ExtOperand; }
+ def u6_1Ext : Operand<i32> { let ParserMatchClass = u6_1ExtOperand; }
+ def u6_2Ext : Operand<i32> { let ParserMatchClass = u6_2ExtOperand; }
+ def u6_3Ext : Operand<i32> { let ParserMatchClass = u6_3ExtOperand; }
+ def u32MustExt : Operand<i32> { let ParserMatchClass = u32MustExtOperand; }
}
-def s10ExtPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- if (isInt<10>(v))
- return true;
- // Return true if extending this immediate is profitable and the value
- // can fit in a 32-bit signed field.
- return isConstExtProfitable(Node) && isInt<32>(v);
+def s4_7ImmPred : PatLeaf<(i32 imm), [{
+ int64_t v = (int64_t)N->getSExtValue();
+ if (HST->hasV60TOps())
+ // Return true if the immediate can fit in a 10-bit sign extended field and
+ // is 128-byte aligned.
+ return isShiftedInt<4,7>(v);
+ return false;
}]>;
-def s8ExtPred : PatLeaf<(i32 imm), [{
+def s3_7ImmPred : PatLeaf<(i32 imm), [{
int64_t v = (int64_t)N->getSExtValue();
- if (isInt<8>(v))
- return true;
-
- // Return true if extending this immediate is profitable and the value
- // can fit in a 32-bit signed field.
- return isConstExtProfitable(Node) && isInt<32>(v);
+ if (HST->hasV60TOps())
+ // Return true if the immediate can fit in a 9-bit sign extended field and
+ // is 128-byte aligned.
+ return isShiftedInt<3,7>(v);
+ return false;
}]>;
-def u8ExtPred : PatLeaf<(i32 imm), [{
+def s4_6ImmPred : PatLeaf<(i32 imm), [{
int64_t v = (int64_t)N->getSExtValue();
- if (isUInt<8>(v))
- return true;
-
- // Return true if extending this immediate is profitable and the value
- // can fit in a 32-bit unsigned field.
- return isConstExtProfitable(Node) && isUInt<32>(v);
+ if (HST->hasV60TOps())
+ // Return true if the immediate can fit in a 10-bit sign extended field and
+ // is 64-byte aligned.
+ return isShiftedInt<4,6>(v);
+ return false;
}]>;
-def u9ExtPred : PatLeaf<(i32 imm), [{
+def s3_6ImmPred : PatLeaf<(i32 imm), [{
int64_t v = (int64_t)N->getSExtValue();
- if (isUInt<9>(v))
- return true;
-
- // Return true if extending this immediate is profitable and the value
- // can fit in a 32-bit unsigned field.
- return isConstExtProfitable(Node) && isUInt<32>(v);
+ if (HST->hasV60TOps())
+ // Return true if the immediate can fit in a 9-bit sign extended field and
+ // is 64-byte aligned.
+ return isShiftedInt<3,6>(v);
+ return false;
}]>;
@@ -523,21 +577,21 @@ let PrintMethod = "printGlobalOperand" in {
let PrintMethod = "printJumpTable" in
def jumptablebase : Operand<i32>;
-def brtarget : Operand<OtherVT>;
+def brtarget : Operand<OtherVT> {
+ let DecoderMethod = "brtargetDecoder";
+ let PrintMethod = "printBrtarget";
+}
def brtargetExt : Operand<OtherVT> {
- let PrintMethod = "printExtBrtarget";
+ let DecoderMethod = "brtargetDecoder";
+ let PrintMethod = "printBrtarget";
+}
+def calltarget : Operand<i32> {
+ let DecoderMethod = "brtargetDecoder";
+ let PrintMethod = "printBrtarget";
}
-def calltarget : Operand<i32>;
def bblabel : Operand<i32>;
-def bbl : SDNode<"ISD::BasicBlock", SDTPtrLeaf , [], "BasicBlockSDNode">;
-
-def symbolHi32 : Operand<i32> {
- let PrintMethod = "printSymbolHi";
-}
-def symbolLo32 : Operand<i32> {
- let PrintMethod = "printSymbolLo";
-}
+def bbl : SDNode<"ISD::BasicBlock", SDTPtrLeaf, [], "BasicBlockSDNode">;
// Return true if for a 32 to 64-bit sign-extended load.
def is_sext_i32 : PatLeaf<(i64 DoubleRegs:$src1), [{
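The ParserMatchClass assignments above are what connect each operand to the assembler. By TableGen convention, an AsmOperandClass whose Name is, say, "u5Imm" makes the generated matcher call a predicate named isu5Imm() on the parser's operand class. The sketch below shows the shape of such a check as a free function; the helper name and the use of the parsed MCExpr are assumptions for illustration, not code from this patch:

#include "llvm/MC/MCExpr.h"
#include "llvm/Support/MathExtras.h"

// Hedged sketch: accept only absolute constants that fit an unsigned 5-bit field.
static bool isU5ImmCandidate(const llvm::MCExpr *Imm) {
  int64_t Val;
  return Imm && Imm->evaluateAsAbsolute(Val) && llvm::isUInt<5>(Val);
}
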
diff --git a/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp b/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp
new file mode 100644
index 000000000000..1723771550c9
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp
@@ -0,0 +1,150 @@
+//===- HexagonOptimizeSZextends.cpp - Remove unnecessary argument extends -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Pass that removes sign extends for function parameters. These parameters
+// are already sign extended by the caller per Hexagon's ABI
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+
+#include "Hexagon.h"
+
+using namespace llvm;
+
+namespace llvm {
+ FunctionPass *createHexagonOptimizeSZextends();
+ void initializeHexagonOptimizeSZextendsPass(PassRegistry&);
+}
+
+namespace {
+ struct HexagonOptimizeSZextends : public FunctionPass {
+ public:
+ static char ID;
+ HexagonOptimizeSZextends() : FunctionPass(ID) {
+ initializeHexagonOptimizeSZextendsPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+
+ const char *getPassName() const override {
+ return "Remove sign extends";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineFunctionAnalysis>();
+ AU.addPreserved<MachineFunctionAnalysis>();
+ AU.addPreserved<StackProtector>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool intrinsicAlreadySextended(Intrinsic::ID IntID);
+ };
+}
+
+char HexagonOptimizeSZextends::ID = 0;
+
+INITIALIZE_PASS(HexagonOptimizeSZextends, "reargs",
+ "Remove Sign and Zero Extends for Args", false, false)
+
+bool HexagonOptimizeSZextends::intrinsicAlreadySextended(Intrinsic::ID IntID) {
+ switch(IntID) {
+ case llvm::Intrinsic::hexagon_A2_addh_l16_sat_ll:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+bool HexagonOptimizeSZextends::runOnFunction(Function &F) {
+ unsigned Idx = 1;
+  // Try to optimize sign extends in formal parameters. This relies on the
+  // caller having already sign extended the values (see the file header);
+  // it is not entirely clear that our ABI actually requires this.
+ for (auto &Arg : F.args()) {
+ if (F.getAttributes().hasAttribute(Idx, Attribute::SExt)) {
+ if (!isa<PointerType>(Arg.getType())) {
+ for (auto UI = Arg.use_begin(); UI != Arg.use_end();) {
+ if (isa<SExtInst>(*UI)) {
+ Instruction* Use = cast<Instruction>(*UI);
+ SExtInst* SI = new SExtInst(&Arg, Use->getType());
+ assert (EVT::getEVT(SI->getType()) ==
+ (EVT::getEVT(Use->getType())));
+ ++UI;
+ Use->replaceAllUsesWith(SI);
+ Instruction* First = &F.getEntryBlock().front();
+ SI->insertBefore(First);
+ Use->eraseFromParent();
+ } else {
+ ++UI;
+ }
+ }
+ }
+ }
+ ++Idx;
+ }
+
+ // Try to remove redundant sext operations on Hexagon. The hardware
+ // already sign extends many 16 bit intrinsic operations to 32 bits.
+ // For example:
+ // %34 = tail call i32 @llvm.hexagon.A2.addh.l16.sat.ll(i32 %x, i32 %y)
+ // %sext233 = shl i32 %34, 16
+ // %conv52 = ashr exact i32 %sext233, 16
+ for (auto &B : F) {
+ for (auto &I : B) {
+ // Look for arithmetic shift right by 16.
+ BinaryOperator *Ashr = dyn_cast<BinaryOperator>(&I);
+ if (!(Ashr && Ashr->getOpcode() == Instruction::AShr))
+ continue;
+ Value *AshrOp1 = Ashr->getOperand(1);
+ ConstantInt *C = dyn_cast<ConstantInt>(AshrOp1);
+ // Right shifted by 16.
+ if (!(C && C->getSExtValue() == 16))
+ continue;
+
+ // The first operand of Ashr comes from logical shift left.
+ Instruction *Shl = dyn_cast<Instruction>(Ashr->getOperand(0));
+ if (!(Shl && Shl->getOpcode() == Instruction::Shl))
+ continue;
+ Value *Intr = Shl->getOperand(0);
+ Value *ShlOp1 = Shl->getOperand(1);
+ C = dyn_cast<ConstantInt>(ShlOp1);
+ // Left shifted by 16.
+ if (!(C && C->getSExtValue() == 16))
+ continue;
+
+ // The first operand of Shl comes from an intrinsic.
+ if (IntrinsicInst *I = dyn_cast<IntrinsicInst>(Intr)) {
+ if (!intrinsicAlreadySextended(I->getIntrinsicID()))
+ continue;
+ // All is well. Replace all uses of AShr with I.
+ for (auto UI = Ashr->user_begin(), UE = Ashr->user_end();
+ UI != UE; ++UI) {
+ const Use &TheUse = UI.getUse();
+ if (Instruction *J = dyn_cast<Instruction>(TheUse.getUser())) {
+ J->replaceUsesOfWith(Ashr, I);
+ }
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+
+FunctionPass *llvm::createHexagonOptimizeSZextends() {
+ return new HexagonOptimizeSZextends();
+}
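This new pass replaces the deleted HexagonRemoveSZExtArgs.cpp further down in the patch. A hedged sketch of how such a FunctionPass is typically scheduled follows; the HexagonPassConfig hook shown is an assumption, since the corresponding HexagonTargetMachine.cpp change is not part of this hunk:

// Hedged sketch (assumed hook): wiring the pass into the backend pipeline.
void HexagonPassConfig::addIRPasses() {
  TargetPassConfig::addIRPasses();
  addPass(createHexagonOptimizeSZextends());   // registered as "reargs" above
}
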
diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp
index 93dcbe233b25..e68ff85b1da6 100644
--- a/lib/Target/Hexagon/HexagonPeephole.cpp
+++ b/lib/Target/Hexagon/HexagonPeephole.cpp
@@ -124,7 +124,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
// Loop over all of the basic blocks.
for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end();
MBBb != MBBe; ++MBBb) {
- MachineBasicBlock* MBB = MBBb;
+ MachineBasicBlock *MBB = &*MBBb;
PeepholeMap.clear();
PeepholeDoubleRegsMap.clear();
@@ -180,7 +180,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
unsigned DstReg = Dst.getReg();
unsigned SrcReg = Src1.getReg();
PeepholeDoubleRegsMap[DstReg] =
- std::make_pair(*&SrcReg, 1/*Hexagon::subreg_hireg*/);
+ std::make_pair(*&SrcReg, Hexagon::subreg_hireg);
}
// Look for P=NOT(P).
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index f6bb4a045438..61c0589fb5bf 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -66,6 +66,8 @@ HexagonRegisterInfo::getCallerSavedRegs(const MachineFunction *MF) const {
switch (HST.getHexagonArchVersion()) {
case HexagonSubtarget::V4:
case HexagonSubtarget::V5:
+ case HexagonSubtarget::V55:
+ case HexagonSubtarget::V60:
return CallerSavedRegsV4;
}
llvm_unreachable(
@@ -84,6 +86,8 @@ HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
switch (MF->getSubtarget<HexagonSubtarget>().getHexagonArchVersion()) {
case HexagonSubtarget::V4:
case HexagonSubtarget::V5:
+ case HexagonSubtarget::V55:
+ case HexagonSubtarget::V60:
return CalleeSavedRegsV3;
}
llvm_unreachable("Callee saved registers requested for unknown architecture "
@@ -98,7 +102,7 @@ BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF)
Reserved.set(Hexagon::R29);
Reserved.set(Hexagon::R30);
Reserved.set(Hexagon::R31);
- Reserved.set(Hexagon::D14);
+ Reserved.set(Hexagon::PC);
Reserved.set(Hexagon::D15);
Reserved.set(Hexagon::LC0);
Reserved.set(Hexagon::LC1);
@@ -116,62 +120,21 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
assert(SPAdj == 0 && "Unexpected");
MachineInstr &MI = *II;
-
MachineBasicBlock &MB = *MI.getParent();
MachineFunction &MF = *MB.getParent();
- MachineFrameInfo &MFI = *MF.getFrameInfo();
- auto &HST = static_cast<const HexagonSubtarget&>(MF.getSubtarget());
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
auto &HII = *HST.getInstrInfo();
auto &HFI = *HST.getFrameLowering();
+ unsigned BP = 0;
int FI = MI.getOperand(FIOp).getIndex();
- int Offset = MFI.getObjectOffset(FI) + MI.getOperand(FIOp+1).getImm();
- bool HasAlloca = MFI.hasVarSizedObjects();
- bool HasAlign = needsStackRealignment(MF);
-
- // XXX: Fixed objects cannot be accessed through SP if there are aligned
- // objects in the local frame, or if there are dynamically allocated objects.
- // In such cases, there has to be FP available.
- if (!HFI.hasFP(MF)) {
- assert(!HasAlloca && !HasAlign && "This function must have frame pointer");
- // We will not reserve space on the stack for the lr and fp registers.
- Offset -= 8;
- }
-
- unsigned SP = getStackRegister(), FP = getFrameRegister();
- unsigned AP = 0;
- if (MachineInstr *AI = HFI.getAlignaInstr(MF))
- AP = AI->getOperand(0).getReg();
- unsigned FrameSize = MFI.getStackSize();
-
- // Special handling of dbg_value instructions and INLINEASM.
- if (MI.isDebugValue() || MI.isInlineAsm()) {
- MI.getOperand(FIOp).ChangeToRegister(SP, false /*isDef*/);
- MI.getOperand(FIOp+1).ChangeToImmediate(Offset+FrameSize);
- return;
- }
-
- bool UseFP = false, UseAP = false; // Default: use SP.
- if (MFI.isFixedObjectIndex(FI) || MFI.isObjectPreAllocated(FI)) {
- UseFP = HasAlloca || HasAlign;
- } else {
- if (HasAlloca) {
- if (HasAlign)
- UseAP = true;
- else
- UseFP = true;
- }
- }
+ // Select the base pointer (BP) and calculate the actual offset from BP
+ // to the beginning of the object at index FI.
+ int Offset = HFI.getFrameIndexReference(MF, FI, BP);
+ // Add the offset from the instruction.
+ int RealOffset = Offset + MI.getOperand(FIOp+1).getImm();
unsigned Opc = MI.getOpcode();
- bool ValidSP = HII.isValidOffset(Opc, FrameSize+Offset);
- bool ValidFP = HII.isValidOffset(Opc, Offset);
-
- // Calculate the actual offset in the instruction.
- int64_t RealOffset = Offset;
- if (!UseFP && !UseAP)
- RealOffset = FrameSize+Offset;
-
switch (Opc) {
case Hexagon::TFR_FIA:
MI.setDesc(HII.get(Hexagon::A2_addi));
@@ -184,20 +147,7 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
break;
}
- unsigned BP = 0;
- bool Valid = false;
- if (UseFP) {
- BP = FP;
- Valid = ValidFP;
- } else if (UseAP) {
- BP = AP;
- Valid = ValidFP;
- } else {
- BP = SP;
- Valid = ValidSP;
- }
-
- if (Valid) {
+ if (HII.isValidOffset(Opc, RealOffset)) {
MI.getOperand(FIOp).ChangeToRegister(BP, false);
MI.getOperand(FIOp+1).ChangeToImmediate(RealOffset);
return;
@@ -223,8 +173,8 @@ unsigned HexagonRegisterInfo::getFrameRegister(const MachineFunction
&MF) const {
const HexagonFrameLowering *TFI = getFrameLowering(MF);
if (TFI->hasFP(MF))
- return Hexagon::R30;
- return Hexagon::R29;
+ return getFrameRegister();
+ return getStackRegister();
}
@@ -238,17 +188,9 @@ unsigned HexagonRegisterInfo::getStackRegister() const {
}
-bool
-HexagonRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
- const HexagonFrameLowering *TFI = getFrameLowering(MF);
- return TFI->hasFP(MF);
-}
-
-
-bool
-HexagonRegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
- const MachineFrameInfo *MFI = MF.getFrameInfo();
- return MFI->getMaxAlignment() > 8;
+bool HexagonRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF)
+ const {
+ return MF.getSubtarget<HexagonSubtarget>().getFrameLowering()->hasFP(MF);
}
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h
index 7edefee93993..db7e0f27815d 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.h
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.h
@@ -63,8 +63,6 @@ public:
return true;
}
- bool needsStackRealignment(const MachineFunction &MF) const override;
-
/// Returns true if the frame pointer is valid.
bool useFPForScavengingIndex(const MachineFunction &MF) const override;
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.td b/lib/Target/Hexagon/HexagonRegisterInfo.td
index edf1c251ac77..81629dc6d47f 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.td
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.td
@@ -53,6 +53,12 @@ let Namespace = "Hexagon" in {
let Num = num;
}
+
+ // Rq - vector predicate registers
+ class Rq<bits<3> num, string n> : Register<n, []> {
+ let HWEncoding{2-0} = num;
+ }
+
// Rc - control registers
class Rc<bits<5> num, string n,
list<string> alt = [], list<Register> alias = []> :
@@ -131,20 +137,21 @@ let Namespace = "Hexagon" in {
def LC1 : Rc<3, "lc1", ["c3"]>, DwarfRegNum<[70]>;
def P3_0 : Rc<4, "p3:0", ["c4"], [P0, P1, P2, P3]>,
DwarfRegNum<[71]>;
- def C6 : Rc<6, "c6", [], [M0]>, DwarfRegNum<[72]>;
- def C7 : Rc<7, "c7", [], [M1]>, DwarfRegNum<[73]>;
+ def C5 : Rc<5, "c5", ["c5"]>, DwarfRegNum<[72]>; // future use
+ def C6 : Rc<6, "c6", [], [M0]>, DwarfRegNum<[73]>;
+ def C7 : Rc<7, "c7", [], [M1]>, DwarfRegNum<[74]>;
- def USR : Rc<8, "usr", ["c8"]>, DwarfRegNum<[74]> {
+ def USR : Rc<8, "usr", ["c8"]>, DwarfRegNum<[75]> {
let SubRegIndices = [subreg_overflow];
let SubRegs = [USR_OVF];
}
- def PC : Rc<9, "pc">, DwarfRegNum<[75]>;
- def UGP : Rc<10, "ugp", ["c10"]>, DwarfRegNum<[76]>;
- def GP : Rc<11, "gp">, DwarfRegNum<[77]>;
- def CS0 : Rc<12, "cs0", ["c12"]>, DwarfRegNum<[78]>;
- def CS1 : Rc<13, "cs1", ["c13"]>, DwarfRegNum<[79]>;
- def UPCL : Rc<14, "upcyclelo", ["c14"]>, DwarfRegNum<[80]>;
- def UPCH : Rc<15, "upcyclehi", ["c15"]>, DwarfRegNum<[81]>;
+ def PC : Rc<9, "pc">, DwarfRegNum<[76]>;
+ def UGP : Rc<10, "ugp", ["c10"]>, DwarfRegNum<[77]>;
+ def GP : Rc<11, "gp">, DwarfRegNum<[78]>;
+ def CS0 : Rc<12, "cs0", ["c12"]>, DwarfRegNum<[79]>;
+ def CS1 : Rc<13, "cs1", ["c13"]>, DwarfRegNum<[80]>;
+ def UPCL : Rc<14, "upcyclelo", ["c14"]>, DwarfRegNum<[81]>;
+ def UPCH : Rc<15, "upcyclehi", ["c15"]>, DwarfRegNum<[82]>;
}
// Control registers pairs.
@@ -158,6 +165,36 @@ let Namespace = "Hexagon" in {
def UPC : Rcc<14, "c15:14", [UPCL, UPCH]>, DwarfRegNum<[80]>;
}
+ foreach i = 0-31 in {
+ def V#i : Ri<i, "v"#i>, DwarfRegNum<[!add(i, 99)]>;
+ }
+
+ // Aliases of the V* registers used to hold double vec values.
+ let SubRegIndices = [subreg_loreg, subreg_hireg], CoveredBySubRegs = 1 in {
+ def W0 : Rd< 0, "v1:0", [V0, V1]>, DwarfRegNum<[99]>;
+ def W1 : Rd< 2, "v3:2", [V2, V3]>, DwarfRegNum<[101]>;
+ def W2 : Rd< 4, "v5:4", [V4, V5]>, DwarfRegNum<[103]>;
+ def W3 : Rd< 6, "v7:6", [V6, V7]>, DwarfRegNum<[105]>;
+ def W4 : Rd< 8, "v9:8", [V8, V9]>, DwarfRegNum<[107]>;
+ def W5 : Rd<10, "v11:10", [V10, V11]>, DwarfRegNum<[109]>;
+ def W6 : Rd<12, "v13:12", [V12, V13]>, DwarfRegNum<[111]>;
+ def W7 : Rd<14, "v15:14", [V14, V15]>, DwarfRegNum<[113]>;
+ def W8 : Rd<16, "v17:16", [V16, V17]>, DwarfRegNum<[115]>;
+ def W9 : Rd<18, "v19:18", [V18, V19]>, DwarfRegNum<[117]>;
+ def W10 : Rd<20, "v21:20", [V20, V21]>, DwarfRegNum<[119]>;
+ def W11 : Rd<22, "v23:22", [V22, V23]>, DwarfRegNum<[121]>;
+ def W12 : Rd<24, "v25:24", [V24, V25]>, DwarfRegNum<[123]>;
+ def W13 : Rd<26, "v27:26", [V26, V27]>, DwarfRegNum<[125]>;
+ def W14 : Rd<28, "v29:28", [V28, V29]>, DwarfRegNum<[127]>;
+ def W15 : Rd<30, "v31:30", [V30, V31]>, DwarfRegNum<[129]>;
+ }
+
+ // Vector Predicate registers.
+ def Q0 : Rq<0, "q0">, DwarfRegNum<[131]>;
+ def Q1 : Rq<1, "q1">, DwarfRegNum<[132]>;
+ def Q2 : Rq<2, "q2">, DwarfRegNum<[133]>;
+ def Q3 : Rq<3, "q3">, DwarfRegNum<[134]>;
+
// Register classes.
//
// FIXME: the register order should be defined in terms of the preferred
@@ -169,10 +206,34 @@ def IntRegs : RegisterClass<"Hexagon", [i32, f32, v4i8, v2i16], 32,
R10, R11, R29, R30, R31)> {
}
+// Registers are listed in reverse order for allocation preference reasons.
+def IntRegsLow8 : RegisterClass<"Hexagon", [i32], 32,
+ (add R7, R6, R5, R4, R3, R2, R1, R0)> ;
+
def DoubleRegs : RegisterClass<"Hexagon", [i64, f64, v8i8, v4i16, v2i32], 64,
(add (sequence "D%u", 0, 4),
(sequence "D%u", 6, 13), D5, D14, D15)>;
+def VectorRegs : RegisterClass<"Hexagon", [v64i8, v32i16, v16i32, v8i64], 512,
+ (add (sequence "V%u", 0, 31))>;
+
+def VecDblRegs : RegisterClass<"Hexagon",
+ [v128i8, v64i16, v32i32, v16i64], 1024,
+ (add (sequence "W%u", 0, 15))>;
+
+def VectorRegs128B : RegisterClass<"Hexagon",
+ [v128i8, v64i16, v32i32, v16i64], 1024,
+ (add (sequence "V%u", 0, 31))>;
+
+def VecDblRegs128B : RegisterClass<"Hexagon",
+ [v256i8,v128i16,v64i32,v32i64], 2048,
+ (add (sequence "W%u", 0, 15))>;
+
+def VecPredRegs : RegisterClass<"Hexagon", [v512i1], 512,
+ (add (sequence "Q%u", 0, 3))>;
+
+def VecPredRegs128B : RegisterClass<"Hexagon", [v1024i1], 1024,
+ (add (sequence "Q%u", 0, 3))>;
def PredRegs : RegisterClass<"Hexagon",
[i1, v2i1, v4i1, v8i1, v4i8, v2i16, i32], 32,
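The HVX register classes added above (512-bit VectorRegs, 1024-bit VecDblRegs, the Q predicate registers, and their 128-byte variants) become ordinary generated TargetRegisterClass objects. A hedged illustration of how backend C++ code refers to them; the helper function and the include are assumptions, not part of this patch:

#include "HexagonRegisterInfo.h"                 // pulls in the generated register classes (assumed)
#include "llvm/CodeGen/MachineRegisterInfo.h"

// Hedged sketch: create an HVX virtual register of the requested width.
static unsigned createHvxTemp(llvm::MachineRegisterInfo &MRI, bool DoubleVector) {
  const llvm::TargetRegisterClass *RC = DoubleVector
      ? &llvm::Hexagon::VecDblRegsRegClass       // vector pairs W0-W15
      : &llvm::Hexagon::VectorRegsRegClass;      // single vectors V0-V31
  return MRI.createVirtualRegister(RC);
}
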
diff --git a/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp b/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
deleted file mode 100644
index 7069ad36e21a..000000000000
--- a/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-//===- HexagonRemoveExtendArgs.cpp - Remove unnecessary argument sign extends //
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Pass that removes sign extends for function parameters. These parameters
-// are already sign extended by the caller per Hexagon's ABI
-//
-//===----------------------------------------------------------------------===//
-
-#include "Hexagon.h"
-#include "HexagonTargetMachine.h"
-#include "llvm/CodeGen/MachineFunctionAnalysis.h"
-#include "llvm/CodeGen/StackProtector.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
-
-using namespace llvm;
-
-namespace llvm {
- FunctionPass *createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM);
- void initializeHexagonRemoveExtendArgsPass(PassRegistry&);
-}
-
-namespace {
- struct HexagonRemoveExtendArgs : public FunctionPass {
- public:
- static char ID;
- HexagonRemoveExtendArgs() : FunctionPass(ID) {
- initializeHexagonRemoveExtendArgsPass(*PassRegistry::getPassRegistry());
- }
- bool runOnFunction(Function &F) override;
-
- const char *getPassName() const override {
- return "Remove sign extends";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineFunctionAnalysis>();
- AU.addPreserved<MachineFunctionAnalysis>();
- AU.addPreserved<StackProtector>();
- FunctionPass::getAnalysisUsage(AU);
- }
- };
-}
-
-char HexagonRemoveExtendArgs::ID = 0;
-
-INITIALIZE_PASS(HexagonRemoveExtendArgs, "reargs",
- "Remove Sign and Zero Extends for Args", false, false)
-
-bool HexagonRemoveExtendArgs::runOnFunction(Function &F) {
- unsigned Idx = 1;
- for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end(); AI != AE;
- ++AI, ++Idx) {
- if (F.getAttributes().hasAttribute(Idx, Attribute::SExt)) {
- Argument* Arg = AI;
- if (!isa<PointerType>(Arg->getType())) {
- for (auto UI = Arg->user_begin(); UI != Arg->user_end();) {
- if (isa<SExtInst>(*UI)) {
- Instruction* I = cast<Instruction>(*UI);
- SExtInst* SI = new SExtInst(Arg, I->getType());
- assert (EVT::getEVT(SI->getType()) ==
- (EVT::getEVT(I->getType())));
- ++UI;
- I->replaceAllUsesWith(SI);
- Instruction* First = F.getEntryBlock().begin();
- SI->insertBefore(First);
- I->eraseFromParent();
- } else {
- ++UI;
- }
- }
- }
- }
- }
- return true;
-}
-
-
-
-FunctionPass*
-llvm::createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM) {
- return new HexagonRemoveExtendArgs();
-}
diff --git a/lib/Target/Hexagon/HexagonSchedule.td b/lib/Target/Hexagon/HexagonSchedule.td
index 528cafc2bfea..6e4987b7e4e3 100644
--- a/lib/Target/Hexagon/HexagonSchedule.td
+++ b/lib/Target/Hexagon/HexagonSchedule.td
@@ -13,6 +13,12 @@
include "HexagonScheduleV4.td"
+// V55 Machine Info +
+include "HexagonScheduleV55.td"
+
//===----------------------------------------------------------------------===//
-// V4 Machine Info -
+// V60 Machine Info -
//===----------------------------------------------------------------------===//
+
+include "HexagonScheduleV60.td"
+
diff --git a/lib/Target/Hexagon/HexagonScheduleV4.td b/lib/Target/Hexagon/HexagonScheduleV4.td
index a7d2d4724d0b..67af147b25b3 100644
--- a/lib/Target/Hexagon/HexagonScheduleV4.td
+++ b/lib/Target/Hexagon/HexagonScheduleV4.td
@@ -35,10 +35,11 @@ def SLOT_ENDLOOP: FuncUnit;
// Itinerary classes.
def PSEUDO : InstrItinClass;
-def PSEUDOM : InstrItinClass;
+def PSEUDOM : InstrItinClass;
// ALU64/M/S Instruction classes of V2 are collectively known as XTYPE in V4.
def DUPLEX : InstrItinClass;
def PREFIX : InstrItinClass;
+def COMPOUND_CJ_ARCHDEPSLOT : InstrItinClass;
def COMPOUND : InstrItinClass;
def ALU32_2op_tc_1_SLOT0123 : InstrItinClass;
@@ -58,6 +59,7 @@ def CR_tc_2early_SLOT3 : InstrItinClass;
def CR_tc_3x_SLOT23 : InstrItinClass;
def CR_tc_3x_SLOT3 : InstrItinClass;
def J_tc_2early_SLOT23 : InstrItinClass;
+def J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT : InstrItinClass;
def J_tc_2early_SLOT2 : InstrItinClass;
def LD_tc_ld_SLOT01 : InstrItinClass;
def LD_tc_ld_SLOT0 : InstrItinClass;
@@ -91,6 +93,7 @@ def V4LDST_tc_st_SLOT0 : InstrItinClass;
def V4LDST_tc_st_SLOT01 : InstrItinClass;
def J_tc_2early_SLOT0123 : InstrItinClass;
def EXTENDER_tc_1_SLOT0123 : InstrItinClass;
+def S_3op_tc_3stall_SLOT23 : InstrItinClass;
def HexagonItinerariesV4 :
diff --git a/lib/Target/Hexagon/HexagonScheduleV55.td b/lib/Target/Hexagon/HexagonScheduleV55.td
new file mode 100644
index 000000000000..d9ad25d4cd5a
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonScheduleV55.td
@@ -0,0 +1,170 @@
+//=-HexagonScheduleV55.td - HexagonV55 Scheduling Definitions -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// There are four SLOTS (four parallel pipelines) in the Hexagon V55 machine.
+// This file describes that machine information.
+
+//
+// |===========|==================================================|
+// | PIPELINE | Instruction Classes |
+// |===========|==================================================|
+// | SLOT0 | LD ST ALU32 MEMOP NV SYSTEM |
+// |-----------|--------------------------------------------------|
+// | SLOT1 | LD ST ALU32 |
+// |-----------|--------------------------------------------------|
+// | SLOT2 | XTYPE ALU32 J JR |
+// |-----------|--------------------------------------------------|
+// | SLOT3 | XTYPE ALU32 J CR |
+// |===========|==================================================|
+
+def CJ_tc_1_SLOT23 : InstrItinClass;
+def CJ_tc_2early_SLOT23 : InstrItinClass;
+def COPROC_VMEM_vtc_long_SLOT01 : InstrItinClass;
+def COPROC_VX_vtc_long_SLOT23 : InstrItinClass;
+def COPROC_VX_vtc_SLOT23 : InstrItinClass;
+def J_tc_3stall_SLOT2 : InstrItinClass;
+def MAPPING_tc_1_SLOT0123 : InstrItinClass;
+def M_tc_3stall_SLOT23 : InstrItinClass;
+def SUBINSN_tc_1_SLOT01 : InstrItinClass;
+def SUBINSN_tc_2early_SLOT0 : InstrItinClass;
+def SUBINSN_tc_2early_SLOT01 : InstrItinClass;
+def SUBINSN_tc_3stall_SLOT0 : InstrItinClass;
+def SUBINSN_tc_ld_SLOT0 : InstrItinClass;
+def SUBINSN_tc_ld_SLOT01 : InstrItinClass;
+def SUBINSN_tc_st_SLOT01 : InstrItinClass;
+
+def HexagonItinerariesV55 :
+ ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP], [], [
+ // ALU32
+ InstrItinData<ALU32_2op_tc_1_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<ALU32_2op_tc_2early_SLOT0123,
+ [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<ALU32_3op_tc_1_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<ALU32_3op_tc_2_SLOT0123 ,
+ [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<ALU32_3op_tc_2early_SLOT0123,
+ [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<ALU32_ADDI_tc_1_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+
+ // ALU64
+ InstrItinData<ALU64_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<ALU64_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<ALU64_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<ALU64_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
+
+ // CR -> System
+ InstrItinData<CR_tc_2_SLOT3 , [InstrStage<2, [SLOT3]>]>,
+ InstrItinData<CR_tc_2early_SLOT3 , [InstrStage<2, [SLOT3]>]>,
+ InstrItinData<CR_tc_3x_SLOT3 , [InstrStage<3, [SLOT3]>]>,
+
+ // Jump (conditional/unconditional/return etc)
+ InstrItinData<CR_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<CR_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<CJ_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<CJ_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<J_tc_2early_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>,
+
+ // JR
+ InstrItinData<J_tc_2early_SLOT2 , [InstrStage<2, [SLOT2]>]>,
+ InstrItinData<J_tc_3stall_SLOT2 , [InstrStage<3, [SLOT2]>]>,
+
+ // Extender
+ InstrItinData<EXTENDER_tc_1_SLOT0123,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+
+ // Load
+ InstrItinData<LD_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>,
+ InstrItinData<LD_tc_3or4stall_SLOT0, [InstrStage<3, [SLOT0]>]>,
+ InstrItinData<LD_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>,
+
+ // M
+ InstrItinData<M_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_3_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_3or4x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>,
+
+ // Store
+ InstrItinData<ST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData<ST_tc_3stall_SLOT0, [InstrStage<3, [SLOT0]>]>,
+ InstrItinData<ST_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>,
+ InstrItinData<ST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
+
+ // Subinsn
+ InstrItinData<SUBINSN_tc_2early_SLOT0, [InstrStage<2, [SLOT0]>]>,
+ InstrItinData<SUBINSN_tc_3stall_SLOT0, [InstrStage<3, [SLOT0]>]>,
+ InstrItinData<SUBINSN_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>,
+ InstrItinData<SUBINSN_tc_1_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData<SUBINSN_tc_2early_SLOT01,
+ [InstrStage<2, [SLOT0, SLOT1]>]>,
+ InstrItinData<SUBINSN_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>,
+ InstrItinData<SUBINSN_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+
+ // S
+ InstrItinData<S_2op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_2op_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_2op_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_2op_tc_3or4x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_3op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_3op_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_3op_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_3op_tc_3_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_3op_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_3op_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
+
+ // New Value Compare Jump
+ InstrItinData<NCJ_tc_3or4stall_SLOT0, [InstrStage<3, [SLOT0]>]>,
+
+ // Mem ops
+ InstrItinData<V2LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
+ InstrItinData<V2LDST_tc_ld_SLOT01 , [InstrStage<2, [SLOT0, SLOT1]>]>,
+ InstrItinData<V2LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData<V4LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
+ InstrItinData<V4LDST_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>,
+ InstrItinData<V4LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+
+ // Endloop
+ InstrItinData<J_tc_2early_SLOT0123, [InstrStage<2, [SLOT_ENDLOOP]>]>,
+
+ // Vector
+ InstrItinData<COPROC_VMEM_vtc_long_SLOT01,
+ [InstrStage<3, [SLOT0, SLOT1]>]>,
+ InstrItinData<COPROC_VX_vtc_long_SLOT23 ,
+ [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<COPROC_VX_vtc_SLOT23 ,
+ [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<MAPPING_tc_1_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+
+ // Misc
+ InstrItinData<COMPOUND_CJ_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<COMPOUND , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<DUPLEX , [InstrStage<1, [SLOT0]>]>,
+ InstrItinData<PREFIX , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<PSEUDOM, [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [SLOT2, SLOT3]>]>
+
+ ]>;
+
+def HexagonModelV55 : SchedMachineModel {
+ // Max issue per cycle == bundle width.
+ let IssueWidth = 4;
+ let Itineraries = HexagonItinerariesV55;
+ let LoadLatency = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Hexagon V55 Resource Definitions -
+//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonScheduleV60.td b/lib/Target/Hexagon/HexagonScheduleV60.td
new file mode 100644
index 000000000000..2ccff8242a47
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonScheduleV60.td
@@ -0,0 +1,310 @@
+//=-HexagonScheduleV60.td - HexagonV60 Scheduling Definitions *- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// CVI pipes from the "Hexagon Multimedia Co-Processor Extensions Arch Spec".
+def CVI_ST : FuncUnit;
+def CVI_XLANE : FuncUnit;
+def CVI_SHIFT : FuncUnit;
+def CVI_MPY0 : FuncUnit;
+def CVI_MPY1 : FuncUnit;
+def CVI_LD : FuncUnit;
+
+// Combined functional units.
+def CVI_XLSHF : FuncUnit;
+def CVI_MPY01 : FuncUnit;
+def CVI_ALL : FuncUnit;
+
+// Combined functional unit data.
+def HexagonComboFuncsV60 :
+ ComboFuncUnits<[
+ ComboFuncData<CVI_XLSHF , [CVI_XLANE, CVI_SHIFT]>,
+ ComboFuncData<CVI_MPY01 , [CVI_MPY0, CVI_MPY1]>,
+ ComboFuncData<CVI_ALL , [CVI_ST, CVI_XLANE, CVI_SHIFT,
+ CVI_MPY0, CVI_MPY1, CVI_LD]>
+ ]>;
+
+// Note: When adding additional vector scheduling classes, add the
+// corresponding methods to the class HexagonInstrInfo.
+def CVI_VA : InstrItinClass;
+def CVI_VA_DV : InstrItinClass;
+def CVI_VX_LONG : InstrItinClass;
+def CVI_VX_LATE : InstrItinClass;
+def CVI_VX : InstrItinClass;
+def CVI_VX_DV_LONG : InstrItinClass;
+def CVI_VX_DV : InstrItinClass;
+def CVI_VX_DV_SLOT2 : InstrItinClass;
+def CVI_VP : InstrItinClass;
+def CVI_VP_LONG : InstrItinClass;
+def CVI_VP_VS_EARLY : InstrItinClass;
+def CVI_VP_VS_LONG_EARLY : InstrItinClass;
+def CVI_VP_VS_LONG : InstrItinClass;
+def CVI_VP_VS : InstrItinClass;
+def CVI_VP_DV : InstrItinClass;
+def CVI_VS : InstrItinClass;
+def CVI_VINLANESAT : InstrItinClass;
+def CVI_VM_LD : InstrItinClass;
+def CVI_VM_TMP_LD : InstrItinClass;
+def CVI_VM_CUR_LD : InstrItinClass;
+def CVI_VM_VP_LDU : InstrItinClass;
+def CVI_VM_ST : InstrItinClass;
+def CVI_VM_NEW_ST : InstrItinClass;
+def CVI_VM_STU : InstrItinClass;
+def CVI_HIST : InstrItinClass;
+def CVI_VA_EXT : InstrItinClass;
+
+// There are four SLOTS (four parallel pipelines) in the Hexagon V60 machine.
+// This file describes that machine information.
+//
+// |===========|==================================================|
+// | PIPELINE | Instruction Classes |
+// |===========|==================================================|
+// | SLOT0 | LD ST ALU32 MEMOP NV SYSTEM |
+// |-----------|--------------------------------------------------|
+// | SLOT1 | LD ST ALU32 |
+// |-----------|--------------------------------------------------|
+// | SLOT2 | XTYPE ALU32 J JR |
+// |-----------|--------------------------------------------------|
+// | SLOT3 | XTYPE ALU32 J CR |
+// |===========|==================================================|
+//
+//
+// In addition to using the above SLOTS, there are also six vector pipelines
+// in the CVI co-processor in the Hexagon V60 machine.
+//
+// |=========| |=========| |=========| |=========| |=========| |=========|
+// SLOT | CVI_LD | |CVI_MPY3 | |CVI_MPY2 | |CVI_SHIFT| |CVI_XLANE| | CVI_ST |
+// ==== |=========| |=========| |=========| |=========| |=========| |=========|
+// S0-3 | | | CVI_VA | | CVI_VA | | CVI_VA | | CVI_VA | | |
+// S2-3 | | | CVI_VX | | CVI_VX | | | | | | |
+// S0-3 | | | | | | | | | CVI_VP | | |
+// S0-3 | | | | | | | CVI_VS | | | | |
+// S0-1 |(CVI_LD) | | CVI_LD | | CVI_LD | | CVI_LD | | CVI_LD | | |
+// S0-1 |(C*TMP_LD) | | | | | | | | | |
+// S01 |(C*_LDU) | | | | | | | | C*_LDU | | |
+// S0 | | | CVI_ST | | CVI_ST | | CVI_ST | | CVI_ST | |(CVI_ST) |
+// S0 | | | | | | | | | | |(C*TMP_ST)
+// S01 | | | | | | | | | VSTU | |(C*_STU) |
+// |=========| |=========| |=========| |=========| |=========| |=========|
+// |=====================| |=====================|
+// | CVI_MPY2 & CVI_MPY3 | |CVI_XLANE & CVI_SHIFT|
+// |=====================| |=====================|
+// S0-3 | CVI_VA_DV | | CVI_VA_DV |
+// S0-3 | | | CVI_VP_DV |
+// S2-3 | CVI_VX_DV | | |
+// |=====================| |=====================|
+// |=====================================================================|
+// S0-3 | CVI_HIST Histogram |
+// S0123| CVI_VA_EXT Extract |
+// |=====================================================================|
+
+def HexagonItinerariesV60 :
+ ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP,
+ CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1,
+ CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL], [], [
+ // ALU32
+ InstrItinData<ALU32_2op_tc_1_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<ALU32_2op_tc_2early_SLOT0123,
+ [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<ALU32_3op_tc_1_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<ALU32_3op_tc_2_SLOT0123 ,
+ [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<ALU32_3op_tc_2early_SLOT0123,
+ [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<ALU32_ADDI_tc_1_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+
+ // ALU64
+ InstrItinData<ALU64_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<ALU64_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<ALU64_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<ALU64_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
+
+ // CR -> System
+ InstrItinData<CR_tc_2_SLOT3 , [InstrStage<2, [SLOT3]>]>,
+ InstrItinData<CR_tc_2early_SLOT3 , [InstrStage<2, [SLOT3]>]>,
+ InstrItinData<CR_tc_3x_SLOT3 , [InstrStage<3, [SLOT3]>]>,
+
+ // Jump (conditional/unconditional/return etc)
+ InstrItinData<CR_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<CR_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<CJ_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<CJ_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<J_tc_2early_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>,
+
+ // JR
+ InstrItinData<J_tc_2early_SLOT2 , [InstrStage<2, [SLOT2]>]>,
+ InstrItinData<J_tc_3stall_SLOT2 , [InstrStage<3, [SLOT2]>]>,
+
+ // Extender
+ InstrItinData<EXTENDER_tc_1_SLOT0123, [InstrStage<1,
+ [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+
+ // Load
+ InstrItinData<LD_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>,
+ InstrItinData<LD_tc_3or4stall_SLOT0, [InstrStage<4, [SLOT0]>]>,
+ InstrItinData<LD_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>,
+
+ // M
+ InstrItinData<M_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_3_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_3or4x_SLOT23 , [InstrStage<4, [SLOT2, SLOT3]>]>,
+ InstrItinData<M_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>,
+
+ // Store
+ InstrItinData<ST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData<ST_tc_3stall_SLOT0, [InstrStage<3, [SLOT0]>]>,
+ InstrItinData<ST_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>,
+ InstrItinData<ST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
+
+ // Subinsn
+ InstrItinData<SUBINSN_tc_2early_SLOT0, [InstrStage<2, [SLOT0]>]>,
+ InstrItinData<SUBINSN_tc_3stall_SLOT0, [InstrStage<3, [SLOT0]>]>,
+ InstrItinData<SUBINSN_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>,
+ InstrItinData<SUBINSN_tc_1_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData<SUBINSN_tc_2early_SLOT01,
+ [InstrStage<2, [SLOT0, SLOT1]>]>,
+ InstrItinData<SUBINSN_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>,
+ InstrItinData<SUBINSN_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+
+ // S
+ InstrItinData<S_2op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_2op_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_2op_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
+ // The S_2op_tc_3x_SLOT23 slots are 4 cycles on v60.
+ InstrItinData<S_2op_tc_3or4x_SLOT23 , [InstrStage<4, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_3op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_3op_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_3op_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_3op_tc_3_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_3op_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<S_3op_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
+
+ // New Value Compare Jump
+ InstrItinData<NCJ_tc_3or4stall_SLOT0, [InstrStage<4, [SLOT0]>]>,
+
+ // Mem ops
+ InstrItinData<V2LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
+ InstrItinData<V2LDST_tc_ld_SLOT01 , [InstrStage<2, [SLOT0, SLOT1]>]>,
+ InstrItinData<V2LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData<V4LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
+ InstrItinData<V4LDST_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>,
+ InstrItinData<V4LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+
+ // Endloop
+ InstrItinData<J_tc_2early_SLOT0123, [InstrStage<2, [SLOT_ENDLOOP]>]>,
+
+ // Vector
+ InstrItinData<COPROC_VMEM_vtc_long_SLOT01,
+ [InstrStage<3, [SLOT0, SLOT1]>]>,
+ InstrItinData<COPROC_VX_vtc_long_SLOT23 ,
+ [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<COPROC_VX_vtc_SLOT23 ,
+ [InstrStage<3, [SLOT2, SLOT3]>]>,
+ InstrItinData<MAPPING_tc_1_SLOT0123 ,
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+
+ // Duplex and Compound
+ InstrItinData<DUPLEX , [InstrStage<1, [SLOT0]>]>,
+ InstrItinData<COMPOUND_CJ_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<COMPOUND , [InstrStage<1, [SLOT2, SLOT3]>]>,
+ // Misc
+ InstrItinData<PREFIX , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<PSEUDOM , [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [SLOT2, SLOT3]>]>,
+
+ // Latest CVI spec definitions.
+ InstrItinData<CVI_VA,[InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE,CVI_SHIFT,
+ CVI_MPY0, CVI_MPY1]>]>,
+ InstrItinData<CVI_VA_DV,
+ [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF, CVI_MPY01]>]>,
+ InstrItinData<CVI_VX_LONG, [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>]>,
+ InstrItinData<CVI_VX_LATE, [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>]>,
+ InstrItinData<CVI_VX,[InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>]>,
+ InstrItinData<CVI_VX_DV_LONG,
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>]>,
+ InstrItinData<CVI_VX_DV,
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>]>,
+ InstrItinData<CVI_VX_DV_SLOT2,
+ [InstrStage<1, [SLOT2], 0>,
+ InstrStage<1, [CVI_MPY01]>]>,
+ InstrItinData<CVI_VP, [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>]>,
+ InstrItinData<CVI_VP_LONG, [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>]>,
+ InstrItinData<CVI_VP_VS_EARLY,
+ [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>]>,
+ InstrItinData<CVI_VP_VS_LONG,
+ [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>]>,
+ InstrItinData<CVI_VP_VS,
+ [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>]>,
+ InstrItinData<CVI_VP_VS_LONG_EARLY,
+ [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>]>,
+ InstrItinData<CVI_VP_DV , [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>]>,
+ InstrItinData<CVI_VS,
+ [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>]>,
+ InstrItinData<CVI_VINLANESAT,
+ [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>]>,
+ InstrItinData<CVI_VM_LD , [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_XLANE, CVI_SHIFT,
+ CVI_MPY0, CVI_MPY1]>]>,
+ InstrItinData<CVI_VM_TMP_LD,[InstrStage<1,[SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>]>,
+ InstrItinData<CVI_VM_CUR_LD,[InstrStage<1,[SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_XLANE, CVI_SHIFT,
+ CVI_MPY0, CVI_MPY1]>]>,
+ InstrItinData<CVI_VM_VP_LDU,[InstrStage<1,[SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_XLANE]>]>,
+ InstrItinData<CVI_VM_ST , [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE, CVI_SHIFT,
+ CVI_MPY0, CVI_MPY1]>]>,
+ InstrItinData<CVI_VM_NEW_ST,[InstrStage<1,[SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>]>,
+ InstrItinData<CVI_VM_STU , [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>]>,
+ InstrItinData<CVI_HIST , [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>]>
+ ]>;
+
+def HexagonModelV60 : SchedMachineModel {
+ // Max issue per cycle == bundle width.
+ let IssueWidth = 4;
+ let Itineraries = HexagonItinerariesV60;
+ let LoadLatency = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Hexagon V60 Resource Definitions -
+//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
index 276cc69eed0f..239dbda8f27b 100644
--- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
+++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
@@ -12,12 +12,11 @@
//===----------------------------------------------------------------------===//
#include "HexagonTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
#define DEBUG_TYPE "hexagon-selectiondag-info"
-bool llvm::flag_aligned_memcpy;
-
SDValue
HexagonSelectionDAGInfo::
EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
@@ -25,15 +24,40 @@ EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) const {
- flag_aligned_memcpy = false;
- if ((Align & 0x3) == 0) {
- ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
- if (ConstantSize) {
- uint64_t SizeVal = ConstantSize->getZExtValue();
- if ((SizeVal > 32) && ((SizeVal % 8) == 0))
- flag_aligned_memcpy = true;
- }
- }
-
- return SDValue();
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ if (AlwaysInline || (Align & 0x3) != 0 || !ConstantSize)
+ return SDValue();
+
+ uint64_t SizeVal = ConstantSize->getZExtValue();
+ if (SizeVal < 32 || (SizeVal % 8) != 0)
+ return SDValue();
+
+ // Special case aligned memcpys with size >= 32 bytes and a multiple of 8.
+ //
+ const TargetLowering &TLI = *DAG.getSubtarget().getTargetLowering();
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+ Entry.Node = Dst;
+ Args.push_back(Entry);
+ Entry.Node = Src;
+ Args.push_back(Entry);
+ Entry.Node = Size;
+ Args.push_back(Entry);
+
+ const char *SpecialMemcpyName =
+ "__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes";
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
+ Type::getVoidTy(*DAG.getContext()),
+ DAG.getTargetExternalSymbol(
+ SpecialMemcpyName, TLI.getPointerTy(DAG.getDataLayout())),
+ std::move(Args), 0)
+ .setDiscardResult();
+
+ std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+ return CallResult.second;
}
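
For context, a minimal sketch of the call sites the rewritten EmitTargetCodeForMemcpy targets (illustrative host C++ only, not part of the patch; whether the special runtime routine is actually selected depends on the alignment and constant size the DAG sees at this point):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Constant size 64: at least 32 bytes, a multiple of 8, and the uint32_t
// pointers provide 4-byte alignment, so the checks above would route this
// copy to __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes.
void copy_block(uint32_t *dst, const uint32_t *src) {
  std::memcpy(dst, src, 64);
}

// Non-constant size: ConstantSize is null in the code above, so this copy
// falls back to the generic memcpy lowering.
void copy_any(char *dst, const char *src, std::size_t n) {
  std::memcpy(dst, src, n);
}
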
diff --git a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
index d3eb56f4ba0f..10fe606985dd 100644
--- a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
+++ b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
@@ -81,7 +81,7 @@ bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) {
// Loop over all of the basic blocks
for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
MBBb != MBBe; ++MBBb) {
- MachineBasicBlock* MBB = MBBb;
+ MachineBasicBlock *MBB = &*MBBb;
// Traverse the basic block
MachineBasicBlock::iterator MII = MBB->begin();
MachineBasicBlock::iterator MIE = MBB->end ();
diff --git a/lib/Target/Hexagon/HexagonSplitDouble.cpp b/lib/Target/Hexagon/HexagonSplitDouble.cpp
new file mode 100644
index 000000000000..d4e95b0d0210
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonSplitDouble.cpp
@@ -0,0 +1,1209 @@
+//===--- HexagonSplitDouble.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hsdr"
+
+#include "HexagonRegisterInfo.h"
+#include "HexagonTargetMachine.h"
+
+#include "llvm/Pass.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#include <map>
+#include <set>
+#include <vector>
+
+using namespace llvm;
+
+namespace llvm {
+ FunctionPass *createHexagonSplitDoubleRegs();
+ void initializeHexagonSplitDoubleRegsPass(PassRegistry&);
+}
+
+namespace {
+ static cl::opt<int> MaxHSDR("max-hsdr", cl::Hidden, cl::init(-1),
+ cl::desc("Maximum number of split partitions"));
+ static cl::opt<bool> MemRefsFixed("hsdr-no-mem", cl::Hidden, cl::init(true),
+ cl::desc("Do not split loads or stores"));
+
+ class HexagonSplitDoubleRegs : public MachineFunctionPass {
+ public:
+ static char ID;
+ HexagonSplitDoubleRegs() : MachineFunctionPass(ID), TRI(nullptr),
+ TII(nullptr) {
+ initializeHexagonSplitDoubleRegsPass(*PassRegistry::getPassRegistry());
+ }
+ const char *getPassName() const override {
+ return "Hexagon Split Double Registers";
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ private:
+ static const TargetRegisterClass *const DoubleRC;
+
+ const HexagonRegisterInfo *TRI;
+ const HexagonInstrInfo *TII;
+ const MachineLoopInfo *MLI;
+ MachineRegisterInfo *MRI;
+
+ typedef std::set<unsigned> USet;
+ typedef std::map<unsigned,USet> UUSetMap;
+ typedef std::pair<unsigned,unsigned> UUPair;
+ typedef std::map<unsigned,UUPair> UUPairMap;
+ typedef std::map<const MachineLoop*,USet> LoopRegMap;
+
+ bool isInduction(unsigned Reg, LoopRegMap &IRM) const;
+ bool isVolatileInstr(const MachineInstr *MI) const;
+ bool isFixedInstr(const MachineInstr *MI) const;
+ void partitionRegisters(UUSetMap &P2Rs);
+ int32_t profit(const MachineInstr *MI) const;
+ bool isProfitable(const USet &Part, LoopRegMap &IRM) const;
+
+ void collectIndRegsForLoop(const MachineLoop *L, USet &Rs);
+ void collectIndRegs(LoopRegMap &IRM);
+
+ void createHalfInstr(unsigned Opc, MachineInstr *MI,
+ const UUPairMap &PairMap, unsigned SubR);
+ void splitMemRef(MachineInstr *MI, const UUPairMap &PairMap);
+ void splitImmediate(MachineInstr *MI, const UUPairMap &PairMap);
+ void splitCombine(MachineInstr *MI, const UUPairMap &PairMap);
+ void splitExt(MachineInstr *MI, const UUPairMap &PairMap);
+ void splitShift(MachineInstr *MI, const UUPairMap &PairMap);
+ void splitAslOr(MachineInstr *MI, const UUPairMap &PairMap);
+ bool splitInstr(MachineInstr *MI, const UUPairMap &PairMap);
+ void replaceSubregUses(MachineInstr *MI, const UUPairMap &PairMap);
+ void collapseRegPairs(MachineInstr *MI, const UUPairMap &PairMap);
+ bool splitPartition(const USet &Part);
+
+ static int Counter;
+ static void dump_partition(raw_ostream&, const USet&,
+ const TargetRegisterInfo&);
+ };
+ char HexagonSplitDoubleRegs::ID;
+ int HexagonSplitDoubleRegs::Counter = 0;
+ const TargetRegisterClass *const HexagonSplitDoubleRegs::DoubleRC
+ = &Hexagon::DoubleRegsRegClass;
+}
+
+INITIALIZE_PASS(HexagonSplitDoubleRegs, "hexagon-split-double",
+ "Hexagon Split Double Registers", false, false)
+
+
+static inline uint32_t getRegState(const MachineOperand &R) {
+ assert(R.isReg());
+ return getDefRegState(R.isDef()) |
+ getImplRegState(R.isImplicit()) |
+ getKillRegState(R.isKill()) |
+ getDeadRegState(R.isDead()) |
+ getUndefRegState(R.isUndef()) |
+ getInternalReadRegState(R.isInternalRead()) |
+ (R.isDebug() ? RegState::Debug : 0);
+}
+
+
+void HexagonSplitDoubleRegs::dump_partition(raw_ostream &os,
+ const USet &Part, const TargetRegisterInfo &TRI) {
+ dbgs() << '{';
+ for (auto I : Part)
+ dbgs() << ' ' << PrintReg(I, &TRI);
+ dbgs() << " }";
+}
+
+
+bool HexagonSplitDoubleRegs::isInduction(unsigned Reg, LoopRegMap &IRM) const {
+ for (auto I : IRM) {
+ const USet &Rs = I.second;
+ if (Rs.find(Reg) != Rs.end())
+ return true;
+ }
+ return false;
+}
+
+
+bool HexagonSplitDoubleRegs::isVolatileInstr(const MachineInstr *MI) const {
+ for (auto &I : MI->memoperands())
+ if (I->isVolatile())
+ return true;
+ return false;
+}
+
+
+bool HexagonSplitDoubleRegs::isFixedInstr(const MachineInstr *MI) const {
+ if (MI->mayLoad() || MI->mayStore())
+ if (MemRefsFixed || isVolatileInstr(MI))
+ return true;
+ if (MI->isDebugValue())
+ return false;
+
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ default:
+ return true;
+
+ case TargetOpcode::PHI:
+ case TargetOpcode::COPY:
+ break;
+
+ case Hexagon::L2_loadrd_io:
+ // Not handling stack loads (only reg-based addresses).
+ if (MI->getOperand(1).isReg())
+ break;
+ return true;
+ case Hexagon::S2_storerd_io:
+ // Not handling stack stores (only reg-based addresses).
+ if (MI->getOperand(0).isReg())
+ break;
+ return true;
+ case Hexagon::L2_loadrd_pi:
+ case Hexagon::S2_storerd_pi:
+
+ case Hexagon::A2_tfrpi:
+ case Hexagon::A2_combineii:
+ case Hexagon::A4_combineir:
+ case Hexagon::A4_combineii:
+ case Hexagon::A4_combineri:
+ case Hexagon::A2_combinew:
+ case Hexagon::CONST64_Int_Real:
+
+ case Hexagon::A2_sxtw:
+
+ case Hexagon::A2_andp:
+ case Hexagon::A2_orp:
+ case Hexagon::A2_xorp:
+ case Hexagon::S2_asl_i_p_or:
+ case Hexagon::S2_asl_i_p:
+ case Hexagon::S2_asr_i_p:
+ case Hexagon::S2_lsr_i_p:
+ break;
+ }
+
+ for (auto &Op : MI->operands()) {
+ if (!Op.isReg())
+ continue;
+ unsigned R = Op.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(R))
+ return true;
+ }
+ return false;
+}
+
+
+void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) {
+ typedef std::map<unsigned,unsigned> UUMap;
+ typedef std::vector<unsigned> UVect;
+
+ unsigned NumRegs = MRI->getNumVirtRegs();
+ BitVector DoubleRegs(NumRegs);
+ for (unsigned i = 0; i < NumRegs; ++i) {
+ unsigned R = TargetRegisterInfo::index2VirtReg(i);
+ if (MRI->getRegClass(R) == DoubleRC)
+ DoubleRegs.set(i);
+ }
+
+ BitVector FixedRegs(NumRegs);
+ for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) {
+ unsigned R = TargetRegisterInfo::index2VirtReg(x);
+ MachineInstr *DefI = MRI->getVRegDef(R);
+ // In some cases a register may exist, but never be defined or used.
+ // It should never appear anywhere, but mark it as "fixed", just to be
+ // safe.
+ if (!DefI || isFixedInstr(DefI))
+ FixedRegs.set(x);
+ }
+
+ UUSetMap AssocMap;
+ for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) {
+ if (FixedRegs[x])
+ continue;
+ unsigned R = TargetRegisterInfo::index2VirtReg(x);
+ DEBUG(dbgs() << PrintReg(R, TRI) << " ~~");
+ USet &Asc = AssocMap[R];
+ for (auto U = MRI->use_nodbg_begin(R), Z = MRI->use_nodbg_end();
+ U != Z; ++U) {
+ MachineOperand &Op = *U;
+ MachineInstr *UseI = Op.getParent();
+ if (isFixedInstr(UseI))
+ continue;
+ for (unsigned i = 0, n = UseI->getNumOperands(); i < n; ++i) {
+ MachineOperand &MO = UseI->getOperand(i);
+ // Skip non-registers or registers with subregisters.
+ if (&MO == &Op || !MO.isReg() || MO.getSubReg())
+ continue;
+ unsigned T = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(T)) {
+ FixedRegs.set(x);
+ continue;
+ }
+ if (MRI->getRegClass(T) != DoubleRC)
+ continue;
+ unsigned u = TargetRegisterInfo::virtReg2Index(T);
+ if (FixedRegs[u])
+ continue;
+ DEBUG(dbgs() << ' ' << PrintReg(T, TRI));
+ Asc.insert(T);
+ // Make it symmetric.
+ AssocMap[T].insert(R);
+ }
+ }
+ DEBUG(dbgs() << '\n');
+ }
+
+ UUMap R2P;
+ unsigned NextP = 1;
+ USet Visited;
+ for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) {
+ unsigned R = TargetRegisterInfo::index2VirtReg(x);
+ if (Visited.count(R))
+ continue;
+ // Create a new partition for R.
+ unsigned ThisP = FixedRegs[x] ? 0 : NextP++;
+ UVect WorkQ;
+ WorkQ.push_back(R);
+ for (unsigned i = 0; i < WorkQ.size(); ++i) {
+ unsigned T = WorkQ[i];
+ if (Visited.count(T))
+ continue;
+ R2P[T] = ThisP;
+ Visited.insert(T);
+ // Add all registers associated with T.
+ USet &Asc = AssocMap[T];
+ for (USet::iterator J = Asc.begin(), F = Asc.end(); J != F; ++J)
+ WorkQ.push_back(*J);
+ }
+ }
+
+ for (auto I : R2P)
+ P2Rs[I.second].insert(I.first);
+}
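
A standalone sketch of the worklist-based connected-component labeling used above, with plain std containers standing in for virtual registers and the association map (assumption: only the partitioning logic is mirrored here, not the fixed-register handling):

#include <cstdio>
#include <map>
#include <set>
#include <vector>

int main() {
  // Registers 1-2-3 appear together in splittable instructions; 7 is isolated.
  std::map<unsigned, std::set<unsigned>> Assoc = {
      {1, {2}}, {2, {1, 3}}, {3, {2}}, {7, {}}};
  std::map<unsigned, unsigned> R2P;
  std::set<unsigned> Visited;
  unsigned NextP = 1;
  for (auto &KV : Assoc) {
    if (Visited.count(KV.first))
      continue;
    unsigned ThisP = NextP++;             // new partition, as in the pass
    std::vector<unsigned> WorkQ{KV.first};
    for (unsigned i = 0; i < WorkQ.size(); ++i) {
      unsigned T = WorkQ[i];
      if (Visited.count(T))
        continue;
      R2P[T] = ThisP;
      Visited.insert(T);
      for (unsigned N : Assoc[T])         // enqueue associated registers
        WorkQ.push_back(N);
    }
  }
  for (auto &KV : R2P)
    std::printf("%u -> partition %u\n", KV.first, KV.second);  // 1,2,3 -> 1; 7 -> 2
}
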
+
+
+static inline int32_t profitImm(unsigned Lo, unsigned Hi) {
+ int32_t P = 0;
+ bool LoZ1 = false, HiZ1 = false;
+ if (Lo == 0 || Lo == 0xFFFFFFFF)
+ P += 10, LoZ1 = true;
+ if (Hi == 0 || Hi == 0xFFFFFFFF)
+ P += 10, HiZ1 = true;
+ if (!LoZ1 && !HiZ1 && Lo == Hi)
+ P += 3;
+ return P;
+}
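
A standalone replica of the immediate-profit heuristic above, useful for sanity-checking the values it produces (plain host C++, not part of the patch):

#include <cassert>
#include <cstdint>

static int32_t profitImmSketch(unsigned Lo, unsigned Hi) {
  int32_t P = 0;
  bool LoZ1 = (Lo == 0 || Lo == 0xFFFFFFFF);
  bool HiZ1 = (Hi == 0 || Hi == 0xFFFFFFFF);
  if (LoZ1) P += 10;
  if (HiZ1) P += 10;
  if (!LoZ1 && !HiZ1 && Lo == Hi) P += 3;
  return P;
}

int main() {
  assert(profitImmSketch(0, 0) == 20);        // both halves are 0/-1
  assert(profitImmSketch(5, 5) == 3);         // equal, non-trivial halves
  assert(profitImmSketch(0, 0x1234) == 10);   // only the low half is trivial
}
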
+
+
+int32_t HexagonSplitDoubleRegs::profit(const MachineInstr *MI) const {
+ unsigned ImmX = 0;
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ case TargetOpcode::PHI:
+ for (const auto &Op : MI->operands())
+ if (!Op.getSubReg())
+ return 0;
+ return 10;
+ case TargetOpcode::COPY:
+ if (MI->getOperand(1).getSubReg() != 0)
+ return 10;
+ return 0;
+
+ case Hexagon::L2_loadrd_io:
+ case Hexagon::S2_storerd_io:
+ return -1;
+ case Hexagon::L2_loadrd_pi:
+ case Hexagon::S2_storerd_pi:
+ return 2;
+
+ case Hexagon::A2_tfrpi:
+ case Hexagon::CONST64_Int_Real: {
+ uint64_t D = MI->getOperand(1).getImm();
+ unsigned Lo = D & 0xFFFFFFFFULL;
+ unsigned Hi = D >> 32;
+ return profitImm(Lo, Hi);
+ }
+ case Hexagon::A2_combineii:
+ case Hexagon::A4_combineii:
+ return profitImm(MI->getOperand(1).getImm(),
+ MI->getOperand(2).getImm());
+ case Hexagon::A4_combineri:
+ ImmX++;
+ case Hexagon::A4_combineir: {
+ ImmX++;
+ int64_t V = MI->getOperand(ImmX).getImm();
+ if (V == 0 || V == -1)
+ return 10;
+ // Fall through into A2_combinew.
+ }
+ case Hexagon::A2_combinew:
+ return 2;
+
+ case Hexagon::A2_sxtw:
+ return 3;
+
+ case Hexagon::A2_andp:
+ case Hexagon::A2_orp:
+ case Hexagon::A2_xorp:
+ return 1;
+
+ case Hexagon::S2_asl_i_p_or: {
+ unsigned S = MI->getOperand(3).getImm();
+ if (S == 0 || S == 32)
+ return 10;
+ return -1;
+ }
+ case Hexagon::S2_asl_i_p:
+ case Hexagon::S2_asr_i_p:
+ case Hexagon::S2_lsr_i_p:
+ unsigned S = MI->getOperand(2).getImm();
+ if (S == 0 || S == 32)
+ return 10;
+ if (S == 16)
+ return 5;
+ if (S == 48)
+ return 7;
+ return -10;
+ }
+
+ return 0;
+}
+
+
+bool HexagonSplitDoubleRegs::isProfitable(const USet &Part, LoopRegMap &IRM)
+ const {
+ unsigned FixedNum = 0, SplitNum = 0, LoopPhiNum = 0;
+ int32_t TotalP = 0;
+
+ for (unsigned DR : Part) {
+ MachineInstr *DefI = MRI->getVRegDef(DR);
+ int32_t P = profit(DefI);
+ if (P == INT_MIN)
+ return false;
+ TotalP += P;
+ // Reduce the profitability of splitting induction registers.
+ if (isInduction(DR, IRM))
+ TotalP -= 30;
+
+ for (auto U = MRI->use_nodbg_begin(DR), W = MRI->use_nodbg_end();
+ U != W; ++U) {
+ MachineInstr *UseI = U->getParent();
+ if (isFixedInstr(UseI)) {
+ FixedNum++;
+ // Calculate the cost of generating REG_SEQUENCE instructions.
+ for (auto &Op : UseI->operands()) {
+ if (Op.isReg() && Part.count(Op.getReg()))
+ if (Op.getSubReg())
+ TotalP -= 2;
+ }
+ continue;
+ }
+ // If a register from this partition is used in a fixed instruction,
+ // and there is also a register in this partition that is used in
+ // a loop phi node, then decrease the splitting profit as this can
+ // confuse the modulo scheduler.
+ if (UseI->isPHI()) {
+ const MachineBasicBlock *PB = UseI->getParent();
+ const MachineLoop *L = MLI->getLoopFor(PB);
+ if (L && L->getHeader() == PB)
+ LoopPhiNum++;
+ }
+ // Splittable instruction.
+ SplitNum++;
+ int32_t P = profit(UseI);
+ if (P == INT_MIN)
+ return false;
+ TotalP += P;
+ }
+ }
+
+ if (FixedNum > 0 && LoopPhiNum > 0)
+ TotalP -= 20*LoopPhiNum;
+
+ DEBUG(dbgs() << "Partition profit: " << TotalP << '\n');
+ return TotalP > 0;
+}
+
+
+void HexagonSplitDoubleRegs::collectIndRegsForLoop(const MachineLoop *L,
+ USet &Rs) {
+ const MachineBasicBlock *HB = L->getHeader();
+ const MachineBasicBlock *LB = L->getLoopLatch();
+ if (!HB || !LB)
+ return;
+
+ // Examine the latch branch. Expect it to be a conditional branch to
+ // the header (either "br-cond header" or "br-cond exit; br header").
+ MachineBasicBlock *TB = 0, *FB = 0;
+ MachineBasicBlock *TmpLB = const_cast<MachineBasicBlock*>(LB);
+ SmallVector<MachineOperand,2> Cond;
+ bool BadLB = TII->AnalyzeBranch(*TmpLB, TB, FB, Cond, false);
+ // Only analyzable conditional branches. HII::AnalyzeBranch will put
+ // the branch opcode as the first element of Cond, and the predicate
+ // operand as the second.
+ if (BadLB || Cond.size() != 2)
+ return;
+ // Only simple jump-conditional (with or without negation).
+ if (!TII->PredOpcodeHasJMP_c(Cond[0].getImm()))
+ return;
+ // Must go to the header.
+ if (TB != HB && FB != HB)
+ return;
+ assert(Cond[1].isReg() && "Unexpected Cond vector from AnalyzeBranch");
+ // Expect a predicate register.
+ unsigned PR = Cond[1].getReg();
+ assert(MRI->getRegClass(PR) == &Hexagon::PredRegsRegClass);
+
+ // Get the registers on which the loop controlling compare instruction
+ // depends.
+ unsigned CmpR1 = 0, CmpR2 = 0;
+ const MachineInstr *CmpI = MRI->getVRegDef(PR);
+ while (CmpI->getOpcode() == Hexagon::C2_not)
+ CmpI = MRI->getVRegDef(CmpI->getOperand(1).getReg());
+
+ int Mask = 0, Val = 0;
+ bool OkCI = TII->analyzeCompare(CmpI, CmpR1, CmpR2, Mask, Val);
+ if (!OkCI)
+ return;
+ // Eliminate non-double input registers.
+ if (CmpR1 && MRI->getRegClass(CmpR1) != DoubleRC)
+ CmpR1 = 0;
+ if (CmpR2 && MRI->getRegClass(CmpR2) != DoubleRC)
+ CmpR2 = 0;
+ if (!CmpR1 && !CmpR2)
+ return;
+
+ // Now examine the top of the loop: the phi nodes that could poten-
+ // tially define loop induction registers. The registers defined by
+ // such a phi node would be used in a 64-bit add, which then would
+ // be used in the loop compare instruction.
+
+ // Get the set of all double registers defined by phi nodes in the
+ // loop header.
+ typedef std::vector<unsigned> UVect;
+ UVect DP;
+ for (auto &MI : *HB) {
+ if (!MI.isPHI())
+ break;
+ const MachineOperand &MD = MI.getOperand(0);
+ unsigned R = MD.getReg();
+ if (MRI->getRegClass(R) == DoubleRC)
+ DP.push_back(R);
+ }
+ if (DP.empty())
+ return;
+
+ auto NoIndOp = [this, CmpR1, CmpR2] (unsigned R) -> bool {
+ for (auto I = MRI->use_nodbg_begin(R), E = MRI->use_nodbg_end();
+ I != E; ++I) {
+ const MachineInstr *UseI = I->getParent();
+ if (UseI->getOpcode() != Hexagon::A2_addp)
+ continue;
+ // Get the output from the add. If it is one of the inputs to the
+ // loop-controlling compare instruction, then R is likely an induc-
+ // tion register.
+ unsigned T = UseI->getOperand(0).getReg();
+ if (T == CmpR1 || T == CmpR2)
+ return false;
+ }
+ return true;
+ };
+ UVect::iterator End = std::remove_if(DP.begin(), DP.end(), NoIndOp);
+ Rs.insert(DP.begin(), End);
+ Rs.insert(CmpR1);
+ Rs.insert(CmpR2);
+
+ DEBUG({
+ dbgs() << "For loop at BB#" << HB->getNumber() << " ind regs: ";
+ dump_partition(dbgs(), Rs, *TRI);
+ dbgs() << '\n';
+ });
+}
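
As a rough source-level picture of what the analysis above is looking for (illustrative only; whether the induction value actually ends up in a double register depends on earlier lowering):

// A loop whose 64-bit induction value is typically carried in a double
// register: the phi in the loop header feeds a 64-bit add (A2_addp), whose
// result is used by the compare that controls the latch branch.
long long sum_upto(long long n) {
  long long acc = 0;
  for (long long i = 0; i < n; ++i)
    acc += i;
  return acc;
}
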
+
+
+void HexagonSplitDoubleRegs::collectIndRegs(LoopRegMap &IRM) {
+ typedef std::vector<MachineLoop*> LoopVector;
+ LoopVector WorkQ;
+
+ for (auto I : *MLI)
+ WorkQ.push_back(I);
+ for (unsigned i = 0; i < WorkQ.size(); ++i) {
+ for (auto I : *WorkQ[i])
+ WorkQ.push_back(I);
+ }
+
+ USet Rs;
+ for (unsigned i = 0, n = WorkQ.size(); i < n; ++i) {
+ MachineLoop *L = WorkQ[i];
+ Rs.clear();
+ collectIndRegsForLoop(L, Rs);
+ if (!Rs.empty())
+ IRM.insert(std::make_pair(L, Rs));
+ }
+}
+
+
+void HexagonSplitDoubleRegs::createHalfInstr(unsigned Opc, MachineInstr *MI,
+ const UUPairMap &PairMap, unsigned SubR) {
+ MachineBasicBlock &B = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ MachineInstr *NewI = BuildMI(B, MI, DL, TII->get(Opc));
+
+ for (auto &Op : MI->operands()) {
+ if (!Op.isReg()) {
+ NewI->addOperand(Op);
+ continue;
+ }
+ // For register operands, set the subregister.
+ unsigned R = Op.getReg();
+ unsigned SR = Op.getSubReg();
+ bool isVirtReg = TargetRegisterInfo::isVirtualRegister(R);
+ bool isKill = Op.isKill();
+ if (isVirtReg && MRI->getRegClass(R) == DoubleRC) {
+ isKill = false;
+ UUPairMap::const_iterator F = PairMap.find(R);
+ if (F == PairMap.end()) {
+ SR = SubR;
+ } else {
+ const UUPair &P = F->second;
+ R = (SubR == Hexagon::subreg_loreg) ? P.first : P.second;
+ SR = 0;
+ }
+ }
+ auto CO = MachineOperand::CreateReg(R, Op.isDef(), Op.isImplicit(), isKill,
+ Op.isDead(), Op.isUndef(), Op.isEarlyClobber(), SR, Op.isDebug(),
+ Op.isInternalRead());
+ NewI->addOperand(CO);
+ }
+}
+
+
+void HexagonSplitDoubleRegs::splitMemRef(MachineInstr *MI,
+ const UUPairMap &PairMap) {
+ bool Load = MI->mayLoad();
+ unsigned OrigOpc = MI->getOpcode();
+ bool PostInc = (OrigOpc == Hexagon::L2_loadrd_pi ||
+ OrigOpc == Hexagon::S2_storerd_pi);
+ MachineInstr *LowI, *HighI;
+ MachineBasicBlock &B = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ // Index of the base-address-register operand.
+ unsigned AdrX = PostInc ? (Load ? 2 : 1)
+ : (Load ? 1 : 0);
+ MachineOperand &AdrOp = MI->getOperand(AdrX);
+ unsigned RSA = getRegState(AdrOp);
+ MachineOperand &ValOp = Load ? MI->getOperand(0)
+ : (PostInc ? MI->getOperand(3)
+ : MI->getOperand(2));
+ UUPairMap::const_iterator F = PairMap.find(ValOp.getReg());
+ assert(F != PairMap.end());
+
+ if (Load) {
+ const UUPair &P = F->second;
+ int64_t Off = PostInc ? 0 : MI->getOperand(2).getImm();
+ LowI = BuildMI(B, MI, DL, TII->get(Hexagon::L2_loadri_io), P.first)
+ .addReg(AdrOp.getReg(), RSA & ~RegState::Kill, AdrOp.getSubReg())
+ .addImm(Off);
+ HighI = BuildMI(B, MI, DL, TII->get(Hexagon::L2_loadri_io), P.second)
+ .addReg(AdrOp.getReg(), RSA & ~RegState::Kill, AdrOp.getSubReg())
+ .addImm(Off+4);
+ } else {
+ const UUPair &P = F->second;
+ int64_t Off = PostInc ? 0 : MI->getOperand(1).getImm();
+ LowI = BuildMI(B, MI, DL, TII->get(Hexagon::S2_storeri_io))
+ .addReg(AdrOp.getReg(), RSA & ~RegState::Kill, AdrOp.getSubReg())
+ .addImm(Off)
+ .addReg(P.first);
+ HighI = BuildMI(B, MI, DL, TII->get(Hexagon::S2_storeri_io))
+ .addReg(AdrOp.getReg(), RSA & ~RegState::Kill, AdrOp.getSubReg())
+ .addImm(Off+4)
+ .addReg(P.second);
+ }
+
+ if (PostInc) {
+ // Create the increment of the address register.
+ int64_t Inc = Load ? MI->getOperand(3).getImm()
+ : MI->getOperand(2).getImm();
+ MachineOperand &UpdOp = Load ? MI->getOperand(1) : MI->getOperand(0);
+ const TargetRegisterClass *RC = MRI->getRegClass(UpdOp.getReg());
+ unsigned NewR = MRI->createVirtualRegister(RC);
+ assert(!UpdOp.getSubReg() && "Def operand with subreg");
+ BuildMI(B, MI, DL, TII->get(Hexagon::A2_addi), NewR)
+ .addReg(AdrOp.getReg(), RSA)
+ .addImm(Inc);
+ MRI->replaceRegWith(UpdOp.getReg(), NewR);
+ // The original instruction will be deleted later.
+ }
+
+ // Generate a new pair of memory-operands.
+ MachineFunction &MF = *B.getParent();
+ for (auto &MO : MI->memoperands()) {
+ const MachinePointerInfo &Ptr = MO->getPointerInfo();
+ unsigned F = MO->getFlags();
+ int A = MO->getAlignment();
+
+ auto *Tmp1 = MF.getMachineMemOperand(Ptr, F, 4/*size*/, A);
+ LowI->addMemOperand(MF, Tmp1);
+ auto *Tmp2 = MF.getMachineMemOperand(Ptr, F, 4/*size*/, std::min(A, 4));
+ HighI->addMemOperand(MF, Tmp2);
+ }
+}
+
+
+void HexagonSplitDoubleRegs::splitImmediate(MachineInstr *MI,
+ const UUPairMap &PairMap) {
+ MachineOperand &Op0 = MI->getOperand(0);
+ MachineOperand &Op1 = MI->getOperand(1);
+ assert(Op0.isReg() && Op1.isImm());
+ uint64_t V = Op1.getImm();
+
+ MachineBasicBlock &B = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ UUPairMap::const_iterator F = PairMap.find(Op0.getReg());
+ assert(F != PairMap.end());
+ const UUPair &P = F->second;
+
+ // The operand to A2_tfrsi can only have 32 significant bits. Immediate
+ // values in MachineOperand are stored as 64-bit integers, and so the
+ // value -1 may be represented either as 64-bit -1, or 4294967295. Both
+ // will have the 32 higher bits truncated in the end, but -1 will remain
+ // as -1, while the latter may appear to be a large unsigned value
+ // requiring a constant extender. The casting to int32_t will select the
+ // former representation. (The same reasoning applies to all 32-bit
+ // values.)
+ BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), P.first)
+ .addImm(int32_t(V & 0xFFFFFFFFULL));
+ BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), P.second)
+ .addImm(int32_t(V >> 32));
+}
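
A small host-C++ check of the truncation reasoning in the comment above (assuming the usual two's-complement narrowing, which is what the comment relies on):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t V = ~0ULL;                           // the 64-bit immediate -1
  assert(int32_t(V & 0xFFFFFFFFULL) == -1);     // low half transfers as -1
  assert(int32_t(V >> 32) == -1);               // high half transfers as -1
  uint64_t W = 0x00000001FFFFFFFFULL;
  assert(int32_t(W & 0xFFFFFFFFULL) == -1);     // not 4294967295, so no extender
  assert(int32_t(W >> 32) == 1);
}
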
+
+
+void HexagonSplitDoubleRegs::splitCombine(MachineInstr *MI,
+ const UUPairMap &PairMap) {
+ MachineOperand &Op0 = MI->getOperand(0);
+ MachineOperand &Op1 = MI->getOperand(1);
+ MachineOperand &Op2 = MI->getOperand(2);
+ assert(Op0.isReg());
+
+ MachineBasicBlock &B = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ UUPairMap::const_iterator F = PairMap.find(Op0.getReg());
+ assert(F != PairMap.end());
+ const UUPair &P = F->second;
+
+ if (Op1.isImm()) {
+ BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), P.second)
+ .addImm(Op1.getImm());
+ } else if (Op1.isReg()) {
+ BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), P.second)
+ .addReg(Op1.getReg(), getRegState(Op1), Op1.getSubReg());
+ } else
+ llvm_unreachable("Unexpected operand");
+
+ if (Op2.isImm()) {
+ BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), P.first)
+ .addImm(Op2.getImm());
+ } else if (Op2.isReg()) {
+ BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), P.first)
+ .addReg(Op2.getReg(), getRegState(Op2), Op2.getSubReg());
+ } else
+ llvm_unreachable("Unexpected operand");
+}
+
+
+void HexagonSplitDoubleRegs::splitExt(MachineInstr *MI,
+ const UUPairMap &PairMap) {
+ MachineOperand &Op0 = MI->getOperand(0);
+ MachineOperand &Op1 = MI->getOperand(1);
+ assert(Op0.isReg() && Op1.isReg());
+
+ MachineBasicBlock &B = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ UUPairMap::const_iterator F = PairMap.find(Op0.getReg());
+ assert(F != PairMap.end());
+ const UUPair &P = F->second;
+ unsigned RS = getRegState(Op1);
+
+ BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), P.first)
+ .addReg(Op1.getReg(), RS & ~RegState::Kill, Op1.getSubReg());
+ BuildMI(B, MI, DL, TII->get(Hexagon::S2_asr_i_r), P.second)
+ .addReg(Op1.getReg(), RS, Op1.getSubReg())
+ .addImm(31);
+}
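
A quick standalone check of the sign-extension split above: the low half is a plain copy of the 32-bit source and the high half is its sign, obtained by an arithmetic shift right by 31 (host C++ sketch, assuming two's-complement arithmetic shift):

#include <cassert>
#include <cstdint>

int main() {
  int32_t x = -5;
  uint32_t lo = uint32_t(x);          // COPY into the low register
  uint32_t hi = uint32_t(x >> 31);    // S2_asr_i_r by 31 into the high register
  assert(((uint64_t(hi) << 32) | lo) == uint64_t(int64_t(x)));
}
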
+
+
+void HexagonSplitDoubleRegs::splitShift(MachineInstr *MI,
+ const UUPairMap &PairMap) {
+ MachineOperand &Op0 = MI->getOperand(0);
+ MachineOperand &Op1 = MI->getOperand(1);
+ MachineOperand &Op2 = MI->getOperand(2);
+ assert(Op0.isReg() && Op1.isReg() && Op2.isImm());
+ int64_t Sh64 = Op2.getImm();
+ assert(Sh64 >= 0 && Sh64 < 64);
+ unsigned S = Sh64;
+
+ UUPairMap::const_iterator F = PairMap.find(Op0.getReg());
+ assert(F != PairMap.end());
+ const UUPair &P = F->second;
+ unsigned LoR = P.first;
+ unsigned HiR = P.second;
+ using namespace Hexagon;
+
+ unsigned Opc = MI->getOpcode();
+ bool Right = (Opc == S2_lsr_i_p || Opc == S2_asr_i_p);
+ bool Left = !Right;
+ bool Signed = (Opc == S2_asr_i_p);
+
+ MachineBasicBlock &B = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ unsigned RS = getRegState(Op1);
+ unsigned ShiftOpc = Left ? S2_asl_i_r
+ : (Signed ? S2_asr_i_r : S2_lsr_i_r);
+ unsigned LoSR = subreg_loreg;
+ unsigned HiSR = subreg_hireg;
+
+ if (S == 0) {
+ // No shift, subregister copy.
+ BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), LoR)
+ .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR);
+ BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), HiR)
+ .addReg(Op1.getReg(), RS, HiSR);
+ } else if (S < 32) {
+ const TargetRegisterClass *IntRC = &IntRegsRegClass;
+ unsigned TmpR = MRI->createVirtualRegister(IntRC);
+ // Expansion:
+ // Shift left: DR = shl R, #s
+ // LoR = shl R.lo, #s
+ // TmpR = extractu R.lo, #s, #32-s
+ // HiR = or (TmpR, asl(R.hi, #s))
+ // Shift right: DR = shr R, #s
+ // HiR = shr R.hi, #s
+ // TmpR = shr R.lo, #s
+ // LoR = insert TmpR, R.hi, #s, #32-s
+
+ // Shift left:
+ // LoR = shl R.lo, #s
+ // Shift right:
+ // TmpR = shr R.lo, #s
+
+ // Make a special case for A2_aslh and A2_asrh (they are predicable as
+ // opposed to S2_asl_i_r/S2_asr_i_r).
+ if (S == 16 && Left)
+ BuildMI(B, MI, DL, TII->get(A2_aslh), LoR)
+ .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR);
+ else if (S == 16 && Signed)
+ BuildMI(B, MI, DL, TII->get(A2_asrh), TmpR)
+ .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR);
+ else
+ BuildMI(B, MI, DL, TII->get(ShiftOpc), (Left ? LoR : TmpR))
+ .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR)
+ .addImm(S);
+
+ if (Left) {
+ // TmpR = extractu R.lo, #s, #32-s
+ BuildMI(B, MI, DL, TII->get(S2_extractu), TmpR)
+ .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR)
+ .addImm(S)
+ .addImm(32-S);
+ // HiR = or (TmpR, asl(R.hi, #s))
+ BuildMI(B, MI, DL, TII->get(S2_asl_i_r_or), HiR)
+ .addReg(TmpR)
+ .addReg(Op1.getReg(), RS, HiSR)
+ .addImm(S);
+ } else {
+ // HiR = shr R.hi, #s
+ BuildMI(B, MI, DL, TII->get(ShiftOpc), HiR)
+ .addReg(Op1.getReg(), RS & ~RegState::Kill, HiSR)
+ .addImm(S);
+ // LoR = insert TmpR, R.hi, #s, #32-s
+ BuildMI(B, MI, DL, TII->get(S2_insert), LoR)
+ .addReg(TmpR)
+ .addReg(Op1.getReg(), RS, HiSR)
+ .addImm(S)
+ .addImm(32-S);
+ }
+ } else if (S == 32) {
+ BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), (Left ? HiR : LoR))
+ .addReg(Op1.getReg(), RS & ~RegState::Kill, (Left ? LoSR : HiSR));
+ if (!Signed)
+ BuildMI(B, MI, DL, TII->get(A2_tfrsi), (Left ? LoR : HiR))
+ .addImm(0);
+ else // Must be right shift.
+ BuildMI(B, MI, DL, TII->get(S2_asr_i_r), HiR)
+ .addReg(Op1.getReg(), RS, HiSR)
+ .addImm(31);
+ } else if (S < 64) {
+ S -= 32;
+ if (S == 16 && Left)
+ BuildMI(B, MI, DL, TII->get(A2_aslh), HiR)
+ .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR);
+ else if (S == 16 && Signed)
+ BuildMI(B, MI, DL, TII->get(A2_asrh), LoR)
+ .addReg(Op1.getReg(), RS & ~RegState::Kill, HiSR);
+ else
+ BuildMI(B, MI, DL, TII->get(ShiftOpc), (Left ? HiR : LoR))
+ .addReg(Op1.getReg(), RS & ~RegState::Kill, (Left ? LoSR : HiSR))
+ .addImm(S);
+
+ if (Signed)
+ BuildMI(B, MI, DL, TII->get(S2_asr_i_r), HiR)
+ .addReg(Op1.getReg(), RS, HiSR)
+ .addImm(31);
+ else
+ BuildMI(B, MI, DL, TII->get(A2_tfrsi), (Left ? LoR : HiR))
+ .addImm(0);
+ }
+}
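
The "S < 32" left-shift branch above implements the usual split of a 64-bit shift into 32-bit halves; a standalone host-C++ check of that decomposition (the right-shift branch mirrors it with S2_insert instead of S2_extractu):

#include <cassert>
#include <cstdint>

// 64-bit left shift by 0 < s < 32, expressed through the two 32-bit halves.
uint64_t shl64_by_halves(uint32_t lo, uint32_t hi, unsigned s) {
  uint32_t NewLo = lo << s;               // LoR  = shl R.lo, #s
  uint32_t Tmp   = lo >> (32 - s);        // TmpR = extractu R.lo, #s, #32-s
  uint32_t NewHi = Tmp | (hi << s);       // HiR  = or(TmpR, asl(R.hi, #s))
  return (uint64_t(NewHi) << 32) | NewLo;
}

int main() {
  uint64_t R = 0x0123456789ABCDEFULL;
  assert(shl64_by_halves(uint32_t(R), uint32_t(R >> 32), 5) == (R << 5));
}
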
+
+
+void HexagonSplitDoubleRegs::splitAslOr(MachineInstr *MI,
+ const UUPairMap &PairMap) {
+ MachineOperand &Op0 = MI->getOperand(0);
+ MachineOperand &Op1 = MI->getOperand(1);
+ MachineOperand &Op2 = MI->getOperand(2);
+ MachineOperand &Op3 = MI->getOperand(3);
+ assert(Op0.isReg() && Op1.isReg() && Op2.isReg() && Op3.isImm());
+ int64_t Sh64 = Op3.getImm();
+ assert(Sh64 >= 0 && Sh64 < 64);
+ unsigned S = Sh64;
+
+ UUPairMap::const_iterator F = PairMap.find(Op0.getReg());
+ assert(F != PairMap.end());
+ const UUPair &P = F->second;
+ unsigned LoR = P.first;
+ unsigned HiR = P.second;
+ using namespace Hexagon;
+
+ MachineBasicBlock &B = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ unsigned RS1 = getRegState(Op1);
+ unsigned RS2 = getRegState(Op2);
+ const TargetRegisterClass *IntRC = &IntRegsRegClass;
+
+ unsigned LoSR = subreg_loreg;
+ unsigned HiSR = subreg_hireg;
+
+ // Op0 = S2_asl_i_p_or Op1, Op2, Op3
+ // means: Op0 = or (Op1, asl(Op2, Op3))
+
+ // Expansion of
+ // DR = or (R1, asl(R2, #s))
+ //
+ // LoR = or (R1.lo, asl(R2.lo, #s))
+ // Tmp1 = extractu R2.lo, #s, #32-s
+ // Tmp2 = or R1.hi, Tmp1
+ // HiR = or (Tmp2, asl(R2.hi, #s))
+
+ if (S == 0) {
+ // DR = or (R1, asl(R2, #0))
+ // -> or (R1, R2)
+ // i.e. LoR = or R1.lo, R2.lo
+ // HiR = or R1.hi, R2.hi
+ BuildMI(B, MI, DL, TII->get(A2_or), LoR)
+ .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR)
+ .addReg(Op2.getReg(), RS2 & ~RegState::Kill, LoSR);
+ BuildMI(B, MI, DL, TII->get(A2_or), HiR)
+ .addReg(Op1.getReg(), RS1, HiSR)
+ .addReg(Op2.getReg(), RS2, HiSR);
+ } else if (S < 32) {
+ BuildMI(B, MI, DL, TII->get(S2_asl_i_r_or), LoR)
+ .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR)
+ .addReg(Op2.getReg(), RS2 & ~RegState::Kill, LoSR)
+ .addImm(S);
+ unsigned TmpR1 = MRI->createVirtualRegister(IntRC);
+ BuildMI(B, MI, DL, TII->get(S2_extractu), TmpR1)
+ .addReg(Op2.getReg(), RS2 & ~RegState::Kill, LoSR)
+ .addImm(S)
+ .addImm(32-S);
+ unsigned TmpR2 = MRI->createVirtualRegister(IntRC);
+ BuildMI(B, MI, DL, TII->get(A2_or), TmpR2)
+ .addReg(Op1.getReg(), RS1, HiSR)
+ .addReg(TmpR1);
+ BuildMI(B, MI, DL, TII->get(S2_asl_i_r_or), HiR)
+ .addReg(TmpR2)
+ .addReg(Op2.getReg(), RS2, HiSR)
+ .addImm(S);
+ } else if (S == 32) {
+ // DR = or (R1, asl(R2, #32))
+ // -> or R1, R2.lo
+ // LoR = R1.lo
+ // HiR = or R1.hi, R2.lo
+ BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), LoR)
+ .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR);
+ BuildMI(B, MI, DL, TII->get(A2_or), HiR)
+ .addReg(Op1.getReg(), RS1, HiSR)
+ .addReg(Op2.getReg(), RS2, LoSR);
+ } else if (S < 64) {
+ // DR = or (R1, asl(R2, #s))
+ //
+ // LoR = R1:lo
+ // HiR = or (R1:hi, asl(R2:lo, #s-32))
+ S -= 32;
+ BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), LoR)
+ .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR);
+ BuildMI(B, MI, DL, TII->get(S2_asl_i_r_or), HiR)
+ .addReg(Op1.getReg(), RS1, HiSR)
+ .addReg(Op2.getReg(), RS2, LoSR)
+ .addImm(S);
+ }
+}
+
+
+bool HexagonSplitDoubleRegs::splitInstr(MachineInstr *MI,
+ const UUPairMap &PairMap) {
+ DEBUG(dbgs() << "Splitting: " << *MI);
+ bool Split = false;
+ unsigned Opc = MI->getOpcode();
+ using namespace Hexagon;
+
+ switch (Opc) {
+ case TargetOpcode::PHI:
+ case TargetOpcode::COPY: {
+ unsigned DstR = MI->getOperand(0).getReg();
+ if (MRI->getRegClass(DstR) == DoubleRC) {
+ createHalfInstr(Opc, MI, PairMap, subreg_loreg);
+ createHalfInstr(Opc, MI, PairMap, subreg_hireg);
+ Split = true;
+ }
+ break;
+ }
+ case A2_andp:
+ createHalfInstr(A2_and, MI, PairMap, subreg_loreg);
+ createHalfInstr(A2_and, MI, PairMap, subreg_hireg);
+ Split = true;
+ break;
+ case A2_orp:
+ createHalfInstr(A2_or, MI, PairMap, subreg_loreg);
+ createHalfInstr(A2_or, MI, PairMap, subreg_hireg);
+ Split = true;
+ break;
+ case A2_xorp:
+ createHalfInstr(A2_xor, MI, PairMap, subreg_loreg);
+ createHalfInstr(A2_xor, MI, PairMap, subreg_hireg);
+ Split = true;
+ break;
+
+ case L2_loadrd_io:
+ case L2_loadrd_pi:
+ case S2_storerd_io:
+ case S2_storerd_pi:
+ splitMemRef(MI, PairMap);
+ Split = true;
+ break;
+
+ case A2_tfrpi:
+ case CONST64_Int_Real:
+ splitImmediate(MI, PairMap);
+ Split = true;
+ break;
+
+ case A2_combineii:
+ case A4_combineir:
+ case A4_combineii:
+ case A4_combineri:
+ case A2_combinew:
+ splitCombine(MI, PairMap);
+ Split = true;
+ break;
+
+ case A2_sxtw:
+ splitExt(MI, PairMap);
+ Split = true;
+ break;
+
+ case S2_asl_i_p:
+ case S2_asr_i_p:
+ case S2_lsr_i_p:
+ splitShift(MI, PairMap);
+ Split = true;
+ break;
+
+ case S2_asl_i_p_or:
+ splitAslOr(MI, PairMap);
+ Split = true;
+ break;
+
+ default:
+ llvm_unreachable("Instruction not splitable");
+ return false;
+ }
+
+ return Split;
+}
+
+
+void HexagonSplitDoubleRegs::replaceSubregUses(MachineInstr *MI,
+ const UUPairMap &PairMap) {
+ for (auto &Op : MI->operands()) {
+ if (!Op.isReg() || !Op.isUse() || !Op.getSubReg())
+ continue;
+ unsigned R = Op.getReg();
+ UUPairMap::const_iterator F = PairMap.find(R);
+ if (F == PairMap.end())
+ continue;
+ const UUPair &P = F->second;
+ switch (Op.getSubReg()) {
+ case Hexagon::subreg_loreg:
+ Op.setReg(P.first);
+ break;
+ case Hexagon::subreg_hireg:
+ Op.setReg(P.second);
+ break;
+ }
+ Op.setSubReg(0);
+ }
+}
+
+
+void HexagonSplitDoubleRegs::collapseRegPairs(MachineInstr *MI,
+ const UUPairMap &PairMap) {
+ MachineBasicBlock &B = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ for (auto &Op : MI->operands()) {
+ if (!Op.isReg() || !Op.isUse())
+ continue;
+ unsigned R = Op.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(R))
+ continue;
+ if (MRI->getRegClass(R) != DoubleRC || Op.getSubReg())
+ continue;
+ UUPairMap::const_iterator F = PairMap.find(R);
+ if (F == PairMap.end())
+ continue;
+ const UUPair &Pr = F->second;
+ unsigned NewDR = MRI->createVirtualRegister(DoubleRC);
+ BuildMI(B, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), NewDR)
+ .addReg(Pr.first)
+ .addImm(Hexagon::subreg_loreg)
+ .addReg(Pr.second)
+ .addImm(Hexagon::subreg_hireg);
+ Op.setReg(NewDR);
+ }
+}
+
+
+bool HexagonSplitDoubleRegs::splitPartition(const USet &Part) {
+ const TargetRegisterClass *IntRC = &Hexagon::IntRegsRegClass;
+ typedef std::set<MachineInstr*> MISet;
+ bool Changed = false;
+
+ DEBUG(dbgs() << "Splitting partition: "; dump_partition(dbgs(), Part, *TRI);
+ dbgs() << '\n');
+
+ UUPairMap PairMap;
+
+ MISet SplitIns;
+ for (unsigned DR : Part) {
+ MachineInstr *DefI = MRI->getVRegDef(DR);
+ SplitIns.insert(DefI);
+
+ // Collect all instructions, including fixed ones. We won't split them,
+ // but we need to visit them again to insert the REG_SEQUENCE instructions.
+ for (auto U = MRI->use_nodbg_begin(DR), W = MRI->use_nodbg_end();
+ U != W; ++U)
+ SplitIns.insert(U->getParent());
+
+ unsigned LoR = MRI->createVirtualRegister(IntRC);
+ unsigned HiR = MRI->createVirtualRegister(IntRC);
+ DEBUG(dbgs() << "Created mapping: " << PrintReg(DR, TRI) << " -> "
+ << PrintReg(HiR, TRI) << ':' << PrintReg(LoR, TRI) << '\n');
+ PairMap.insert(std::make_pair(DR, UUPair(LoR, HiR)));
+ }
+
+ MISet Erase;
+ for (auto MI : SplitIns) {
+ if (isFixedInstr(MI)) {
+ collapseRegPairs(MI, PairMap);
+ } else {
+ bool Done = splitInstr(MI, PairMap);
+ if (Done)
+ Erase.insert(MI);
+ Changed |= Done;
+ }
+ }
+
+ for (unsigned DR : Part) {
+ // Before erasing "double" instructions, revisit all uses of the double
+ // registers in this partition, and replace all uses of them with subre-
+ // gisters, with the corresponding single registers.
+ MISet Uses;
+ for (auto U = MRI->use_nodbg_begin(DR), W = MRI->use_nodbg_end();
+ U != W; ++U)
+ Uses.insert(U->getParent());
+ for (auto M : Uses)
+ replaceSubregUses(M, PairMap);
+ }
+
+ for (auto MI : Erase) {
+ MachineBasicBlock *B = MI->getParent();
+ B->erase(MI);
+ }
+
+ return Changed;
+}
+
+
+bool HexagonSplitDoubleRegs::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "Splitting double registers in function: "
+ << MF.getName() << '\n');
+
+ auto &ST = MF.getSubtarget<HexagonSubtarget>();
+ TRI = ST.getRegisterInfo();
+ TII = ST.getInstrInfo();
+ MRI = &MF.getRegInfo();
+ MLI = &getAnalysis<MachineLoopInfo>();
+
+ UUSetMap P2Rs;
+ LoopRegMap IRM;
+
+ collectIndRegs(IRM);
+ partitionRegisters(P2Rs);
+
+ DEBUG({
+ dbgs() << "Register partitioning: (partition #0 is fixed)\n";
+ for (UUSetMap::iterator I = P2Rs.begin(), E = P2Rs.end(); I != E; ++I) {
+ dbgs() << '#' << I->first << " -> ";
+ dump_partition(dbgs(), I->second, *TRI);
+ dbgs() << '\n';
+ }
+ });
+
+ bool Changed = false;
+ int Limit = MaxHSDR;
+
+ for (UUSetMap::iterator I = P2Rs.begin(), E = P2Rs.end(); I != E; ++I) {
+ if (I->first == 0)
+ continue;
+ if (Limit >= 0 && Counter >= Limit)
+ break;
+ USet &Part = I->second;
+ DEBUG(dbgs() << "Calculating profit for partition #" << I->first << '\n');
+ if (!isProfitable(Part, IRM))
+ continue;
+ Counter++;
+ Changed |= splitPartition(Part);
+ }
+
+ return Changed;
+}
+
+FunctionPass *llvm::createHexagonSplitDoubleRegs() {
+ return new HexagonSplitDoubleRegs();
+}
diff --git a/lib/Target/Hexagon/HexagonStoreWidening.cpp b/lib/Target/Hexagon/HexagonStoreWidening.cpp
new file mode 100644
index 000000000000..b5339ff4c0dc
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonStoreWidening.cpp
@@ -0,0 +1,616 @@
+//===--- HexagonStoreWidening.cpp -----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Replace sequences of "narrow" stores to adjacent memory locations with
+// fewer "wide" stores that have the same effect.
+// For example, replace:
+// S4_storeirb_io %vreg100, 0, 0 ; store-immediate-byte
+// S4_storeirb_io %vreg100, 1, 0 ; store-immediate-byte
+// with
+// S4_storeirh_io %vreg100, 0, 0 ; store-immediate-halfword
+// The above is the general idea. The actual cases handled by the code
+// may be a bit more complex.
+// The purpose of this pass is to reduce the number of outstanding stores,
+// or as one could say, "reduce store queue pressure". Also, wide stores
+// mean fewer stores, and since there are only two memory instructions allowed
+// per packet, it also means fewer packets, and ultimately fewer cycles.
+//===---------------------------------------------------------------------===//
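+//
+// Illustrative sketch of a more involved case (the values here are made up):
+// four adjacent byte-sized store-immediates, e.g. 0x12, 0x34, 0x56 and 0x78
+// at offsets 0..3, can be combined into a single word-sized store. When the
+// combined immediate does not fit a store-immediate form, the value is first
+// transferred into a register and a plain register store is used instead.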
+
+#define DEBUG_TYPE "hexagon-widen-stores"
+
+#include "HexagonTargetMachine.h"
+
+#include "llvm/PassSupport.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#include <algorithm>
+
+
+using namespace llvm;
+
+namespace llvm {
+ FunctionPass *createHexagonStoreWidening();
+ void initializeHexagonStoreWideningPass(PassRegistry&);
+}
+
+namespace {
+ struct HexagonStoreWidening : public MachineFunctionPass {
+ const HexagonInstrInfo *TII;
+ const HexagonRegisterInfo *TRI;
+ const MachineRegisterInfo *MRI;
+ AliasAnalysis *AA;
+ MachineFunction *MF;
+
+ public:
+ static char ID;
+ HexagonStoreWidening() : MachineFunctionPass(ID) {
+ initializeHexagonStoreWideningPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "Hexagon Store Widening";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ static bool handledStoreType(const MachineInstr *MI);
+
+ private:
+ static const int MaxWideSize = 4;
+
+ typedef std::vector<MachineInstr*> InstrGroup;
+ typedef std::vector<InstrGroup> InstrGroupList;
+
+ bool instrAliased(InstrGroup &Stores, const MachineMemOperand &MMO);
+ bool instrAliased(InstrGroup &Stores, const MachineInstr *MI);
+ void createStoreGroup(MachineInstr *BaseStore, InstrGroup::iterator Begin,
+ InstrGroup::iterator End, InstrGroup &Group);
+ void createStoreGroups(MachineBasicBlock &MBB,
+ InstrGroupList &StoreGroups);
+ bool processBasicBlock(MachineBasicBlock &MBB);
+ bool processStoreGroup(InstrGroup &Group);
+ bool selectStores(InstrGroup::iterator Begin, InstrGroup::iterator End,
+ InstrGroup &OG, unsigned &TotalSize, unsigned MaxSize);
+ bool createWideStores(InstrGroup &OG, InstrGroup &NG, unsigned TotalSize);
+ bool replaceStores(InstrGroup &OG, InstrGroup &NG);
+ bool storesAreAdjacent(const MachineInstr *S1, const MachineInstr *S2);
+ };
+
+} // namespace
+
+
+namespace {
+
+// Some local helper functions...
+unsigned getBaseAddressRegister(const MachineInstr *MI) {
+ const MachineOperand &MO = MI->getOperand(0);
+ assert(MO.isReg() && "Expecting register operand");
+ return MO.getReg();
+}
+
+int64_t getStoreOffset(const MachineInstr *MI) {
+ unsigned OpC = MI->getOpcode();
+ assert(HexagonStoreWidening::handledStoreType(MI) && "Unhandled opcode");
+
+ switch (OpC) {
+ case Hexagon::S4_storeirb_io:
+ case Hexagon::S4_storeirh_io:
+ case Hexagon::S4_storeiri_io: {
+ const MachineOperand &MO = MI->getOperand(1);
+ assert(MO.isImm() && "Expecting immediate offset");
+ return MO.getImm();
+ }
+ }
+ dbgs() << *MI;
+ llvm_unreachable("Store offset calculation missing for a handled opcode");
+ return 0;
+}
+
+const MachineMemOperand &getStoreTarget(const MachineInstr *MI) {
+ assert(!MI->memoperands_empty() && "Expecting memory operands");
+ return **MI->memoperands_begin();
+}
+
+} // namespace
+
+
+char HexagonStoreWidening::ID = 0;
+
+INITIALIZE_PASS_BEGIN(HexagonStoreWidening, "hexagon-widen-stores",
+ "Hexagon Store Widening", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(HexagonStoreWidening, "hexagon-widen-stores",
+ "Hexagon Store Widening", false, false)
+
+
+// Filtering function: any stores whose opcodes are not "approved" by
+// this function will not be subjected to widening.
+inline bool HexagonStoreWidening::handledStoreType(const MachineInstr *MI) {
+ // For now, only handle stores of immediate values.
+ // Also, reject stores to stack slots.
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ case Hexagon::S4_storeirb_io:
+ case Hexagon::S4_storeirh_io:
+ case Hexagon::S4_storeiri_io:
+ // Base address must be a register. (Implement FI later.)
+ return MI->getOperand(0).isReg();
+ default:
+ return false;
+ }
+}
+
+
+// Check if the machine memory operand MMO is aliased with any of the
+// stores in the store group Stores.
+bool HexagonStoreWidening::instrAliased(InstrGroup &Stores,
+ const MachineMemOperand &MMO) {
+ if (!MMO.getValue())
+ return true;
+
+ MemoryLocation L(MMO.getValue(), MMO.getSize(), MMO.getAAInfo());
+
+ for (auto SI : Stores) {
+ const MachineMemOperand &SMO = getStoreTarget(SI);
+ if (!SMO.getValue())
+ return true;
+
+ MemoryLocation SL(SMO.getValue(), SMO.getSize(), SMO.getAAInfo());
+ if (AA->alias(L, SL))
+ return true;
+ }
+
+ return false;
+}
+
+
+// Check if the machine instruction MI accesses any storage aliased with
+// any store in the group Stores.
+bool HexagonStoreWidening::instrAliased(InstrGroup &Stores,
+ const MachineInstr *MI) {
+ for (auto &I : MI->memoperands())
+ if (instrAliased(Stores, *I))
+ return true;
+ return false;
+}
+
+
+// Inspect a machine basic block, and generate store groups out of stores
+// encountered in the block.
+//
+// A store group is a group of stores that use the same base register,
+// and which can be reordered within that group without altering the
+// semantics of the program. A single store group could be widened as
+// a whole, if there existed a single store instruction with the same
+// semantics as the entire group. In many cases, a single store group
+// may need more than one wide store.
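+//
+// For example (illustrative operands), the following two stores share the
+// base register %vreg1 and could belong to the same group:
+//   S4_storeirb_io %vreg1, 0, 7
+//   S4_storeiri_io %vreg1, 4, 0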
+void HexagonStoreWidening::createStoreGroups(MachineBasicBlock &MBB,
+ InstrGroupList &StoreGroups) {
+ InstrGroup AllInsns;
+
+ // Copy all instruction pointers from the basic block to a temporary
+ // list. This will allow operating on the list, and modifying its
+ // elements without affecting the basic block.
+ for (auto &I : MBB)
+ AllInsns.push_back(&I);
+
+ // Traverse all instructions in the AllInsns list, and if we encounter
+ // a store, then try to create a store group starting at that instruction,
+ // i.e. a sequence of independent stores that can be widened.
+ for (auto I = AllInsns.begin(), E = AllInsns.end(); I != E; ++I) {
+ MachineInstr *MI = *I;
+ // Skip null pointers (processed instructions).
+ if (!MI || !handledStoreType(MI))
+ continue;
+
+ // Found a store. Try to create a store group.
+ InstrGroup G;
+ createStoreGroup(MI, I+1, E, G);
+ if (G.size() > 1)
+ StoreGroups.push_back(G);
+ }
+}
+
+
+// Create a single store group. The stores need to be independent of each
+// other, and there cannot be other instructions between them that could
+// read or modify the storage being stored into.
+void HexagonStoreWidening::createStoreGroup(MachineInstr *BaseStore,
+ InstrGroup::iterator Begin, InstrGroup::iterator End, InstrGroup &Group) {
+ assert(handledStoreType(BaseStore) && "Unexpected instruction");
+ unsigned BaseReg = getBaseAddressRegister(BaseStore);
+ InstrGroup Other;
+
+ Group.push_back(BaseStore);
+
+ for (auto I = Begin; I != End; ++I) {
+ MachineInstr *MI = *I;
+ if (!MI)
+ continue;
+
+ if (handledStoreType(MI)) {
+ // If this store instruction is aliased with anything already in the
+ // group, terminate the group now.
+ if (instrAliased(Group, getStoreTarget(MI)))
+ return;
+ // If this store is aliased to any of the memory instructions we have
+ // seen so far (that are not a part of this group), terminate the group.
+ if (instrAliased(Other, getStoreTarget(MI)))
+ return;
+
+ unsigned BR = getBaseAddressRegister(MI);
+ if (BR == BaseReg) {
+ Group.push_back(MI);
+ *I = 0;
+ continue;
+ }
+ }
+
+ // Assume calls are aliased to everything.
+ if (MI->isCall() || MI->hasUnmodeledSideEffects())
+ return;
+
+ if (MI->mayLoad() || MI->mayStore()) {
+ if (MI->hasOrderedMemoryRef() || instrAliased(Group, MI))
+ return;
+ Other.push_back(MI);
+ }
+ } // for
+}
+
+
+// Check if store instructions S1 and S2 are adjacent. More precisely,
+// S2 has to access memory immediately following that accessed by S1.
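+// For example, a byte store at offset 4 is adjacent to any handled store at
+// offset 5, and a byte store at offset -4 is adjacent to one at offset -3.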
+bool HexagonStoreWidening::storesAreAdjacent(const MachineInstr *S1,
+ const MachineInstr *S2) {
+ if (!handledStoreType(S1) || !handledStoreType(S2))
+ return false;
+
+ const MachineMemOperand &S1MO = getStoreTarget(S1);
+
+ // Currently only handling immediate stores.
+ int Off1 = S1->getOperand(1).getImm();
+ int Off2 = S2->getOperand(1).getImm();
+
+ return (Off1 >= 0) ? Off1+S1MO.getSize() == unsigned(Off2)
+ : int(Off1+S1MO.getSize()) == Off2;
+}
+
+
+/// Given a sequence of adjacent stores, and a maximum size of a single wide
+/// store, pick a group of stores that can be replaced by a single store
+/// of size not exceeding MaxSize. The selected sequence will be recorded
+/// in OG ("old group" of instructions).
+/// OG should be empty on entry, and should be left empty if the function
+/// fails.
+bool HexagonStoreWidening::selectStores(InstrGroup::iterator Begin,
+ InstrGroup::iterator End, InstrGroup &OG, unsigned &TotalSize,
+ unsigned MaxSize) {
+ assert(Begin != End && "No instructions to analyze");
+ assert(OG.empty() && "Old group not empty on entry");
+
+ if (std::distance(Begin, End) <= 1)
+ return false;
+
+ MachineInstr *FirstMI = *Begin;
+ assert(!FirstMI->memoperands_empty() && "Expecting some memory operands");
+ const MachineMemOperand &FirstMMO = getStoreTarget(FirstMI);
+ unsigned Alignment = FirstMMO.getAlignment();
+ unsigned SizeAccum = FirstMMO.getSize();
+ unsigned FirstOffset = getStoreOffset(FirstMI);
+
+ // The initial value of SizeAccum should always be a power of 2.
+ assert(isPowerOf2_32(SizeAccum) && "First store size not a power of 2");
+
+ // If the size of the first store equals to or exceeds the limit, do nothing.
+ if (SizeAccum >= MaxSize)
+ return false;
+
+ // If the size of the first store is greater than or equal to the alignment
+ // of the stored-to address, the store cannot be made any wider.
+ if (SizeAccum >= Alignment)
+ return false;
+
+ // The offset of a store will put restrictions on how wide the store can be.
+ // Offsets in stores of size 2^n bytes need to have the n lowest bits be 0.
+ // If the first store already exhausts the offset limits, quit. Test this
+ // by checking if the next wider size would exceed the limit.
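+ // For example, a byte store at an odd offset cannot be widened into a
+ // halfword store, and a halfword store at offset 2 or 6 cannot be widened
+ // into a word store.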
+ if ((2*SizeAccum-1) & FirstOffset)
+ return false;
+
+ OG.push_back(FirstMI);
+ MachineInstr *S1 = FirstMI, *S2 = *(Begin+1);
+ InstrGroup::iterator I = Begin+1;
+
+ // Pow2Num will be the largest number of elements in OG such that the sum
+ // of sizes of stores 0...Pow2Num-1 will be a power of 2.
+ unsigned Pow2Num = 1;
+ unsigned Pow2Size = SizeAccum;
+
+ // Be greedy: keep accumulating stores as long as they are to adjacent
+ // memory locations, and as long as the total number of bytes stored
+ // does not exceed the limit (MaxSize).
+ // Keep track of when the total size covered is a power of 2, since
+ // this is a size a single store can cover.
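+ // For example (illustrative), three adjacent byte stores at offsets 0..2
+ // accumulate to 3 bytes, which is not a power of 2, so only the first two
+ // stores (a 2-byte power-of-2 prefix) will be kept for widening below.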
+ while (I != End) {
+ S2 = *I;
+ // Stores are sorted, so if S1 and S2 are not adjacent, there won't be
+ // any other store to fill the "hole".
+ if (!storesAreAdjacent(S1, S2))
+ break;
+
+ unsigned S2Size = getStoreTarget(S2).getSize();
+ if (SizeAccum + S2Size > std::min(MaxSize, Alignment))
+ break;
+
+ OG.push_back(S2);
+ SizeAccum += S2Size;
+ if (isPowerOf2_32(SizeAccum)) {
+ Pow2Num = OG.size();
+ Pow2Size = SizeAccum;
+ }
+ if ((2*Pow2Size-1) & FirstOffset)
+ break;
+
+ S1 = S2;
+ ++I;
+ }
+
+ // The stores don't add up to anything that can be widened. Clean up.
+ if (Pow2Num <= 1) {
+ OG.clear();
+ return false;
+ }
+
+ // Only leave the stores being widened.
+ OG.resize(Pow2Num);
+ TotalSize = Pow2Size;
+ return true;
+}
+
+
+/// Given an "old group" OG of stores, create a "new group" NG of instructions
+/// to replace them. Ideally, NG would only have a single instruction in it,
+/// but that may only be possible for store-immediate.
+bool HexagonStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG,
+ unsigned TotalSize) {
+ // XXX Current limitations:
+ // - only expect stores of immediate values in OG,
+ // - only handle a TotalSize of up to 4.
+
+ if (TotalSize > 4)
+ return false;
+
+ unsigned Acc = 0; // Value accumulator.
+ unsigned Shift = 0;
+
+ for (InstrGroup::iterator I = OG.begin(), E = OG.end(); I != E; ++I) {
+ MachineInstr *MI = *I;
+ const MachineMemOperand &MMO = getStoreTarget(MI);
+ MachineOperand &SO = MI->getOperand(2); // Source.
+ assert(SO.isImm() && "Expecting an immediate operand");
+
+ unsigned NBits = MMO.getSize()*8;
+ unsigned Mask = (0xFFFFFFFFU >> (32-NBits));
+ unsigned Val = (SO.getImm() & Mask) << Shift;
+ Acc |= Val;
+ Shift += NBits;
+ }
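+ // Illustrative example: widening the byte store-immediates 0x12 at offset 0
+ // and 0x34 at offset 1 yields Acc = 0x3412 (little-endian accumulation),
+ // which fits a single halfword store-immediate below.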
+
+
+ MachineInstr *FirstSt = OG.front();
+ DebugLoc DL = OG.back()->getDebugLoc();
+ const MachineMemOperand &OldM = getStoreTarget(FirstSt);
+ MachineMemOperand *NewM =
+ MF->getMachineMemOperand(OldM.getPointerInfo(), OldM.getFlags(),
+ TotalSize, OldM.getAlignment(),
+ OldM.getAAInfo());
+
+ if (Acc < 0x10000) {
+ // Create mem[hw] = #Acc
+ unsigned WOpc = (TotalSize == 2) ? Hexagon::S4_storeirh_io :
+ (TotalSize == 4) ? Hexagon::S4_storeiri_io : 0;
+ assert(WOpc && "Unexpected size");
+
+ int Val = (TotalSize == 2) ? int16_t(Acc) : int(Acc);
+ const MCInstrDesc &StD = TII->get(WOpc);
+ MachineOperand &MR = FirstSt->getOperand(0);
+ int64_t Off = FirstSt->getOperand(1).getImm();
+ MachineInstr *StI = BuildMI(*MF, DL, StD)
+ .addReg(MR.getReg(), getKillRegState(MR.isKill()))
+ .addImm(Off)
+ .addImm(Val);
+ StI->addMemOperand(*MF, NewM);
+ NG.push_back(StI);
+ } else {
+ // Create vreg = A2_tfrsi #Acc; mem[hw] = vreg
+ const MCInstrDesc &TfrD = TII->get(Hexagon::A2_tfrsi);
+ const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI, *MF);
+ unsigned VReg = MF->getRegInfo().createVirtualRegister(RC);
+ MachineInstr *TfrI = BuildMI(*MF, DL, TfrD, VReg)
+ .addImm(int(Acc));
+ NG.push_back(TfrI);
+
+ unsigned WOpc = (TotalSize == 2) ? Hexagon::S2_storerh_io :
+ (TotalSize == 4) ? Hexagon::S2_storeri_io : 0;
+ assert(WOpc && "Unexpected size");
+
+ const MCInstrDesc &StD = TII->get(WOpc);
+ MachineOperand &MR = FirstSt->getOperand(0);
+ int64_t Off = FirstSt->getOperand(1).getImm();
+ MachineInstr *StI = BuildMI(*MF, DL, StD)
+ .addReg(MR.getReg(), getKillRegState(MR.isKill()))
+ .addImm(Off)
+ .addReg(VReg, RegState::Kill);
+ StI->addMemOperand(*MF, NewM);
+ NG.push_back(StI);
+ }
+
+ return true;
+}
+
+
+// Replace instructions from the old group OG with instructions from the
+// new group NG. Conceptually, remove all instructions in OG, and then
+// insert all instructions in NG, starting at where the first instruction
+// from OG was (in the order in which they appeared in the basic block).
+// (The ordering in OG does not have to match the order in the basic block.)
+bool HexagonStoreWidening::replaceStores(InstrGroup &OG, InstrGroup &NG) {
+ DEBUG({
+ dbgs() << "Replacing:\n";
+ for (auto I : OG)
+ dbgs() << " " << *I;
+ dbgs() << "with\n";
+ for (auto I : NG)
+ dbgs() << " " << *I;
+ });
+
+ MachineBasicBlock *MBB = OG.back()->getParent();
+ MachineBasicBlock::iterator InsertAt = MBB->end();
+
+ // Need to establish the insertion point: right before the store from OG
+ // that comes first in program order. Since the ordering in OG does not
+ // correspond to program order, we need to do some work to find it.
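+ // E.g., if the group's first store in program order is the block's third
+ // instruction, InsertAt is backed up to the second instruction before the
+ // old stores are erased, and advanced again afterwards so the new stores
+ // are inserted exactly where the old group began.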
+
+ // Create a set of all instructions in OG (for quick lookup).
+ SmallPtrSet<MachineInstr*, 4> InstrSet;
+ for (auto I : OG)
+ InstrSet.insert(I);
+
+ // Traverse the block, until we hit an instruction from OG.
+ for (auto &I : *MBB) {
+ if (InstrSet.count(&I)) {
+ InsertAt = I;
+ break;
+ }
+ }
+
+ assert((InsertAt != MBB->end()) && "Cannot locate any store from the group");
+
+ bool AtBBStart = false;
+
+ // InsertAt points at the first instruction that will be removed. We need
+ // to move it out of the way, so it remains valid after removing all the
+ // old stores, and so we are able to recover it back to the proper insertion
+ // position.
+ if (InsertAt != MBB->begin())
+ --InsertAt;
+ else
+ AtBBStart = true;
+
+ for (auto I : OG)
+ I->eraseFromParent();
+
+ if (!AtBBStart)
+ ++InsertAt;
+ else
+ InsertAt = MBB->begin();
+
+ for (auto I : NG)
+ MBB->insert(InsertAt, I);
+
+ return true;
+}
+
+
+// Break up the group into smaller groups, each of which can be replaced by
+// a single wide store. Widen each such smaller group and replace the old
+// instructions with the widened ones.
+bool HexagonStoreWidening::processStoreGroup(InstrGroup &Group) {
+ bool Changed = false;
+ InstrGroup::iterator I = Group.begin(), E = Group.end();
+ InstrGroup OG, NG; // Old and new groups.
+ unsigned CollectedSize;
+
+ while (I != E) {
+ OG.clear();
+ NG.clear();
+
+ bool Succ = selectStores(I++, E, OG, CollectedSize, MaxWideSize) &&
+ createWideStores(OG, NG, CollectedSize) &&
+ replaceStores(OG, NG);
+ if (!Succ)
+ continue;
+
+ assert(OG.size() > 1 && "Created invalid group");
+ assert(distance(I, E)+1 >= int(OG.size()) && "Too many elements");
+ I += OG.size()-1;
+
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+
+// Process a single basic block: create the store groups, and replace them
+// with the widened stores, if possible. Processing of each basic block
+// is independent of processing of any other basic block. This transformation
+// could be stopped after having processed any basic block without any ill
+// effects (other than not having performed widening in the unprocessed
+// blocks). Also, the basic blocks can be processed in any order.
+bool HexagonStoreWidening::processBasicBlock(MachineBasicBlock &MBB) {
+ InstrGroupList SGs;
+ bool Changed = false;
+
+ createStoreGroups(MBB, SGs);
+
+ auto Less = [] (const MachineInstr *A, const MachineInstr *B) -> bool {
+ return getStoreOffset(A) < getStoreOffset(B);
+ };
+ for (auto &G : SGs) {
+ assert(G.size() > 1 && "Store group with fewer than 2 elements");
+ std::sort(G.begin(), G.end(), Less);
+
+ Changed |= processStoreGroup(G);
+ }
+
+ return Changed;
+}
+
+
+bool HexagonStoreWidening::runOnMachineFunction(MachineFunction &MFn) {
+ MF = &MFn;
+ auto &ST = MFn.getSubtarget<HexagonSubtarget>();
+ TII = ST.getInstrInfo();
+ TRI = ST.getRegisterInfo();
+ MRI = &MFn.getRegInfo();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+ bool Changed = false;
+
+ for (auto &B : MFn)
+ Changed |= processBasicBlock(B);
+
+ return Changed;
+}
+
+
+FunctionPass *llvm::createHexagonStoreWidening() {
+ return new HexagonStoreWidening();
+}
+
diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp
index cd482b3e3af1..aa0efd4f65e0 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -16,6 +16,8 @@
#include "HexagonRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
+#include <map>
+
using namespace llvm;
#define DEBUG_TYPE "hexagon-subtarget"
@@ -24,49 +26,65 @@ using namespace llvm;
#define GET_SUBTARGETINFO_TARGET_DESC
#include "HexagonGenSubtargetInfo.inc"
-static cl::opt<bool>
-EnableV3("enable-hexagon-v3", cl::Hidden,
- cl::desc("Enable Hexagon V3 instructions."));
-
-static cl::opt<bool>
-EnableMemOps(
- "enable-hexagon-memops",
- cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(true),
- cl::desc(
- "Generate V4 MEMOP in code generation for Hexagon target"));
-
-static cl::opt<bool>
-DisableMemOps(
- "disable-hexagon-memops",
- cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(false),
- cl::desc(
- "Do not generate V4 MEMOP in code generation for Hexagon target"));
-
-static cl::opt<bool>
-EnableIEEERndNear(
- "enable-hexagon-ieee-rnd-near",
- cl::Hidden, cl::ZeroOrMore, cl::init(false),
- cl::desc("Generate non-chopped conversion from fp to int."));
+static cl::opt<bool> EnableMemOps("enable-hexagon-memops",
+ cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(true),
+ cl::desc("Generate V4 MEMOP in code generation for Hexagon target"));
+
+static cl::opt<bool> DisableMemOps("disable-hexagon-memops",
+ cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(false),
+ cl::desc("Do not generate V4 MEMOP in code generation for Hexagon target"));
+
+static cl::opt<bool> EnableIEEERndNear("enable-hexagon-ieee-rnd-near",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Generate non-chopped conversion from fp to int."));
+
+static cl::opt<bool> EnableBSBSched("enable-bsb-sched",
+ cl::Hidden, cl::ZeroOrMore, cl::init(true));
+
+static cl::opt<bool> EnableHexagonHVXDouble("enable-hexagon-hvx-double",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Enable Hexagon Double Vector eXtensions"));
+
+static cl::opt<bool> EnableHexagonHVX("enable-hexagon-hvx",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Enable Hexagon Vector eXtensions"));
static cl::opt<bool> DisableHexagonMISched("disable-hexagon-misched",
- cl::Hidden, cl::ZeroOrMore, cl::init(false),
- cl::desc("Disable Hexagon MI Scheduling"));
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Disable Hexagon MI Scheduling"));
+
+void HexagonSubtarget::initializeEnvironment() {
+ UseMemOps = false;
+ ModeIEEERndNear = false;
+ UseBSBScheduling = false;
+}
HexagonSubtarget &
HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
- // If the programmer has not specified a Hexagon version, default to -mv4.
- if (CPUString.empty())
- CPUString = "hexagonv4";
-
- if (CPUString == "hexagonv4") {
- HexagonArchVersion = V4;
- } else if (CPUString == "hexagonv5") {
- HexagonArchVersion = V5;
- } else {
+ CPUString = HEXAGON_MC::selectHexagonCPU(getTargetTriple(), CPU);
+
+ static std::map<StringRef, HexagonArchEnum> CpuTable {
+ { "hexagonv4", V4 },
+ { "hexagonv5", V5 },
+ { "hexagonv55", V55 },
+ { "hexagonv60", V60 },
+ };
+
+ auto foundIt = CpuTable.find(CPUString);
+ if (foundIt != CpuTable.end())
+ HexagonArchVersion = foundIt->second;
+ else
llvm_unreachable("Unrecognized Hexagon processor version");
- }
+ UseHVXOps = false;
+ UseHVXDblOps = false;
ParseSubtargetFeatures(CPUString, FS);
+
+ if (EnableHexagonHVX.getPosition())
+ UseHVXOps = EnableHexagonHVX;
+ if (EnableHexagonHVXDouble.getPosition())
+ UseHVXDblOps = EnableHexagonHVXDouble;
+
return *this;
}
@@ -76,6 +94,8 @@ HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU,
InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
FrameLowering() {
+ initializeEnvironment();
+
// Initialize scheduling itinerary for the specified CPU.
InstrItins = getInstrItineraryForCPU(CPUString);
@@ -91,6 +111,8 @@ HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU,
ModeIEEERndNear = true;
else
ModeIEEERndNear = false;
+
+ UseBSBScheduling = hasV60TOps() && EnableBSBSched;
}
// Pin the vtable to this file.
diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h
index 34cdad786f82..c7ae139c4346 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/lib/Target/Hexagon/HexagonSubtarget.h
@@ -34,15 +34,19 @@ namespace llvm {
class HexagonSubtarget : public HexagonGenSubtargetInfo {
virtual void anchor();
- bool UseMemOps;
+ bool UseMemOps, UseHVXOps, UseHVXDblOps;
bool ModeIEEERndNear;
public:
enum HexagonArchEnum {
- V4, V5
+ V4, V5, V55, V60
};
HexagonArchEnum HexagonArchVersion;
+ /// True if the target should use Back-Skip-Back scheduling. This is the
+ /// default for V60.
+ bool UseBSBScheduling;
+
private:
std::string CPUString;
HexagonInstrInfo InstrInfo;
@@ -50,6 +54,7 @@ private:
HexagonSelectionDAGInfo TSInfo;
HexagonFrameLowering FrameLowering;
InstrItineraryData InstrItins;
+ void initializeEnvironment();
public:
HexagonSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
@@ -84,7 +89,16 @@ public:
bool useMemOps() const { return UseMemOps; }
bool hasV5TOps() const { return getHexagonArchVersion() >= V5; }
bool hasV5TOpsOnly() const { return getHexagonArchVersion() == V5; }
+ bool hasV55TOps() const { return getHexagonArchVersion() >= V55; }
+ bool hasV55TOpsOnly() const { return getHexagonArchVersion() == V55; }
+ bool hasV60TOps() const { return getHexagonArchVersion() >= V60; }
+ bool hasV60TOpsOnly() const { return getHexagonArchVersion() == V60; }
bool modeIEEERndNear() const { return ModeIEEERndNear; }
+ bool useHVXOps() const { return UseHVXOps; }
+ bool useHVXDblOps() const { return UseHVXOps && UseHVXDblOps; }
+ bool useHVXSglOps() const { return UseHVXOps && !UseHVXDblOps; }
+
+ bool useBSBScheduling() const { return UseBSBScheduling; }
bool enableMachineScheduler() const override;
// Always use the TargetLowering default scheduler.
// FIXME: This will use the vliw scheduler which is probably just hurting
@@ -98,7 +112,7 @@ public:
return Hexagon_SMALL_DATA_THRESHOLD;
}
const HexagonArchEnum &getHexagonArchVersion() const {
- return HexagonArchVersion;
+ return HexagonArchVersion;
}
};
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index b50442969a29..9dccd696c989 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -16,12 +16,12 @@
#include "HexagonISelLowering.h"
#include "HexagonMachineScheduler.h"
#include "HexagonTargetObjectFile.h"
+#include "HexagonTargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
using namespace llvm;
@@ -33,10 +33,16 @@ static cl::opt<bool> DisableHexagonCFGOpt("disable-hexagon-cfgopt",
cl::Hidden, cl::ZeroOrMore, cl::init(false),
cl::desc("Disable Hexagon CFG Optimization"));
+static cl::opt<bool> DisableStoreWidening("disable-store-widen",
+ cl::Hidden, cl::init(false), cl::desc("Disable store widening"));
+
static cl::opt<bool> EnableExpandCondsets("hexagon-expand-condsets",
cl::init(true), cl::Hidden, cl::ZeroOrMore,
cl::desc("Early expansion of MUX"));
+static cl::opt<bool> EnableEarlyIf("hexagon-eif", cl::init(true), cl::Hidden,
+ cl::ZeroOrMore, cl::desc("Enable early if-conversion"));
+
static cl::opt<bool> EnableGenInsert("hexagon-insert", cl::init(true),
cl::Hidden, cl::desc("Generate \"insert\" instructions"));
@@ -46,10 +52,22 @@ static cl::opt<bool> EnableCommGEP("hexagon-commgep", cl::init(true),
static cl::opt<bool> EnableGenExtract("hexagon-extract", cl::init(true),
cl::Hidden, cl::desc("Generate \"extract\" instructions"));
+static cl::opt<bool> EnableGenMux("hexagon-mux", cl::init(true), cl::Hidden,
+ cl::desc("Enable converting conditional transfers into MUX instructions"));
+
static cl::opt<bool> EnableGenPred("hexagon-gen-pred", cl::init(true),
cl::Hidden, cl::desc("Enable conversion of arithmetic operations to "
"predicate instructions"));
+static cl::opt<bool> DisableHSDR("disable-hsdr", cl::init(false), cl::Hidden,
+ cl::desc("Disable splitting double registers"));
+
+static cl::opt<bool> EnableBitSimplify("hexagon-bit", cl::init(true),
+ cl::Hidden, cl::desc("Bit simplification"));
+
+static cl::opt<bool> EnableLoopResched("hexagon-loop-resched", cl::init(true),
+ cl::Hidden, cl::desc("Loop rescheduling"));
+
/// HexagonTargetMachineModule - Note that this is used on hosts that
/// cannot link in a library unless there are references into the
/// library. In particular, it seems that it is not possible to get
@@ -72,23 +90,30 @@ SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler",
createVLIWMachineSched);
namespace llvm {
+ FunctionPass *createHexagonBitSimplify();
+ FunctionPass *createHexagonCallFrameInformation();
FunctionPass *createHexagonCFGOptimizer();
FunctionPass *createHexagonCommonGEP();
FunctionPass *createHexagonCopyToCombine();
+ FunctionPass *createHexagonEarlyIfConversion();
FunctionPass *createHexagonExpandCondsets();
FunctionPass *createHexagonExpandPredSpillCode();
FunctionPass *createHexagonFixupHwLoops();
FunctionPass *createHexagonGenExtract();
FunctionPass *createHexagonGenInsert();
+ FunctionPass *createHexagonGenMux();
FunctionPass *createHexagonGenPredicate();
FunctionPass *createHexagonHardwareLoops();
FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM,
CodeGenOpt::Level OptLevel);
+ FunctionPass *createHexagonLoopRescheduling();
FunctionPass *createHexagonNewValueJump();
+ FunctionPass *createHexagonOptimizeSZextends();
FunctionPass *createHexagonPacketizer();
FunctionPass *createHexagonPeephole();
- FunctionPass *createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM);
FunctionPass *createHexagonSplitConst32AndConst64();
+ FunctionPass *createHexagonSplitDoubleRegs();
+ FunctionPass *createHexagonStoreWidening();
} // end namespace llvm;
/// HexagonTargetMachine ctor - Create an ILP32 architecture model.
@@ -101,13 +126,46 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL)
- : LLVMTargetMachine(T, "e-m:e-p:32:32-i1:32-i64:64-a:0-n32", TT, CPU, FS,
- Options, RM, CM, OL),
- TLOF(make_unique<HexagonTargetObjectFile>()),
- Subtarget(TT, CPU, FS, *this) {
- initAsmInfo();
+ : LLVMTargetMachine(T, "e-m:e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-"
+ "i1:8:8-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a:0-"
+ "n16:32", TT, CPU, FS, Options, RM, CM, OL),
+ TLOF(make_unique<HexagonTargetObjectFile>()) {
+ initAsmInfo();
+}
+
+const HexagonSubtarget *
+HexagonTargetMachine::getSubtargetImpl(const Function &F) const {
+ AttributeSet FnAttrs = F.getAttributes();
+ Attribute CPUAttr =
+ FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu");
+ Attribute FSAttr =
+ FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features");
+
+ std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
+ ? CPUAttr.getValueAsString().str()
+ : TargetCPU;
+ std::string FS = !FSAttr.hasAttribute(Attribute::None)
+ ? FSAttr.getValueAsString().str()
+ : TargetFS;
+
+ auto &I = SubtargetMap[CPU + FS];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = llvm::make_unique<HexagonSubtarget>(TargetTriple, CPU, FS, *this);
+ }
+ return I.get();
+}
+
+TargetIRAnalysis HexagonTargetMachine::getTargetIRAnalysis() {
+ return TargetIRAnalysis([this](const Function &F) {
+ return TargetTransformInfo(HexagonTTIImpl(this, F));
+ });
}
+
HexagonTargetMachine::~HexagonTargetMachine() {}
namespace {
@@ -166,7 +224,7 @@ bool HexagonPassConfig::addInstSelector() {
bool NoOpt = (getOptLevel() == CodeGenOpt::None);
if (!NoOpt)
- addPass(createHexagonRemoveExtendArgs(TM));
+ addPass(createHexagonOptimizeSZextends());
addPass(createHexagonISelDag(TM, getOptLevel()));
@@ -174,19 +232,33 @@ bool HexagonPassConfig::addInstSelector() {
// Create logical operations on predicate registers.
if (EnableGenPred)
addPass(createHexagonGenPredicate(), false);
+ // Rotate loops to expose bit-simplification opportunities.
+ if (EnableLoopResched)
+ addPass(createHexagonLoopRescheduling(), false);
+ // Split double registers.
+ if (!DisableHSDR)
+ addPass(createHexagonSplitDoubleRegs());
+ // Bit simplification.
+ if (EnableBitSimplify)
+ addPass(createHexagonBitSimplify(), false);
addPass(createHexagonPeephole());
printAndVerify("After hexagon peephole pass");
if (EnableGenInsert)
addPass(createHexagonGenInsert(), false);
+ if (EnableEarlyIf)
+ addPass(createHexagonEarlyIfConversion(), false);
}
return false;
}
void HexagonPassConfig::addPreRegAlloc() {
- if (getOptLevel() != CodeGenOpt::None)
+ if (getOptLevel() != CodeGenOpt::None) {
+ if (!DisableStoreWidening)
+ addPass(createHexagonStoreWidening(), false);
if (!DisableHardwareLoops)
addPass(createHexagonHardwareLoops(), false);
+ }
}
void HexagonPassConfig::addPostRegAlloc() {
@@ -215,6 +287,13 @@ void HexagonPassConfig::addPreEmitPass() {
if (!NoOpt) {
if (!DisableHardwareLoops)
addPass(createHexagonFixupHwLoops(), false);
+ // Generate MUX from pairs of conditional transfers.
+ if (EnableGenMux)
+ addPass(createHexagonGenMux(), false);
+
addPass(createHexagonPacketizer(), false);
}
+
+ // Add CFI instructions if necessary.
+ addPass(createHexagonCallFrameInformation(), false);
}
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.h b/lib/Target/Hexagon/HexagonTargetMachine.h
index 115eadb98c33..968814b3ea32 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.h
+++ b/lib/Target/Hexagon/HexagonTargetMachine.h
@@ -16,6 +16,7 @@
#include "HexagonInstrInfo.h"
#include "HexagonSubtarget.h"
+#include "HexagonTargetObjectFile.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
@@ -24,7 +25,7 @@ class Module;
class HexagonTargetMachine : public LLVMTargetMachine {
std::unique_ptr<TargetLoweringObjectFile> TLOF;
- HexagonSubtarget Subtarget;
+ mutable StringMap<std::unique_ptr<HexagonSubtarget>> SubtargetMap;
public:
HexagonTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
@@ -32,20 +33,18 @@ public:
Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
~HexagonTargetMachine() override;
- const HexagonSubtarget *getSubtargetImpl(const Function &) const override {
- return &Subtarget;
- }
+ const HexagonSubtarget *getSubtargetImpl(const Function &F) const override;
+
static unsigned getModuleMatchQuality(const Module &M);
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+ TargetIRAnalysis getTargetIRAnalysis() override;
- TargetLoweringObjectFile *getObjFileLowering() const override {
- return TLOF.get();
+ HexagonTargetObjectFile *getObjFileLowering() const override {
+ return static_cast<HexagonTargetObjectFile*>(TLOF.get());
}
};
-extern bool flag_aligned_memcpy;
-
} // end namespace llvm
#endif
diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index 4ea0e0d11998..ccca62021f5b 100644
--- a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -73,9 +73,10 @@ IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
if (!GVA)
return false;
- if (Kind.isBSS() || Kind.isDataNoRel() || Kind.isCommon()) {
+ if (Kind.isBSS() || Kind.isData() || Kind.isCommon()) {
Type *Ty = GV->getType()->getElementType();
- return IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(Ty));
+ return IsInSmallSection(
+ GV->getParent()->getDataLayout().getTypeAllocSize(Ty));
}
return false;
@@ -89,7 +90,7 @@ HexagonTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
// Handle Small Section classification here.
if (Kind.isBSS() && IsGlobalInSmallSection(GV, TM, Kind))
return SmallBSSSection;
- if (Kind.isDataNoRel() && IsGlobalInSmallSection(GV, TM, Kind))
+ if (Kind.isData() && IsGlobalInSmallSection(GV, TM, Kind))
return SmallDataSection;
// Otherwise, we work the same as ELF.
diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
new file mode 100644
index 000000000000..a05443eb83b8
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -0,0 +1,38 @@
+//===-- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// Hexagon target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#include "HexagonTargetTransformInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagontti"
+
+TargetTransformInfo::PopcntSupportKind
+HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const {
+ // Return Fast Hardware support as every input < 64 bits will be promoted
+ // to 64 bits.
+ return TargetTransformInfo::PSK_FastHardware;
+}
+
+// The Hexagon target can unroll loops with run-time trip counts.
+void HexagonTTIImpl::getUnrollingPreferences(Loop *L,
+ TTI::UnrollingPreferences &UP) {
+ UP.Runtime = UP.Partial = true;
+}
+
+unsigned HexagonTTIImpl::getNumberOfRegisters(bool vector) const {
+ return vector ? 0 : 32;
+}
diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/lib/Target/Hexagon/HexagonTargetTransformInfo.h
new file mode 100644
index 000000000000..71ae17a19e5f
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -0,0 +1,70 @@
+//===-- HexagonTargetTransformInfo.h - Hexagon specific TTI pass ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// Hexagon target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETTRANSFORMINFO_H
+
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+class HexagonTTIImpl : public BasicTTIImplBase<HexagonTTIImpl> {
+ typedef BasicTTIImplBase<HexagonTTIImpl> BaseT;
+ typedef TargetTransformInfo TTI;
+ friend BaseT;
+
+ const HexagonSubtarget *ST;
+ const HexagonTargetLowering *TLI;
+
+ const HexagonSubtarget *getST() const { return ST; }
+ const HexagonTargetLowering *getTLI() const { return TLI; }
+
+public:
+ explicit HexagonTTIImpl(const HexagonTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+ TLI(ST->getTargetLowering()) {}
+
+ // Provide value semantics. MSVC requires that we spell all of these out.
+ HexagonTTIImpl(const HexagonTTIImpl &Arg)
+ : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {}
+ HexagonTTIImpl(HexagonTTIImpl &&Arg)
+ : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
+ TLI(std::move(Arg.TLI)) {}
+
+ /// \name Scalar TTI Implementations
+ /// @{
+
+ TTI::PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
+
+ // The Hexagon target can unroll loops with run-time trip counts.
+ void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+
+ /// @}
+
+ /// \name Vector TTI Implementations
+ /// @{
+
+ unsigned getNumberOfRegisters(bool vector) const;
+
+ /// @}
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index b91a3f6f8c6c..81850548bb6e 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -16,35 +16,19 @@
// prune the dependence.
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/DFAPacketizer.h"
-#include "Hexagon.h"
-#include "HexagonMachineFunctionInfo.h"
#include "HexagonRegisterInfo.h"
#include "HexagonSubtarget.h"
#include "HexagonTargetMachine.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "HexagonVLIWPacketizer.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/ScheduleDAG.h"
-#include "llvm/CodeGen/ScheduleDAGInstrs.h"
-#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
-#include "llvm/CodeGen/SchedulerRegistry.h"
-#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
#include <map>
#include <vector>
@@ -52,9 +36,22 @@ using namespace llvm;
#define DEBUG_TYPE "packets"
+static cl::opt<bool> DisablePacketizer("disable-packetizer", cl::Hidden,
+ cl::ZeroOrMore, cl::init(false),
+ cl::desc("Disable Hexagon packetizer pass"));
+
static cl::opt<bool> PacketizeVolatiles("hexagon-packetize-volatiles",
- cl::ZeroOrMore, cl::Hidden, cl::init(true),
- cl::desc("Allow non-solo packetization of volatile memory references"));
+ cl::ZeroOrMore, cl::Hidden, cl::init(true),
+ cl::desc("Allow non-solo packetization of volatile memory references"));
+
+static cl::opt<bool> EnableGenAllInsnClass("enable-gen-insn", cl::init(false),
+ cl::Hidden, cl::ZeroOrMore, cl::desc("Generate all instructions with TC"));
+
+static cl::opt<bool> DisableVecDblNVStores("disable-vecdbl-nv-stores",
+ cl::init(false), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Disable vector double new-value-stores"));
+
+extern cl::opt<bool> ScheduleInlineAsm;
namespace llvm {
FunctionPass *createHexagonPacketizer();
@@ -64,7 +61,6 @@ namespace llvm {
namespace {
class HexagonPacketizer : public MachineFunctionPass {
-
public:
static char ID;
HexagonPacketizer() : MachineFunctionPass(ID) {
@@ -73,103 +69,25 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<MachineBranchProbabilityInfo>();
- AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineDominatorTree>();
AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineDominatorTree>();
AU.addPreserved<MachineLoopInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
}
-
const char *getPassName() const override {
return "Hexagon Packetizer";
}
-
bool runOnMachineFunction(MachineFunction &Fn) override;
- };
- char HexagonPacketizer::ID = 0;
-
- class HexagonPacketizerList : public VLIWPacketizerList {
private:
-
- // Has the instruction been promoted to a dot-new instruction.
- bool PromotedToDotNew;
-
- // Has the instruction been glued to allocframe.
- bool GlueAllocframeStore;
-
- // Has the feeder instruction been glued to new value jump.
- bool GlueToNewValueJump;
-
- // Check if there is a dependence between some instruction already in this
- // packet and this instruction.
- bool Dependence;
-
- // Only check for dependence if there are resources available to
- // schedule this instruction.
- bool FoundSequentialDependence;
-
- /// \brief A handle to the branch probability pass.
- const MachineBranchProbabilityInfo *MBPI;
-
- // Track MIs with ignored dependece.
- std::vector<MachineInstr*> IgnoreDepMIs;
-
- public:
- // Ctor.
- HexagonPacketizerList(MachineFunction &MF, MachineLoopInfo &MLI,
- const MachineBranchProbabilityInfo *MBPI);
-
- // initPacketizerState - initialize some internal flags.
- void initPacketizerState() override;
-
- // ignorePseudoInstruction - Ignore bundling of pseudo instructions.
- bool ignorePseudoInstruction(MachineInstr *MI,
- MachineBasicBlock *MBB) override;
-
- // isSoloInstruction - return true if instruction MI can not be packetized
- // with any other instruction, which means that MI itself is a packet.
- bool isSoloInstruction(MachineInstr *MI) override;
-
- // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ
- // together.
- bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override;
-
- // isLegalToPruneDependencies - Is it legal to prune dependece between SUI
- // and SUJ.
- bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override;
-
- MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override;
- private:
- bool IsCallDependent(MachineInstr* MI, SDep::Kind DepType, unsigned DepReg);
- bool PromoteToDotNew(MachineInstr* MI, SDep::Kind DepType,
- MachineBasicBlock::iterator &MII,
- const TargetRegisterClass* RC);
- bool CanPromoteToDotNew(MachineInstr *MI, SUnit *PacketSU, unsigned DepReg,
- const std::map<MachineInstr *, SUnit *> &MIToSUnit,
- MachineBasicBlock::iterator &MII,
- const TargetRegisterClass *RC);
- bool
- CanPromoteToNewValue(MachineInstr *MI, SUnit *PacketSU, unsigned DepReg,
- const std::map<MachineInstr *, SUnit *> &MIToSUnit,
- MachineBasicBlock::iterator &MII);
- bool CanPromoteToNewValueStore(
- MachineInstr *MI, MachineInstr *PacketMI, unsigned DepReg,
- const std::map<MachineInstr *, SUnit *> &MIToSUnit);
- bool DemoteToDotOld(MachineInstr *MI);
- bool ArePredicatesComplements(
- MachineInstr *MI1, MachineInstr *MI2,
- const std::map<MachineInstr *, SUnit *> &MIToSUnit);
- bool RestrictingDepExistInPacket(MachineInstr *, unsigned,
- const std::map<MachineInstr *, SUnit *> &);
- bool isNewifiable(MachineInstr* MI);
- bool isCondInst(MachineInstr* MI);
- bool tryAllocateResourcesForConstExt(MachineInstr* MI);
- bool canReserveResourcesForConstExt(MachineInstr *MI);
- void reserveResourcesForConstExt(MachineInstr* MI);
- bool isNewValueInst(MachineInstr* MI);
+ const HexagonInstrInfo *HII;
+ const HexagonRegisterInfo *HRI;
};
+
+ char HexagonPacketizer::ID = 0;
}
INITIALIZE_PASS_BEGIN(HexagonPacketizer, "packets", "Hexagon Packetizer",
@@ -177,26 +95,93 @@ INITIALIZE_PASS_BEGIN(HexagonPacketizer, "packets", "Hexagon Packetizer",
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(HexagonPacketizer, "packets", "Hexagon Packetizer",
false, false)
-// HexagonPacketizerList Ctor.
-HexagonPacketizerList::HexagonPacketizerList(
- MachineFunction &MF, MachineLoopInfo &MLI,
- const MachineBranchProbabilityInfo *MBPI)
- : VLIWPacketizerList(MF, MLI, true) {
- this->MBPI = MBPI;
+HexagonPacketizerList::HexagonPacketizerList(MachineFunction &MF,
+ MachineLoopInfo &MLI, AliasAnalysis *AA,
+ const MachineBranchProbabilityInfo *MBPI)
+ : VLIWPacketizerList(MF, MLI, AA), MBPI(MBPI), MLI(&MLI) {
+ HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+ HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
}
-bool HexagonPacketizer::runOnMachineFunction(MachineFunction &Fn) {
- const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo();
- MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
- const MachineBranchProbabilityInfo *MBPI =
- &getAnalysis<MachineBranchProbabilityInfo>();
+// Check if FirstI modifies a register that SecondI reads.
+static bool hasWriteToReadDep(const MachineInstr *FirstI,
+ const MachineInstr *SecondI, const TargetRegisterInfo *TRI) {
+ for (auto &MO : FirstI->operands()) {
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ unsigned R = MO.getReg();
+ if (SecondI->readsRegister(R, TRI))
+ return true;
+ }
+ return false;
+}
+
+
+static MachineBasicBlock::iterator moveInstrOut(MachineInstr *MI,
+ MachineBasicBlock::iterator BundleIt, bool Before) {
+ MachineBasicBlock::instr_iterator InsertPt;
+ if (Before)
+ InsertPt = BundleIt.getInstrIterator();
+ else
+ InsertPt = std::next(BundleIt).getInstrIterator();
+
+ MachineBasicBlock &B = *MI->getParent();
+ // The instruction should at least be bundled with the preceding instruction
+ // (there will always be one, i.e. BUNDLE, if nothing else).
+ assert(MI->isBundledWithPred());
+ if (MI->isBundledWithSucc()) {
+ MI->clearFlag(MachineInstr::BundledSucc);
+ MI->clearFlag(MachineInstr::BundledPred);
+ } else {
+ // If it's not bundled with the successor (i.e. it is the last one
+ // in the bundle), then we can simply unbundle it from the predecessor,
+ // which will take care of updating the predecessor's flag.
+ MI->unbundleFromPred();
+ }
+ B.splice(InsertPt, &B, MI);
+
+ // Get the size of the bundle without asserting.
+ MachineBasicBlock::const_instr_iterator I(BundleIt);
+ MachineBasicBlock::const_instr_iterator E = B.instr_end();
+ unsigned Size = 0;
+ for (++I; I != E && I->isBundledWithPred(); ++I)
+ ++Size;
+
+ // If there are still two or more instructions, then there is nothing
+ // else to be done.
+ if (Size > 1)
+ return BundleIt;
+
+ // Otherwise, extract the single instruction out and delete the bundle.
+ MachineBasicBlock::iterator NextIt = std::next(BundleIt);
+ MachineInstr *SingleI = BundleIt->getNextNode();
+ SingleI->unbundleFromPred();
+ assert(!SingleI->isBundledWithSucc());
+ BundleIt->eraseFromParent();
+ return NextIt;
+}
+
+
+bool HexagonPacketizer::runOnMachineFunction(MachineFunction &MF) {
+ if (DisablePacketizer)
+ return false;
+
+ HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+ HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+ auto &MLI = getAnalysis<MachineLoopInfo>();
+ auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto *MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+
+ if (EnableGenAllInsnClass)
+ HII->genAllInsnTimingClasses(MF);
+
// Instantiate the packetizer.
- HexagonPacketizerList Packetizer(Fn, MLI, MBPI);
+ HexagonPacketizerList Packetizer(MF, MLI, AA, MBPI);
// DFA state table should not be empty.
assert(Packetizer.getResourceTracker() && "Empty DFA table!");
@@ -211,162 +196,107 @@ bool HexagonPacketizer::runOnMachineFunction(MachineFunction &Fn) {
// dependence between Insn 0 and Insn 2. This can lead to incorrect
// packetization
//
- for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
- MBB != MBBe; ++MBB) {
- MachineBasicBlock::iterator End = MBB->end();
- MachineBasicBlock::iterator MI = MBB->begin();
+ for (auto &MB : MF) {
+ auto End = MB.end();
+ auto MI = MB.begin();
while (MI != End) {
+ auto NextI = std::next(MI);
if (MI->isKill()) {
- MachineBasicBlock::iterator DeleteMI = MI;
- ++MI;
- MBB->erase(DeleteMI);
- End = MBB->end();
- continue;
+ MB.erase(MI);
+ End = MB.end();
}
- ++MI;
+ MI = NextI;
}
}
// Loop over all of the basic blocks.
- for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
- MBB != MBBe; ++MBB) {
- // Find scheduling regions and schedule / packetize each region.
- unsigned RemainingCount = MBB->size();
- for(MachineBasicBlock::iterator RegionEnd = MBB->end();
- RegionEnd != MBB->begin();) {
- // The next region starts above the previous region. Look backward in the
- // instruction stream until we find the nearest boundary.
- MachineBasicBlock::iterator I = RegionEnd;
- for(;I != MBB->begin(); --I, --RemainingCount) {
- if (TII->isSchedulingBoundary(std::prev(I), MBB, Fn))
- break;
- }
- I = MBB->begin();
-
- // Skip empty scheduling regions.
- if (I == RegionEnd) {
- RegionEnd = std::prev(RegionEnd);
- --RemainingCount;
- continue;
- }
- // Skip regions with one instruction.
- if (I == std::prev(RegionEnd)) {
- RegionEnd = std::prev(RegionEnd);
- continue;
- }
-
- Packetizer.PacketizeMIs(MBB, I, RegionEnd);
- RegionEnd = I;
+ for (auto &MB : MF) {
+ auto Begin = MB.begin(), End = MB.end();
+ while (Begin != End) {
+ // Find the first non-boundary instruction, starting from the end of the
+ // last scheduling region.
+ MachineBasicBlock::iterator RB = Begin;
+ while (RB != End && HII->isSchedulingBoundary(RB, &MB, MF))
+ ++RB;
+ // Find the first boundary, starting from the beginning of the new region.
+ MachineBasicBlock::iterator RE = RB;
+ while (RE != End && !HII->isSchedulingBoundary(RE, &MB, MF))
+ ++RE;
+ // Add the scheduling boundary if it's not block end.
+ if (RE != End)
+ ++RE;
+ // If RB == End, then RE == End.
+ if (RB != End)
+ Packetizer.PacketizeMIs(&MB, RB, RE);
+
+ Begin = RE;
}
}
+ Packetizer.unpacketizeSoloInstrs(MF);
return true;
}
-static bool IsIndirectCall(MachineInstr* MI) {
- return MI->getOpcode() == Hexagon::J2_callr;
+// Reserve resources for a constant extender. Trigger an assertion if the
+// reservation fails.
+void HexagonPacketizerList::reserveResourcesForConstExt() {
+ if (!tryAllocateResourcesForConstExt(true))
+ llvm_unreachable("Resources not available");
}
-// Reserve resources for constant extender. Trigure an assertion if
-// reservation fail.
-void HexagonPacketizerList::reserveResourcesForConstExt(MachineInstr* MI) {
- const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
- MachineInstr *PseudoMI = MF.CreateMachineInstr(QII->get(Hexagon::A4_ext),
- MI->getDebugLoc());
-
- if (ResourceTracker->canReserveResources(PseudoMI)) {
- ResourceTracker->reserveResources(PseudoMI);
- MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI);
- } else {
- MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI);
- llvm_unreachable("can not reserve resources for constant extender.");
- }
- return;
+bool HexagonPacketizerList::canReserveResourcesForConstExt() {
+ return tryAllocateResourcesForConstExt(false);
}
-bool HexagonPacketizerList::canReserveResourcesForConstExt(MachineInstr *MI) {
- const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
- assert((QII->isExtended(MI) || QII->isConstExtended(MI)) &&
- "Should only be called for constant extended instructions");
- MachineInstr *PseudoMI = MF.CreateMachineInstr(QII->get(Hexagon::A4_ext),
- MI->getDebugLoc());
- bool CanReserve = ResourceTracker->canReserveResources(PseudoMI);
- MF.DeleteMachineInstr(PseudoMI);
- return CanReserve;
+// Allocate resources (i.e. 4 bytes) for constant extender. If succeeded,
+// return true, otherwise, return false.
+bool HexagonPacketizerList::tryAllocateResourcesForConstExt(bool Reserve) {
+ auto *ExtMI = MF.CreateMachineInstr(HII->get(Hexagon::A4_ext), DebugLoc());
+ bool Avail = ResourceTracker->canReserveResources(ExtMI);
+ if (Reserve && Avail)
+ ResourceTracker->reserveResources(ExtMI);
+ MF.DeleteMachineInstr(ExtMI);
+ return Avail;
}
-// Allocate resources (i.e. 4 bytes) for constant extender. If succeed, return
-// true, otherwise, return false.
-bool HexagonPacketizerList::tryAllocateResourcesForConstExt(MachineInstr* MI) {
- const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
- MachineInstr *PseudoMI = MF.CreateMachineInstr(QII->get(Hexagon::A4_ext),
- MI->getDebugLoc());
- if (ResourceTracker->canReserveResources(PseudoMI)) {
- ResourceTracker->reserveResources(PseudoMI);
- MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI);
+bool HexagonPacketizerList::isCallDependent(const MachineInstr* MI,
+ SDep::Kind DepType, unsigned DepReg) {
+ // Check for LR dependence.
+ if (DepReg == HRI->getRARegister())
return true;
- } else {
- MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI);
- return false;
- }
-}
-
-
-bool HexagonPacketizerList::IsCallDependent(MachineInstr* MI,
- SDep::Kind DepType,
- unsigned DepReg) {
-
- const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
- const HexagonRegisterInfo *QRI =
- (const HexagonRegisterInfo *)MF.getSubtarget().getRegisterInfo();
-
- // Check for lr dependence
- if (DepReg == QRI->getRARegister()) {
- return true;
- }
- if (QII->isDeallocRet(MI)) {
- if (DepReg == QRI->getFrameRegister() ||
- DepReg == QRI->getStackRegister())
+ if (HII->isDeallocRet(MI))
+ if (DepReg == HRI->getFrameRegister() || DepReg == HRI->getStackRegister())
return true;
- }
- // Check if this is a predicate dependence
- const TargetRegisterClass* RC = QRI->getMinimalPhysRegClass(DepReg);
- if (RC == &Hexagon::PredRegsRegClass) {
+ // Check if this is a predicate dependence.
+ const TargetRegisterClass* RC = HRI->getMinimalPhysRegClass(DepReg);
+ if (RC == &Hexagon::PredRegsRegClass)
return true;
- }
- //
- // Lastly check for an operand used in an indirect call
- // If we had an attribute for checking if an instruction is an indirect call,
- // then we could have avoided this relatively brittle implementation of
- // IsIndirectCall()
- //
- // Assumes that the first operand of the CALLr is the function address
- //
- if (IsIndirectCall(MI) && (DepType == SDep::Data)) {
+ // Assumes that the first operand of the CALLr is the function address.
+ if (HII->isIndirectCall(MI) && (DepType == SDep::Data)) {
MachineOperand MO = MI->getOperand(0);
- if (MO.isReg() && MO.isUse() && (MO.getReg() == DepReg)) {
+ if (MO.isReg() && MO.isUse() && (MO.getReg() == DepReg))
return true;
- }
}
return false;
}
-static bool IsRegDependence(const SDep::Kind DepType) {
- return (DepType == SDep::Data || DepType == SDep::Anti ||
- DepType == SDep::Output);
+static bool isRegDependence(const SDep::Kind DepType) {
+ return DepType == SDep::Data || DepType == SDep::Anti ||
+ DepType == SDep::Output;
}
-static bool IsDirectJump(MachineInstr* MI) {
- return (MI->getOpcode() == Hexagon::J2_jump);
+static bool isDirectJump(const MachineInstr* MI) {
+ return MI->getOpcode() == Hexagon::J2_jump;
}
-static bool IsSchedBarrier(MachineInstr* MI) {
+static bool isSchedBarrier(const MachineInstr* MI) {
switch (MI->getOpcode()) {
case Hexagon::Y2_barrier:
return true;
@@ -374,76 +304,127 @@ static bool IsSchedBarrier(MachineInstr* MI) {
return false;
}
-static bool IsControlFlow(MachineInstr* MI) {
+static bool isControlFlow(const MachineInstr* MI) {
return (MI->getDesc().isTerminator() || MI->getDesc().isCall());
}
-static bool IsLoopN(MachineInstr *MI) {
- return (MI->getOpcode() == Hexagon::J2_loop0i ||
- MI->getOpcode() == Hexagon::J2_loop0r);
-}
-/// DoesModifyCalleeSavedReg - Returns true if the instruction modifies a
-/// callee-saved register.
-static bool DoesModifyCalleeSavedReg(MachineInstr *MI,
+/// Returns true if the instruction modifies a callee-saved register.
+static bool doesModifyCalleeSavedReg(const MachineInstr *MI,
const TargetRegisterInfo *TRI) {
- for (const MCPhysReg *CSR =
- TRI->getCalleeSavedRegs(MI->getParent()->getParent());
- *CSR; ++CSR) {
- unsigned CalleeSavedReg = *CSR;
- if (MI->modifiesRegister(CalleeSavedReg, TRI))
+ const MachineFunction &MF = *MI->getParent()->getParent();
+ for (auto *CSR = TRI->getCalleeSavedRegs(&MF); CSR && *CSR; ++CSR)
+ if (MI->modifiesRegister(*CSR, TRI))
return true;
- }
return false;
}
-// Returns true if an instruction can be promoted to .new predicate
-// or new-value store.
-bool HexagonPacketizerList::isNewifiable(MachineInstr* MI) {
- const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
- return isCondInst(MI) || QII->mayBeNewStore(MI);
+// TODO: MI->isIndirectBranch() and IsRegisterJump(MI)
+// Returns true if an instruction can be promoted to .new predicate or
+// new-value store.
+bool HexagonPacketizerList::isNewifiable(const MachineInstr* MI) {
+ return HII->isCondInst(MI) || MI->isReturn() || HII->mayBeNewStore(MI);
}
-bool HexagonPacketizerList::isCondInst (MachineInstr* MI) {
- const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
- const MCInstrDesc& TID = MI->getDesc();
- // bug 5670: until that is fixed,
- // this portion is disabled.
- if ( TID.isConditionalBranch() // && !IsRegisterJump(MI)) ||
- || QII->isConditionalTransfer(MI)
- || QII->isConditionalALU32(MI)
- || QII->isConditionalLoad(MI)
- || QII->isConditionalStore(MI)) {
- return true;
+// Promote an instruction to its .cur form.
+// At this time, we have already made a call to canPromoteToDotCur and made
+// sure that it can *indeed* be promoted.
+bool HexagonPacketizerList::promoteToDotCur(MachineInstr* MI,
+ SDep::Kind DepType, MachineBasicBlock::iterator &MII,
+ const TargetRegisterClass* RC) {
+ assert(DepType == SDep::Data);
+ int CurOpcode = HII->getDotCurOp(MI);
+ MI->setDesc(HII->get(CurOpcode));
+ return true;
+}
+
+void HexagonPacketizerList::cleanUpDotCur() {
+  MachineInstr *MI = nullptr;
+ for (auto BI : CurrentPacketMIs) {
+ DEBUG(dbgs() << "Cleanup packet has "; BI->dump(););
+ if (BI->getOpcode() == Hexagon::V6_vL32b_cur_ai) {
+ MI = BI;
+ continue;
+ }
+ if (MI) {
+ for (auto &MO : BI->operands())
+ if (MO.isReg() && MO.getReg() == MI->getOperand(0).getReg())
+ return;
+ }
}
- return false;
+ if (!MI)
+ return;
+ // We did not find a use of the CUR, so de-cur it.
+ MI->setDesc(HII->get(Hexagon::V6_vL32b_ai));
+ DEBUG(dbgs() << "Demoted CUR "; MI->dump(););
}
+// Check to see if an instruction can be dot cur.
+bool HexagonPacketizerList::canPromoteToDotCur(const MachineInstr *MI,
+ const SUnit *PacketSU, unsigned DepReg, MachineBasicBlock::iterator &MII,
+ const TargetRegisterClass *RC) {
+ if (!HII->isV60VectorInstruction(MI))
+ return false;
+ if (!HII->isV60VectorInstruction(MII))
+ return false;
-// Promote an instructiont to its .new form.
-// At this time, we have already made a call to CanPromoteToDotNew
-// and made sure that it can *indeed* be promoted.
-bool HexagonPacketizerList::PromoteToDotNew(MachineInstr* MI,
- SDep::Kind DepType, MachineBasicBlock::iterator &MII,
- const TargetRegisterClass* RC) {
+  // Already a dot cur instruction.
+ if (HII->isDotCurInst(MI) && !HII->mayBeCurLoad(MI))
+ return false;
- assert (DepType == SDep::Data);
- const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+ if (!HII->mayBeCurLoad(MI))
+ return false;
+
+ // The "cur value" cannot come from inline asm.
+ if (PacketSU->getInstr()->isInlineAsm())
+ return false;
+
+ // Make sure candidate instruction uses cur.
+ DEBUG(dbgs() << "Can we DOT Cur Vector MI\n";
+ MI->dump();
+ dbgs() << "in packet\n";);
+ MachineInstr *MJ = MII;
+ DEBUG(dbgs() << "Checking CUR against "; MJ->dump(););
+ unsigned DestReg = MI->getOperand(0).getReg();
+ bool FoundMatch = false;
+ for (auto &MO : MJ->operands())
+ if (MO.isReg() && MO.getReg() == DestReg)
+ FoundMatch = true;
+ if (!FoundMatch)
+ return false;
+
+ // Check for existing uses of a vector register within the packet which
+  // would be affected by converting a vector load into .cur form.
+ for (auto BI : CurrentPacketMIs) {
+ DEBUG(dbgs() << "packet has "; BI->dump(););
+ if (BI->readsRegister(DepReg, MF.getSubtarget().getRegisterInfo()))
+ return false;
+ }
+
+ DEBUG(dbgs() << "Can Dot CUR MI\n"; MI->dump(););
+ // We can convert the opcode into a .cur.
+ return true;
+}
+// Promote an instruction to its .new form. At this time, we have already
+// made a call to canPromoteToDotNew and made sure that it can *indeed* be
+// promoted.
+bool HexagonPacketizerList::promoteToDotNew(MachineInstr* MI,
+ SDep::Kind DepType, MachineBasicBlock::iterator &MII,
+ const TargetRegisterClass* RC) {
+ assert (DepType == SDep::Data);
int NewOpcode;
if (RC == &Hexagon::PredRegsRegClass)
- NewOpcode = QII->GetDotNewPredOp(MI, MBPI);
+ NewOpcode = HII->getDotNewPredOp(MI, MBPI);
else
- NewOpcode = QII->GetDotNewOp(MI);
- MI->setDesc(QII->get(NewOpcode));
-
+ NewOpcode = HII->getDotNewOp(MI);
+ MI->setDesc(HII->get(NewOpcode));
return true;
}
-bool HexagonPacketizerList::DemoteToDotOld(MachineInstr* MI) {
- const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
- int NewOpcode = QII->GetDotOldOp(MI->getOpcode());
- MI->setDesc(QII->get(NewOpcode));
+bool HexagonPacketizerList::demoteToDotOld(MachineInstr* MI) {
+ int NewOpcode = HII->getDotOldOp(MI->getOpcode());
+ MI->setDesc(HII->get(NewOpcode));
return true;
}
@@ -455,175 +436,173 @@ enum PredicateKind {
/// Returns true if an instruction is predicated on p0 and false if it's
/// predicated on !p0.
-static PredicateKind getPredicateSense(MachineInstr* MI,
- const HexagonInstrInfo *QII) {
- if (!QII->isPredicated(MI))
+static PredicateKind getPredicateSense(const MachineInstr *MI,
+ const HexagonInstrInfo *HII) {
+ if (!HII->isPredicated(MI))
return PK_Unknown;
-
- if (QII->isPredicatedTrue(MI))
+ if (HII->isPredicatedTrue(MI))
return PK_True;
-
return PK_False;
}
-static MachineOperand& GetPostIncrementOperand(MachineInstr *MI,
- const HexagonInstrInfo *QII) {
- assert(QII->isPostIncrement(MI) && "Not a post increment operation.");
+static const MachineOperand &getPostIncrementOperand(const MachineInstr *MI,
+ const HexagonInstrInfo *HII) {
+ assert(HII->isPostIncrement(MI) && "Not a post increment operation.");
#ifndef NDEBUG
// Post Increment means duplicates. Use dense map to find duplicates in the
// list. Caution: Densemap initializes with the minimum of 64 buckets,
// whereas there are at most 5 operands in the post increment.
- DenseMap<unsigned, unsigned> DefRegsSet;
- for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++)
- if (MI->getOperand(opNum).isReg() &&
- MI->getOperand(opNum).isDef()) {
- DefRegsSet[MI->getOperand(opNum).getReg()] = 1;
- }
-
- for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++)
- if (MI->getOperand(opNum).isReg() &&
- MI->getOperand(opNum).isUse()) {
- if (DefRegsSet[MI->getOperand(opNum).getReg()]) {
- return MI->getOperand(opNum);
- }
- }
+ DenseSet<unsigned> DefRegsSet;
+ for (auto &MO : MI->operands())
+ if (MO.isReg() && MO.isDef())
+ DefRegsSet.insert(MO.getReg());
+
+ for (auto &MO : MI->operands())
+ if (MO.isReg() && MO.isUse() && DefRegsSet.count(MO.getReg()))
+ return MO;
#else
- if (MI->getDesc().mayLoad()) {
+ if (MI->mayLoad()) {
+ const MachineOperand &Op1 = MI->getOperand(1);
// The 2nd operand is always the post increment operand in load.
- assert(MI->getOperand(1).isReg() &&
- "Post increment operand has be to a register.");
- return (MI->getOperand(1));
+    assert(Op1.isReg() && "Post increment operand has to be a register.");
+ return Op1;
}
if (MI->getDesc().mayStore()) {
+ const MachineOperand &Op0 = MI->getOperand(0);
// The 1st operand is always the post increment operand in store.
- assert(MI->getOperand(0).isReg() &&
- "Post increment operand has be to a register.");
- return (MI->getOperand(0));
+    assert(Op0.isReg() && "Post increment operand has to be a register.");
+ return Op0;
}
#endif
// we should never come here.
llvm_unreachable("mayLoad or mayStore not set for Post Increment operation");
}
-// get the value being stored
-static MachineOperand& GetStoreValueOperand(MachineInstr *MI) {
+// Get the value being stored.
+static const MachineOperand& getStoreValueOperand(const MachineInstr *MI) {
// value being stored is always the last operand.
- return (MI->getOperand(MI->getNumOperands()-1));
+ return MI->getOperand(MI->getNumOperands()-1);
+}
+
+static bool isLoadAbsSet(const MachineInstr *MI) {
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ case Hexagon::L4_loadrd_ap:
+ case Hexagon::L4_loadrb_ap:
+ case Hexagon::L4_loadrh_ap:
+ case Hexagon::L4_loadrub_ap:
+ case Hexagon::L4_loadruh_ap:
+ case Hexagon::L4_loadri_ap:
+ return true;
+ }
+ return false;
}
-// can be new value store?
+static const MachineOperand &getAbsSetOperand(const MachineInstr *MI) {
+ assert(isLoadAbsSet(MI));
+ return MI->getOperand(1);
+}
+
+
+// Can be new value store?
// Following restrictions are to be respected in convert a store into
// a new value store.
// 1. If an instruction uses auto-increment, its address register cannot
// be a new-value register. Arch Spec 5.4.2.1
-// 2. If an instruction uses absolute-set addressing mode,
-// its address register cannot be a new-value register.
-// Arch Spec 5.4.2.1.TODO: This is not enabled as
-// as absolute-set address mode patters are not implemented.
+// 2. If an instruction uses absolute-set addressing mode, its address
+// register cannot be a new-value register. Arch Spec 5.4.2.1.
// 3. If an instruction produces a 64-bit result, its registers cannot be used
// as new-value registers. Arch Spec 5.4.2.2.
-// 4. If the instruction that sets a new-value register is conditional, then
+// 4. If the instruction that sets the new-value register is conditional, then
// the instruction that uses the new-value register must also be conditional,
// and both must always have their predicates evaluate identically.
// Arch Spec 5.4.2.3.
-// 5. There is an implied restriction of a packet can not have another store,
-// if there is a new value store in the packet. Corollary, if there is
+// 5. There is an implied restriction that a packet cannot have another store,
+// if there is a new value store in the packet. Corollary: if there is
// already a store in a packet, there can not be a new value store.
// Arch Spec: 3.4.4.2
-bool HexagonPacketizerList::CanPromoteToNewValueStore(
- MachineInstr *MI, MachineInstr *PacketMI, unsigned DepReg,
- const std::map<MachineInstr *, SUnit *> &MIToSUnit) {
- const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+bool HexagonPacketizerList::canPromoteToNewValueStore(const MachineInstr *MI,
+ const MachineInstr *PacketMI, unsigned DepReg) {
// Make sure we are looking at the store, that can be promoted.
- if (!QII->mayBeNewStore(MI))
+ if (!HII->mayBeNewStore(MI))
return false;
- // Make sure there is dependency and can be new value'ed
- if (GetStoreValueOperand(MI).isReg() &&
- GetStoreValueOperand(MI).getReg() != DepReg)
+ // Make sure there is dependency and can be new value'd.
+ const MachineOperand &Val = getStoreValueOperand(MI);
+ if (Val.isReg() && Val.getReg() != DepReg)
return false;
- const HexagonRegisterInfo *QRI =
- (const HexagonRegisterInfo *)MF.getSubtarget().getRegisterInfo();
const MCInstrDesc& MCID = PacketMI->getDesc();
- // first operand is always the result
-
- const TargetRegisterClass* PacketRC = QII->getRegClass(MCID, 0, QRI, MF);
-
- // if there is already an store in the packet, no can do new value store
- // Arch Spec 3.4.4.2.
- for (std::vector<MachineInstr*>::iterator VI = CurrentPacketMIs.begin(),
- VE = CurrentPacketMIs.end();
- (VI != VE); ++VI) {
- SUnit *PacketSU = MIToSUnit.find(*VI)->second;
- if (PacketSU->getInstr()->getDesc().mayStore() ||
- // if we have mayStore = 1 set on ALLOCFRAME and DEALLOCFRAME,
- // then we don't need this
- PacketSU->getInstr()->getOpcode() == Hexagon::S2_allocframe ||
- PacketSU->getInstr()->getOpcode() == Hexagon::L2_deallocframe)
- return false;
- }
- if (PacketRC == &Hexagon::DoubleRegsRegClass) {
- // new value store constraint: double regs can not feed into new value store
- // arch spec section: 5.4.2.2
+ // First operand is always the result.
+ const TargetRegisterClass *PacketRC = HII->getRegClass(MCID, 0, HRI, MF);
+ // Double regs can not feed into new value store: PRM section: 5.4.2.2.
+ if (PacketRC == &Hexagon::DoubleRegsRegClass)
return false;
+
+ // New-value stores are of class NV (slot 0), dual stores require class ST
+ // in slot 0 (PRM 5.5).
+ for (auto I : CurrentPacketMIs) {
+ SUnit *PacketSU = MIToSUnit.find(I)->second;
+ if (PacketSU->getInstr()->mayStore())
+ return false;
}
// Make sure it's NOT the post increment register that we are going to
// new value.
- if (QII->isPostIncrement(MI) &&
- MI->getDesc().mayStore() &&
- GetPostIncrementOperand(MI, QII).getReg() == DepReg) {
+ if (HII->isPostIncrement(MI) &&
+ getPostIncrementOperand(MI, HII).getReg() == DepReg) {
return false;
}
- if (QII->isPostIncrement(PacketMI) &&
- PacketMI->getDesc().mayLoad() &&
- GetPostIncrementOperand(PacketMI, QII).getReg() == DepReg) {
- // if source is post_inc, or absolute-set addressing,
- // it can not feed into new value store
- // r3 = memw(r2++#4)
- // memw(r30 + #-1404) = r2.new -> can not be new value store
- // arch spec section: 5.4.2.1
+ if (HII->isPostIncrement(PacketMI) && PacketMI->mayLoad() &&
+ getPostIncrementOperand(PacketMI, HII).getReg() == DepReg) {
+ // If source is post_inc, or absolute-set addressing, it can not feed
+ // into new value store
+ // r3 = memw(r2++#4)
+ // memw(r30 + #-1404) = r2.new -> can not be new value store
+ // arch spec section: 5.4.2.1.
return false;
}
+ if (isLoadAbsSet(PacketMI) && getAbsSetOperand(PacketMI).getReg() == DepReg)
+ return false;
+
// If the source that feeds the store is predicated, new value store must
// also be predicated.
- if (QII->isPredicated(PacketMI)) {
- if (!QII->isPredicated(MI))
+ if (HII->isPredicated(PacketMI)) {
+ if (!HII->isPredicated(MI))
return false;
// Check to make sure that they both will have their predicates
- // evaluate identically
+ // evaluate identically.
unsigned predRegNumSrc = 0;
unsigned predRegNumDst = 0;
const TargetRegisterClass* predRegClass = nullptr;
- // Get predicate register used in the source instruction
- for(unsigned opNum = 0; opNum < PacketMI->getNumOperands(); opNum++) {
- if ( PacketMI->getOperand(opNum).isReg())
- predRegNumSrc = PacketMI->getOperand(opNum).getReg();
- predRegClass = QRI->getMinimalPhysRegClass(predRegNumSrc);
- if (predRegClass == &Hexagon::PredRegsRegClass) {
+ // Get predicate register used in the source instruction.
+ for (auto &MO : PacketMI->operands()) {
+ if (!MO.isReg())
+ continue;
+ predRegNumSrc = MO.getReg();
+ predRegClass = HRI->getMinimalPhysRegClass(predRegNumSrc);
+ if (predRegClass == &Hexagon::PredRegsRegClass)
break;
- }
}
- assert ((predRegClass == &Hexagon::PredRegsRegClass ) &&
- ("predicate register not found in a predicated PacketMI instruction"));
-
- // Get predicate register used in new-value store instruction
- for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) {
- if ( MI->getOperand(opNum).isReg())
- predRegNumDst = MI->getOperand(opNum).getReg();
- predRegClass = QRI->getMinimalPhysRegClass(predRegNumDst);
- if (predRegClass == &Hexagon::PredRegsRegClass) {
+ assert((predRegClass == &Hexagon::PredRegsRegClass) &&
+ "predicate register not found in a predicated PacketMI instruction");
+
+ // Get predicate register used in new-value store instruction.
+ for (auto &MO : MI->operands()) {
+ if (!MO.isReg())
+ continue;
+ predRegNumDst = MO.getReg();
+ predRegClass = HRI->getMinimalPhysRegClass(predRegNumDst);
+ if (predRegClass == &Hexagon::PredRegsRegClass)
break;
- }
}
- assert ((predRegClass == &Hexagon::PredRegsRegClass ) &&
- ("predicate register not found in a predicated MI instruction"));
+ assert((predRegClass == &Hexagon::PredRegsRegClass) &&
+ "predicate register not found in a predicated MI instruction");
// New-value register producer and user (store) need to satisfy these
// constraints:
@@ -632,13 +611,11 @@ bool HexagonPacketizerList::CanPromoteToNewValueStore(
// should also be .new predicated and if producer is not .new predicated
// then store should not be .new predicated.
// 3) Both new-value register producer and user should have same predicate
- // sense, i.e, either both should be negated or both should be none negated.
-
- if (( predRegNumDst != predRegNumSrc) ||
- QII->isDotNewInst(PacketMI) != QII->isDotNewInst(MI) ||
- getPredicateSense(MI, QII) != getPredicateSense(PacketMI, QII)) {
+    // sense, i.e., either both should be negated or both should be non-negated.
+ if (predRegNumDst != predRegNumSrc ||
+ HII->isDotNewInst(PacketMI) != HII->isDotNewInst(MI) ||
+ getPredicateSense(MI, HII) != getPredicateSense(PacketMI, HII))
return false;
- }
}
// Make sure that other than the new-value register no other store instruction
@@ -649,81 +626,77 @@ bool HexagonPacketizerList::CanPromoteToNewValueStore(
   // including PacketMI. However, we need to perform the check for the
// remaining instructions in the packet.
- std::vector<MachineInstr*>::iterator VI;
- std::vector<MachineInstr*>::iterator VE;
unsigned StartCheck = 0;
- for (VI=CurrentPacketMIs.begin(), VE = CurrentPacketMIs.end();
- (VI != VE); ++VI) {
- SUnit *TempSU = MIToSUnit.find(*VI)->second;
+ for (auto I : CurrentPacketMIs) {
+ SUnit *TempSU = MIToSUnit.find(I)->second;
MachineInstr* TempMI = TempSU->getInstr();
// Following condition is true for all the instructions until PacketMI is
// reached (StartCheck is set to 0 before the for loop).
// StartCheck flag is 1 for all the instructions after PacketMI.
- if (TempMI != PacketMI && !StartCheck) // start processing only after
- continue; // encountering PacketMI
+ if (TempMI != PacketMI && !StartCheck) // Start processing only after
+ continue; // encountering PacketMI.
StartCheck = 1;
- if (TempMI == PacketMI) // We don't want to check PacketMI for dependence
+ if (TempMI == PacketMI) // We don't want to check PacketMI for dependence.
continue;
- for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) {
- if (MI->getOperand(opNum).isReg() &&
- TempSU->getInstr()->modifiesRegister(MI->getOperand(opNum).getReg(),
- QRI))
+ for (auto &MO : MI->operands())
+ if (MO.isReg() && TempSU->getInstr()->modifiesRegister(MO.getReg(), HRI))
return false;
- }
}
// Make sure that for non-POST_INC stores:
// 1. The only use of reg is DepReg and no other registers.
// This handles V4 base+index registers.
// The following store can not be dot new.
- // Eg. r0 = add(r0, #3)a
+ // Eg. r0 = add(r0, #3)
// memw(r1+r0<<#2) = r0
- if (!QII->isPostIncrement(MI) &&
- GetStoreValueOperand(MI).isReg() &&
- GetStoreValueOperand(MI).getReg() == DepReg) {
- for(unsigned opNum = 0; opNum < MI->getNumOperands()-1; opNum++) {
- if (MI->getOperand(opNum).isReg() &&
- MI->getOperand(opNum).getReg() == DepReg) {
- return false;
- }
- }
- // 2. If data definition is because of implicit definition of the register,
- // do not newify the store. Eg.
- // %R9<def> = ZXTH %R12, %D6<imp-use>, %R12<imp-def>
- // STrih_indexed %R8, 2, %R12<kill>; mem:ST2[%scevgep343]
- for(unsigned opNum = 0; opNum < PacketMI->getNumOperands(); opNum++) {
- if (PacketMI->getOperand(opNum).isReg() &&
- PacketMI->getOperand(opNum).getReg() == DepReg &&
- PacketMI->getOperand(opNum).isDef() &&
- PacketMI->getOperand(opNum).isImplicit()) {
+ if (!HII->isPostIncrement(MI)) {
+ for (unsigned opNum = 0; opNum < MI->getNumOperands()-1; opNum++) {
+ const MachineOperand &MO = MI->getOperand(opNum);
+ if (MO.isReg() && MO.getReg() == DepReg)
return false;
- }
}
}
+ // If data definition is because of implicit definition of the register,
+ // do not newify the store. Eg.
+ // %R9<def> = ZXTH %R12, %D6<imp-use>, %R12<imp-def>
+ // S2_storerh_io %R8, 2, %R12<kill>; mem:ST2[%scevgep343]
+ for (auto &MO : PacketMI->operands()) {
+ if (!MO.isReg() || !MO.isDef() || !MO.isImplicit())
+ continue;
+ unsigned R = MO.getReg();
+ if (R == DepReg || HRI->isSuperRegister(DepReg, R))
+ return false;
+ }
+
+  // Handle the imp-use of a super reg case. There is a change on the
+  // target-independent side that should prevent this situation, but handle
+  // it here just in case. For example, we cannot newify R2 in the following
+  // case:
+ // %R3<def> = A2_tfrsi 0;
+ // S2_storeri_io %R0<kill>, 0, %R2<kill>, %D1<imp-use,kill>;
+ for (auto &MO : MI->operands()) {
+ if (MO.isReg() && MO.isUse() && MO.isImplicit() && MO.getReg() == DepReg)
+ return false;
+ }
+
// Can be dot new store.
return true;
}
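
For readers following the rule numbers in the comment block before canPromoteToNewValueStore, the sketch below is a simplified, self-contained model of a few of those checks (the stored value must be the dependent register, 64-bit producers cannot feed .new, one store per packet, and an auto-increment address register cannot be newified). The Insn struct and its field names are assumptions for illustration only, not the real MachineInstr queries.

#include <iostream>
#include <vector>

struct Insn {
  bool IsStore = false;
  bool Is64BitDef = false;   // rule 3: 64-bit producers cannot feed .new
  bool PostIncrement = false;
  int  PostIncReg = -1;      // rule 1: the auto-increment address register
  int  StoredReg = -1;       // register whose value is stored
};

static bool canBeNewValueStore(const Insn &Store, const Insn &Producer,
                               int DepReg, const std::vector<Insn> &Packet) {
  if (Store.StoredReg != DepReg)                          // must store DepReg
    return false;
  if (Producer.Is64BitDef)                                // rule 3
    return false;
  for (const Insn &P : Packet)                            // rule 5: one store
    if (P.IsStore)
      return false;
  if (Store.PostIncrement && Store.PostIncReg == DepReg)  // rule 1
    return false;
  return true;
}

int main() {
  Insn Producer;             // e.g. r2 = ...
  Insn Store;
  Store.IsStore = true;
  Store.StoredReg = 2;       // memw(...) = r2, candidate for r2.new
  std::cout << canBeNewValueStore(Store, Producer, 2, {}) << "\n";  // 1
}
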
-// can this MI to promoted to either
-// new value store or new value jump
-bool HexagonPacketizerList::CanPromoteToNewValue(
- MachineInstr *MI, SUnit *PacketSU, unsigned DepReg,
- const std::map<MachineInstr *, SUnit *> &MIToSUnit,
- MachineBasicBlock::iterator &MII) {
-
- const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
- if (!QII->mayBeNewStore(MI))
+// Can this MI be promoted to either a new-value store or a new-value jump?
+bool HexagonPacketizerList::canPromoteToNewValue(const MachineInstr *MI,
+ const SUnit *PacketSU, unsigned DepReg,
+ MachineBasicBlock::iterator &MII) {
+ if (!HII->mayBeNewStore(MI))
return false;
- MachineInstr *PacketMI = PacketSU->getInstr();
-
// Check to see the store can be new value'ed.
- if (CanPromoteToNewValueStore(MI, PacketMI, DepReg, MIToSUnit))
+ MachineInstr *PacketMI = PacketSU->getInstr();
+ if (canPromoteToNewValueStore(MI, PacketMI, DepReg))
return true;
// Check to see the compare/jump can be new value'ed.
@@ -731,93 +704,110 @@ bool HexagonPacketizerList::CanPromoteToNewValue(
return false;
}
+static bool isImplicitDependency(const MachineInstr *I, unsigned DepReg) {
+ for (auto &MO : I->operands())
+ if (MO.isReg() && MO.isDef() && (MO.getReg() == DepReg) && MO.isImplicit())
+ return true;
+ return false;
+}
+
// Check to see if an instruction can be dot new
// There are three kinds.
// 1. dot new on predicate - V2/V3/V4
// 2. dot new on stores NV/ST - V4
// 3. dot new on jump NV/J - V4 -- This is generated in a pass.
-bool HexagonPacketizerList::CanPromoteToDotNew(
- MachineInstr *MI, SUnit *PacketSU, unsigned DepReg,
- const std::map<MachineInstr *, SUnit *> &MIToSUnit,
- MachineBasicBlock::iterator &MII, const TargetRegisterClass *RC) {
- const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr *MI,
+ const SUnit *PacketSU, unsigned DepReg, MachineBasicBlock::iterator &MII,
+ const TargetRegisterClass* RC) {
// Already a dot new instruction.
- if (QII->isDotNewInst(MI) && !QII->mayBeNewStore(MI))
+ if (HII->isDotNewInst(MI) && !HII->mayBeNewStore(MI))
return false;
if (!isNewifiable(MI))
return false;
+ const MachineInstr *PI = PacketSU->getInstr();
+
+ // The "new value" cannot come from inline asm.
+ if (PI->isInlineAsm())
+ return false;
+
+ // IMPLICIT_DEFs won't materialize as real instructions, so .new makes no
+ // sense.
+ if (PI->isImplicitDef())
+ return false;
+
+  // If the dependency is through an implicitly defined register, we should
+  // not newify the use.
+ if (isImplicitDependency(PI, DepReg))
+ return false;
+
+ const MCInstrDesc& MCID = PI->getDesc();
+ const TargetRegisterClass *VecRC = HII->getRegClass(MCID, 0, HRI, MF);
+ if (DisableVecDblNVStores && VecRC == &Hexagon::VecDblRegsRegClass)
+ return false;
+
// predicate .new
- if (RC == &Hexagon::PredRegsRegClass && isCondInst(MI))
- return true;
- else if (RC != &Hexagon::PredRegsRegClass &&
- !QII->mayBeNewStore(MI)) // MI is not a new-value store
+ // bug 5670: until that is fixed
+ // TODO: MI->isIndirectBranch() and IsRegisterJump(MI)
+ if (RC == &Hexagon::PredRegsRegClass)
+ if (HII->isCondInst(MI) || MI->isReturn())
+ return HII->predCanBeUsedAsDotNew(PI, DepReg);
+
+ if (RC != &Hexagon::PredRegsRegClass && !HII->mayBeNewStore(MI))
+ return false;
+
+ // Create a dot new machine instruction to see if resources can be
+ // allocated. If not, bail out now.
+ int NewOpcode = HII->getDotNewOp(MI);
+ const MCInstrDesc &D = HII->get(NewOpcode);
+ MachineInstr *NewMI = MF.CreateMachineInstr(D, DebugLoc());
+ bool ResourcesAvailable = ResourceTracker->canReserveResources(NewMI);
+ MF.DeleteMachineInstr(NewMI);
+ if (!ResourcesAvailable)
+ return false;
+
+ // New Value Store only. New Value Jump generated as a separate pass.
+ if (!canPromoteToNewValue(MI, PacketSU, DepReg, MII))
return false;
- else {
- // Create a dot new machine instruction to see if resources can be
- // allocated. If not, bail out now.
- int NewOpcode = QII->GetDotNewOp(MI);
- const MCInstrDesc &desc = QII->get(NewOpcode);
- DebugLoc dl;
- MachineInstr *NewMI =
- MI->getParent()->getParent()->CreateMachineInstr(desc, dl);
- bool ResourcesAvailable = ResourceTracker->canReserveResources(NewMI);
- MI->getParent()->getParent()->DeleteMachineInstr(NewMI);
-
- if (!ResourcesAvailable)
- return false;
- // new value store only
- // new new value jump generated as a passes
- if (!CanPromoteToNewValue(MI, PacketSU, DepReg, MIToSUnit, MII)) {
- return false;
- }
- }
return true;
}
-// Go through the packet instructions and search for anti dependency
-// between them and DepReg from MI
-// Consider this case:
+// Go through the packet instructions and search for an anti dependency between
+// them and DepReg from MI. Consider this case:
// Trying to add
// a) %R1<def> = TFRI_cdNotPt %P3, 2
// to this packet:
// {
-// b) %P0<def> = OR_pp %P3<kill>, %P0<kill>
-// c) %P3<def> = TFR_PdRs %R23
-// d) %R1<def> = TFRI_cdnPt %P3, 4
+// b) %P0<def> = C2_or %P3<kill>, %P0<kill>
+// c) %P3<def> = C2_tfrrp %R23
+// d) %R1<def> = C2_cmovenewit %P3, 4
// }
// The P3 from a) and d) will be complements after
// a)'s P3 is converted to .new form
-// Anti Dep between c) and b) is irrelevant for this case
-bool HexagonPacketizerList::RestrictingDepExistInPacket(
- MachineInstr *MI, unsigned DepReg,
- const std::map<MachineInstr *, SUnit *> &MIToSUnit) {
-
- const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+// Anti-dep between c) and b) is irrelevant for this case
+bool HexagonPacketizerList::restrictingDepExistInPacket(MachineInstr* MI,
+ unsigned DepReg) {
SUnit *PacketSUDep = MIToSUnit.find(MI)->second;
- for (std::vector<MachineInstr*>::iterator VIN = CurrentPacketMIs.begin(),
- VEN = CurrentPacketMIs.end(); (VIN != VEN); ++VIN) {
-
+ for (auto I : CurrentPacketMIs) {
// We only care for dependencies to predicated instructions
- if(!QII->isPredicated(*VIN)) continue;
+ if (!HII->isPredicated(I))
+ continue;
// Scheduling Unit for current insn in the packet
- SUnit *PacketSU = MIToSUnit.find(*VIN)->second;
+ SUnit *PacketSU = MIToSUnit.find(I)->second;
- // Look at dependencies between current members of the packet
- // and predicate defining instruction MI.
- // Make sure that dependency is on the exact register
- // we care about.
+ // Look at dependencies between current members of the packet and
+ // predicate defining instruction MI. Make sure that dependency is
+ // on the exact register we care about.
if (PacketSU->isSucc(PacketSUDep)) {
for (unsigned i = 0; i < PacketSU->Succs.size(); ++i) {
- if ((PacketSU->Succs[i].getSUnit() == PacketSUDep) &&
- (PacketSU->Succs[i].getKind() == SDep::Anti) &&
- (PacketSU->Succs[i].getReg() == DepReg)) {
+ auto &Dep = PacketSU->Succs[i];
+ if (Dep.getSUnit() == PacketSUDep && Dep.getKind() == SDep::Anti &&
+ Dep.getReg() == DepReg)
return true;
- }
}
}
}
@@ -831,276 +821,362 @@ static unsigned getPredicatedRegister(MachineInstr *MI,
const HexagonInstrInfo *QII) {
/// We use the following rule: The first predicate register that is a use is
/// the predicate register of a predicated instruction.
-
assert(QII->isPredicated(MI) && "Must be predicated instruction");
- for (MachineInstr::mop_iterator OI = MI->operands_begin(),
- OE = MI->operands_end(); OI != OE; ++OI) {
- MachineOperand &Op = *OI;
+ for (auto &Op : MI->operands()) {
if (Op.isReg() && Op.getReg() && Op.isUse() &&
Hexagon::PredRegsRegClass.contains(Op.getReg()))
return Op.getReg();
}
llvm_unreachable("Unknown instruction operand layout");
-
return 0;
}
// Given two predicated instructions, this function detects whether
-// the predicates are complements
-bool HexagonPacketizerList::ArePredicatesComplements(
- MachineInstr *MI1, MachineInstr *MI2,
- const std::map<MachineInstr *, SUnit *> &MIToSUnit) {
-
- const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
-
+// the predicates are complements.
+bool HexagonPacketizerList::arePredicatesComplements(MachineInstr *MI1,
+ MachineInstr *MI2) {
// If we don't know the predicate sense of the instructions bail out early, we
// need it later.
- if (getPredicateSense(MI1, QII) == PK_Unknown ||
- getPredicateSense(MI2, QII) == PK_Unknown)
+ if (getPredicateSense(MI1, HII) == PK_Unknown ||
+ getPredicateSense(MI2, HII) == PK_Unknown)
return false;
- // Scheduling unit for candidate
- SUnit *SU = MIToSUnit.find(MI1)->second;
+ // Scheduling unit for candidate.
+ SUnit *SU = MIToSUnit[MI1];
// One corner case deals with the following scenario:
// Trying to add
- // a) %R24<def> = TFR_cPt %P0, %R25
+ // a) %R24<def> = A2_tfrt %P0, %R25
// to this packet:
- //
// {
- // b) %R25<def> = TFR_cNotPt %P0, %R24
- // c) %P0<def> = CMPEQri %R26, 1
+ // b) %R25<def> = A2_tfrf %P0, %R24
+ // c) %P0<def> = C2_cmpeqi %R26, 1
// }
//
- // On general check a) and b) are complements, but
- // presence of c) will convert a) to .new form, and
- // then it is not a complement
- // We attempt to detect it by analyzing existing
- // dependencies in the packet
+ // On general check a) and b) are complements, but presence of c) will
+ // convert a) to .new form, and then it is not a complement.
+ // We attempt to detect it by analyzing existing dependencies in the packet.
// Analyze relationships between all existing members of the packet.
- // Look for Anti dependecy on the same predicate reg
- // as used in the candidate
- for (std::vector<MachineInstr*>::iterator VIN = CurrentPacketMIs.begin(),
- VEN = CurrentPacketMIs.end(); (VIN != VEN); ++VIN) {
-
- // Scheduling Unit for current insn in the packet
- SUnit *PacketSU = MIToSUnit.find(*VIN)->second;
+  // Look for an anti dependency on the same predicate reg as used in the
+  // candidate.
+ for (auto I : CurrentPacketMIs) {
+ // Scheduling Unit for current insn in the packet.
+ SUnit *PacketSU = MIToSUnit.find(I)->second;
// If this instruction in the packet is succeeded by the candidate...
if (PacketSU->isSucc(SU)) {
for (unsigned i = 0; i < PacketSU->Succs.size(); ++i) {
- // The corner case exist when there is true data
- // dependency between candidate and one of current
- // packet members, this dep is on predicate reg, and
- // there already exist anti dep on the same pred in
+ auto Dep = PacketSU->Succs[i];
+        // The corner case exists when there is a true data dependency between
+        // the candidate and one of the current packet members, this dep is on
+        // a predicate reg, and an anti dep already exists on the same pred in
// the packet.
- if (PacketSU->Succs[i].getSUnit() == SU &&
- PacketSU->Succs[i].getKind() == SDep::Data &&
- Hexagon::PredRegsRegClass.contains(
- PacketSU->Succs[i].getReg()) &&
- // Here I know that *VIN is predicate setting instruction
- // with true data dep to candidate on the register
- // we care about - c) in the above example.
- // Now I need to see if there is an anti dependency
- // from c) to any other instruction in the
- // same packet on the pred reg of interest
- RestrictingDepExistInPacket(*VIN,PacketSU->Succs[i].getReg(),
- MIToSUnit)) {
- return false;
+ if (Dep.getSUnit() == SU && Dep.getKind() == SDep::Data &&
+ Hexagon::PredRegsRegClass.contains(Dep.getReg())) {
+          // Here I know that I is a predicate-setting instruction with a true
+          // data dep to the candidate on the register we care about - c) in the
+ // above example. Now I need to see if there is an anti dependency
+ // from c) to any other instruction in the same packet on the pred
+ // reg of interest.
+ if (restrictingDepExistInPacket(I, Dep.getReg()))
+ return false;
}
}
}
}
- // If the above case does not apply, check regular
- // complement condition.
- // Check that the predicate register is the same and
- // that the predicate sense is different
- // We also need to differentiate .old vs. .new:
- // !p0 is not complimentary to p0.new
- unsigned PReg1 = getPredicatedRegister(MI1, QII);
- unsigned PReg2 = getPredicatedRegister(MI2, QII);
- return ((PReg1 == PReg2) &&
- Hexagon::PredRegsRegClass.contains(PReg1) &&
- Hexagon::PredRegsRegClass.contains(PReg2) &&
- (getPredicateSense(MI1, QII) != getPredicateSense(MI2, QII)) &&
- (QII->isDotNewInst(MI1) == QII->isDotNewInst(MI2)));
+ // If the above case does not apply, check regular complement condition.
+ // Check that the predicate register is the same and that the predicate
+  // sense is different. We also need to differentiate .old vs. .new: !p0
+ // is not complementary to p0.new.
+ unsigned PReg1 = getPredicatedRegister(MI1, HII);
+ unsigned PReg2 = getPredicatedRegister(MI2, HII);
+ return PReg1 == PReg2 &&
+ Hexagon::PredRegsRegClass.contains(PReg1) &&
+ Hexagon::PredRegsRegClass.contains(PReg2) &&
+ getPredicateSense(MI1, HII) != getPredicateSense(MI2, HII) &&
+ HII->isDotNewInst(MI1) == HII->isDotNewInst(MI2);
}
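
The final condition of arePredicatesComplements reduces to three comparisons. A small sketch of just that closing check, on a plain struct rather than MachineInstr, is below; it assumes the packet-level corner case above has already been ruled out.

#include <iostream>

struct PredInsn {
  unsigned PredReg;   // predicate register the instruction is guarded by
  bool SenseTrue;     // predicated on p (true) vs. !p (false)
  bool DotNew;        // uses the .new form of the predicate
};

static bool arePredicatesComplements(const PredInsn &A, const PredInsn &B) {
  // Same predicate register, opposite sense, and matching .old/.new form:
  // !p0 is not complementary to p0.new.
  return A.PredReg == B.PredReg &&
         A.SenseTrue != B.SenseTrue &&
         A.DotNew == B.DotNew;
}

int main() {
  PredInsn A{0, true, false};    // if (p0) r24 = r25
  PredInsn B{0, false, false};   // if (!p0) r25 = r24
  std::cout << arePredicatesComplements(A, B) << "\n";  // 1
}
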
-// initPacketizerState - Initialize packetizer flags
+// Initialize packetizer flags.
void HexagonPacketizerList::initPacketizerState() {
-
Dependence = false;
PromotedToDotNew = false;
GlueToNewValueJump = false;
GlueAllocframeStore = false;
FoundSequentialDependence = false;
-
- return;
}
-// ignorePseudoInstruction - Ignore bundling of pseudo instructions.
-bool HexagonPacketizerList::ignorePseudoInstruction(MachineInstr *MI,
- MachineBasicBlock *MBB) {
+// Ignore bundling of pseudo instructions.
+bool HexagonPacketizerList::ignorePseudoInstruction(const MachineInstr *MI,
+ const MachineBasicBlock*) {
if (MI->isDebugValue())
return true;
if (MI->isCFIInstruction())
return false;
- // We must print out inline assembly
+ // We must print out inline assembly.
if (MI->isInlineAsm())
return false;
- // We check if MI has any functional units mapped to it.
- // If it doesn't, we ignore the instruction.
+ if (MI->isImplicitDef())
+ return false;
+
+ // We check if MI has any functional units mapped to it. If it doesn't,
+ // we ignore the instruction.
const MCInstrDesc& TID = MI->getDesc();
- unsigned SchedClass = TID.getSchedClass();
- const InstrStage* IS =
- ResourceTracker->getInstrItins()->beginStage(SchedClass);
+ auto *IS = ResourceTracker->getInstrItins()->beginStage(TID.getSchedClass());
unsigned FuncUnits = IS->getUnits();
return !FuncUnits;
}
-// isSoloInstruction: - Returns true for instructions that must be
-// scheduled in their own packet.
-bool HexagonPacketizerList::isSoloInstruction(MachineInstr *MI) {
+bool HexagonPacketizerList::isSoloInstruction(const MachineInstr *MI) {
if (MI->isEHLabel() || MI->isCFIInstruction())
return true;
- if (MI->isInlineAsm())
+ // Consider inline asm to not be a solo instruction by default.
+ // Inline asm will be put in a packet temporarily, but then it will be
+ // removed, and placed outside of the packet (before or after, depending
+ // on dependencies). This is to reduce the impact of inline asm as a
+ // "packet splitting" instruction.
+ if (MI->isInlineAsm() && !ScheduleInlineAsm)
return true;
// From Hexagon V4 Programmer's Reference Manual 3.4.4 Grouping constraints:
// trap, pause, barrier, icinva, isync, and syncht are solo instructions.
// They must not be grouped with other instructions in a packet.
- if (IsSchedBarrier(MI))
+ if (isSchedBarrier(MI))
+ return true;
+
+ if (HII->isSolo(MI))
+ return true;
+
+ if (MI->getOpcode() == Hexagon::A2_nop)
return true;
return false;
}
-// isLegalToPacketizeTogether:
-// SUI is the current instruction that is out side of the current packet.
-// SUJ is the current instruction inside the current packet against which that
-// SUI will be packetized.
-bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
- MachineInstr *I = SUI->getInstr();
- MachineInstr *J = SUJ->getInstr();
- assert(I && J && "Unable to packetize null instruction!");
- const MCInstrDesc &MCIDI = I->getDesc();
- const MCInstrDesc &MCIDJ = J->getDesc();
+// Quick check if instructions MI and MJ cannot coexist in the same packet.
+// Limit the tests to be "one-way", e.g. "if MI->isBranch and MJ->isInlineAsm",
+// but not the symmetric case: "if MJ->isBranch and MI->isInlineAsm".
+// For full test call this function twice:
+// cannotCoexistAsymm(MI, MJ) || cannotCoexistAsymm(MJ, MI)
+// Doing the test only one way reduces the amount of code in this function,
+// since otherwise every test would need to be repeated with MI and MJ reversed.
+static bool cannotCoexistAsymm(const MachineInstr *MI, const MachineInstr *MJ,
+ const HexagonInstrInfo &HII) {
+ const MachineFunction *MF = MI->getParent()->getParent();
+ if (MF->getSubtarget<HexagonSubtarget>().hasV60TOpsOnly() &&
+ HII.isHVXMemWithAIndirect(MI, MJ))
+ return true;
- MachineBasicBlock::iterator II = I;
+ // An inline asm cannot be together with a branch, because we may not be
+ // able to remove the asm out after packetizing (i.e. if the asm must be
+ // moved past the bundle). Similarly, two asms cannot be together to avoid
+ // complications when determining their relative order outside of a bundle.
+ if (MI->isInlineAsm())
+ return MJ->isInlineAsm() || MJ->isBranch() || MJ->isBarrier() ||
+ MJ->isCall() || MJ->isTerminator();
- const unsigned FrameSize = MF.getFrameInfo()->getStackSize();
- const HexagonRegisterInfo *QRI =
- (const HexagonRegisterInfo *)MF.getSubtarget().getRegisterInfo();
- const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+ // "False" really means that the quick check failed to determine if
+ // I and J cannot coexist.
+ return false;
+}
- // Inline asm cannot go in the packet.
- if (I->getOpcode() == Hexagon::INLINEASM)
- llvm_unreachable("Should not meet inline asm here!");
- if (isSoloInstruction(I))
- llvm_unreachable("Should not meet solo instr here!");
+// Full, symmetric check.
+bool HexagonPacketizerList::cannotCoexist(const MachineInstr *MI,
+ const MachineInstr *MJ) {
+ return cannotCoexistAsymm(MI, MJ, *HII) || cannotCoexistAsymm(MJ, MI, *HII);
+}
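
The split between cannotCoexistAsymm and cannotCoexist is worth a tiny sketch: each pairwise rule is written once, one-way, and the symmetric wrapper calls the asymmetric test with the operands swapped. The Probe struct below is a stand-in for the real instruction queries.

#include <iostream>

struct Probe {
  bool InlineAsm = false;
  bool Branch = false;
};

static bool cannotCoexistAsymm(const Probe &MI, const Probe &MJ) {
  // One-way rule: inline asm cannot share a packet with a branch or another asm.
  if (MI.InlineAsm)
    return MJ.InlineAsm || MJ.Branch;
  // "false" only means this quick check found nothing.
  return false;
}

static bool cannotCoexist(const Probe &MI, const Probe &MJ) {
  return cannotCoexistAsymm(MI, MJ) || cannotCoexistAsymm(MJ, MI);
}

int main() {
  Probe Asm;    Asm.InlineAsm = true;
  Probe Branch; Branch.Branch = true;
  // The argument order no longer matters once the wrapper is used:
  std::cout << cannotCoexist(Asm, Branch) << " "
            << cannotCoexist(Branch, Asm) << "\n";     // 1 1
}
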
- // A save callee-save register function call can only be in a packet
- // with instructions that don't write to the callee-save registers.
- if ((QII->isSaveCalleeSavedRegsCall(I) &&
- DoesModifyCalleeSavedReg(J, QRI)) ||
- (QII->isSaveCalleeSavedRegsCall(J) &&
- DoesModifyCalleeSavedReg(I, QRI))) {
- Dependence = true;
- return false;
+void HexagonPacketizerList::unpacketizeSoloInstrs(MachineFunction &MF) {
+ for (auto &B : MF) {
+ MachineBasicBlock::iterator BundleIt;
+ MachineBasicBlock::instr_iterator NextI;
+ for (auto I = B.instr_begin(), E = B.instr_end(); I != E; I = NextI) {
+ NextI = std::next(I);
+ MachineInstr *MI = &*I;
+ if (MI->isBundle())
+ BundleIt = I;
+ if (!MI->isInsideBundle())
+ continue;
+
+ // Decide on where to insert the instruction that we are pulling out.
+ // Debug instructions always go before the bundle, but the placement of
+ // INLINE_ASM depends on potential dependencies. By default, try to
+ // put it before the bundle, but if the asm writes to a register that
+ // other instructions in the bundle read, then we need to place it
+ // after the bundle (to preserve the bundle semantics).
+ bool InsertBeforeBundle;
+ if (MI->isInlineAsm())
+ InsertBeforeBundle = !hasWriteToReadDep(MI, BundleIt, HRI);
+ else if (MI->isDebugValue())
+ InsertBeforeBundle = true;
+ else
+ continue;
+
+ BundleIt = moveInstrOut(MI, BundleIt, InsertBeforeBundle);
+ }
}
+}
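
The placement rule in unpacketizeSoloInstrs (inline asm goes before the bundle unless it writes a register that a bundled instruction reads) can be sketched without any LLVM types. The MiniInsn struct and the reuse of the hasWriteToReadDep name below are assumptions for illustration, not the real helper's signature.

#include <algorithm>
#include <iostream>
#include <vector>

struct MiniInsn {
  std::vector<int> Defs;
  std::vector<int> Uses;
};

static bool hasWriteToReadDep(const MiniInsn &Solo,
                              const std::vector<MiniInsn> &Bundle) {
  // Does the extracted instruction define a register the bundle reads?
  for (int R : Solo.Defs)
    for (const MiniInsn &B : Bundle)
      if (std::find(B.Uses.begin(), B.Uses.end(), R) != B.Uses.end())
        return true;
  return false;
}

int main() {
  MiniInsn Asm{{1}, {}};            // an inline asm defining r1
  std::vector<MiniInsn> Bundle;
  Bundle.push_back({{2}, {1}});     // a bundled instruction reading r1
  bool InsertBeforeBundle = !hasWriteToReadDep(Asm, Bundle);
  std::cout << (InsertBeforeBundle ? "before" : "after") << "\n";  // after
}
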
- // Two control flow instructions cannot go in the same packet.
- if (IsControlFlow(I) && IsControlFlow(J)) {
- Dependence = true;
- return false;
+// Check if a given instruction is of class "system".
+static bool isSystemInstr(const MachineInstr *MI) {
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ case Hexagon::Y2_barrier:
+ case Hexagon::Y2_dcfetchbo:
+ return true;
}
+ return false;
+}
- // A LoopN instruction cannot appear in the same packet as a jump or call.
- if (IsLoopN(I) &&
- (IsDirectJump(J) || MCIDJ.isCall() || QII->isDeallocRet(J))) {
- Dependence = true;
+bool HexagonPacketizerList::hasDeadDependence(const MachineInstr *I,
+ const MachineInstr *J) {
+ // The dependence graph may not include edges between dead definitions,
+  // so without extra checks, we could end up packetizing two instructions
+ // defining the same (dead) register.
+ if (I->isCall() || J->isCall())
return false;
- }
- if (IsLoopN(J) &&
- (IsDirectJump(I) || MCIDI.isCall() || QII->isDeallocRet(I))) {
- Dependence = true;
+ if (HII->isPredicated(I) || HII->isPredicated(J))
return false;
+
+ BitVector DeadDefs(Hexagon::NUM_TARGET_REGS);
+ for (auto &MO : I->operands()) {
+ if (!MO.isReg() || !MO.isDef() || !MO.isDead())
+ continue;
+ DeadDefs[MO.getReg()] = true;
}
+ for (auto &MO : J->operands()) {
+ if (!MO.isReg() || !MO.isDef() || !MO.isDead())
+ continue;
+ unsigned R = MO.getReg();
+ if (R != Hexagon::USR_OVF && DeadDefs[R])
+ return true;
+ }
+ return false;
+}
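
hasDeadDependence boils down to collecting one instruction's dead definitions in a bit vector and testing the other's against it, exempting the overflow flag. A hedged model using std::bitset and small integers in place of physical registers:

#include <bitset>
#include <iostream>
#include <vector>

constexpr std::size_t NumRegs = 64;
constexpr unsigned ExemptReg = 63;   // stand-in for Hexagon::USR_OVF

static bool hasDeadDependence(const std::vector<unsigned> &DeadDefsI,
                              const std::vector<unsigned> &DeadDefsJ) {
  std::bitset<NumRegs> Dead;
  for (unsigned R : DeadDefsI)
    Dead[R] = true;
  for (unsigned R : DeadDefsJ)
    if (R != ExemptReg && Dead[R])
      return true;                   // both instructions dead-define R
  return false;
}

int main() {
  std::cout << hasDeadDependence({5}, {5}) << "\n";   // 1: same dead def
  std::cout << hasDeadDependence({5}, {6}) << "\n";   // 0: disjoint defs
}
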
+
+bool HexagonPacketizerList::hasControlDependence(const MachineInstr *I,
+ const MachineInstr *J) {
+ // A save callee-save register function call can only be in a packet
+ // with instructions that don't write to the callee-save registers.
+ if ((HII->isSaveCalleeSavedRegsCall(I) &&
+ doesModifyCalleeSavedReg(J, HRI)) ||
+ (HII->isSaveCalleeSavedRegsCall(J) &&
+ doesModifyCalleeSavedReg(I, HRI)))
+ return true;
+
+ // Two control flow instructions cannot go in the same packet.
+ if (isControlFlow(I) && isControlFlow(J))
+ return true;
+
+ // \ref-manual (7.3.4) A loop setup packet in loopN or spNloop0 cannot
+ // contain a speculative indirect jump,
+ // a new-value compare jump or a dealloc_return.
+ auto isBadForLoopN = [this] (const MachineInstr *MI) -> bool {
+ if (MI->isCall() || HII->isDeallocRet(MI) || HII->isNewValueJump(MI))
+ return true;
+ if (HII->isPredicated(MI) && HII->isPredicatedNew(MI) && HII->isJumpR(MI))
+ return true;
+ return false;
+ };
+
+ if (HII->isLoopN(I) && isBadForLoopN(J))
+ return true;
+ if (HII->isLoopN(J) && isBadForLoopN(I))
+ return true;
+
// dealloc_return cannot appear in the same packet as a conditional or
// unconditional jump.
- if (QII->isDeallocRet(I) &&
- (MCIDJ.isBranch() || MCIDJ.isCall() || MCIDJ.isBarrier())) {
- Dependence = true;
- return false;
+ return HII->isDeallocRet(I) &&
+ (J->isBranch() || J->isCall() || J->isBarrier());
+}
+
+bool HexagonPacketizerList::hasV4SpecificDependence(const MachineInstr *I,
+ const MachineInstr *J) {
+ bool SysI = isSystemInstr(I), SysJ = isSystemInstr(J);
+ bool StoreI = I->mayStore(), StoreJ = J->mayStore();
+ if ((SysI && StoreJ) || (SysJ && StoreI))
+ return true;
+
+ if (StoreI && StoreJ) {
+ if (HII->isNewValueInst(J) || HII->isMemOp(J) || HII->isMemOp(I))
+ return true;
+ } else {
+ // A memop cannot be in the same packet with another memop or a store.
+ // Two stores can be together, but here I and J cannot both be stores.
+ bool MopStI = HII->isMemOp(I) || StoreI;
+ bool MopStJ = HII->isMemOp(J) || StoreJ;
+ if (MopStI && MopStJ)
+ return true;
}
+ return (StoreJ && HII->isDeallocRet(I)) || (StoreI && HII->isDeallocRet(J));
+}
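
The slot rules encoded in hasV4SpecificDependence keep the same boolean shape when written over a small per-instruction summary, which may make them easier to scan. The Summary fields below are invented stand-ins for the mayStore/isMemOp/isNewValueInst/isDeallocRet queries used above.

#include <iostream>

struct Summary {
  bool System = false;       // barrier/dcfetch style instruction
  bool Store = false;
  bool MemOp = false;        // load-modify-store memop
  bool NewValue = false;
  bool DeallocRet = false;
};

static bool hasV4SpecificDependence(const Summary &I, const Summary &J) {
  if ((I.System && J.Store) || (J.System && I.Store))
    return true;
  if (I.Store && J.Store) {
    // Dual stores are allowed, but not with a new-value store or a memop.
    if (J.NewValue || J.MemOp || I.MemOp)
      return true;
  } else {
    // A memop cannot share a packet with another memop or a store.
    if ((I.MemOp || I.Store) && (J.MemOp || J.Store))
      return true;
  }
  return (J.Store && I.DeallocRet) || (I.Store && J.DeallocRet);
}

int main() {
  Summary St;   St.Store = true;
  Summary Mop;  Mop.MemOp = true;
  std::cout << hasV4SpecificDependence(St, Mop) << "\n";  // 1: store + memop
}
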
- // V4 allows dual store. But does not allow second store, if the
- // first store is not in SLOT0. New value store, new value jump,
- // dealloc_return and memop always take SLOT0.
- // Arch spec 3.4.4.2
- if (MCIDI.mayStore() && MCIDJ.mayStore() &&
- (QII->isNewValueInst(J) || QII->isMemOp(J) || QII->isMemOp(I))) {
- Dependence = true;
+// SUI is the current instruction that is outside of the current packet.
+// SUJ is the current instruction inside the current packet against which
+// SUI will be packetized.
+bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
+ MachineInstr *I = SUI->getInstr();
+ MachineInstr *J = SUJ->getInstr();
+ assert(I && J && "Unable to packetize null instruction!");
+
+ // Clear IgnoreDepMIs when Packet starts.
+ if (CurrentPacketMIs.size() == 1)
+ IgnoreDepMIs.clear();
+
+ MachineBasicBlock::iterator II = I;
+ const unsigned FrameSize = MF.getFrameInfo()->getStackSize();
+
+ // Solo instructions cannot go in the packet.
+ assert(!isSoloInstruction(I) && "Unexpected solo instr!");
+
+ if (cannotCoexist(I, J))
return false;
- }
- if ((QII->isMemOp(J) && MCIDI.mayStore())
- || (MCIDJ.mayStore() && QII->isMemOp(I))
- || (QII->isMemOp(J) && QII->isMemOp(I))) {
- Dependence = true;
+ Dependence = hasDeadDependence(I, J) || hasControlDependence(I, J);
+ if (Dependence)
return false;
- }
- //if dealloc_return
- if (MCIDJ.mayStore() && QII->isDeallocRet(I)) {
- Dependence = true;
+ // V4 allows dual stores. It does not allow second store, if the first
+ // store is not in SLOT0. New value store, new value jump, dealloc_return
+ // and memop always take SLOT0. Arch spec 3.4.4.2.
+ Dependence = hasV4SpecificDependence(I, J);
+ if (Dependence)
return false;
- }
// If an instruction feeds new value jump, glue it.
MachineBasicBlock::iterator NextMII = I;
++NextMII;
- if (NextMII != I->getParent()->end() && QII->isNewValueJump(NextMII)) {
+ if (NextMII != I->getParent()->end() && HII->isNewValueJump(NextMII)) {
MachineInstr *NextMI = NextMII;
bool secondRegMatch = false;
- bool maintainNewValueJump = false;
+ const MachineOperand &NOp0 = NextMI->getOperand(0);
+ const MachineOperand &NOp1 = NextMI->getOperand(1);
- if (NextMI->getOperand(1).isReg() &&
- I->getOperand(0).getReg() == NextMI->getOperand(1).getReg()) {
+ if (NOp1.isReg() && I->getOperand(0).getReg() == NOp1.getReg())
secondRegMatch = true;
- maintainNewValueJump = true;
- }
-
- if (!secondRegMatch &&
- I->getOperand(0).getReg() == NextMI->getOperand(0).getReg()) {
- maintainNewValueJump = true;
- }
- for (std::vector<MachineInstr*>::iterator
- VI = CurrentPacketMIs.begin(),
- VE = CurrentPacketMIs.end();
- (VI != VE && maintainNewValueJump); ++VI) {
- SUnit *PacketSU = MIToSUnit.find(*VI)->second;
-
- // NVJ can not be part of the dual jump - Arch Spec: section 7.8
- if (PacketSU->getInstr()->getDesc().isCall()) {
+ for (auto I : CurrentPacketMIs) {
+ SUnit *PacketSU = MIToSUnit.find(I)->second;
+ MachineInstr *PI = PacketSU->getInstr();
+ // NVJ can not be part of the dual jump - Arch Spec: section 7.8.
+ if (PI->isCall()) {
Dependence = true;
break;
}
- // Validate
+ // Validate:
// 1. Packet does not have a store in it.
// 2. If the first operand of the nvj is newified, and the second
// operand is also a reg, it (second reg) is not defined in
@@ -1108,302 +1184,413 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
// 3. If the second operand of the nvj is newified, (which means
// first operand is also a reg), first reg is not defined in
// the same packet.
- if (PacketSU->getInstr()->getDesc().mayStore() ||
- PacketSU->getInstr()->getOpcode() == Hexagon::S2_allocframe ||
- // Check #2.
- (!secondRegMatch && NextMI->getOperand(1).isReg() &&
- PacketSU->getInstr()->modifiesRegister(
- NextMI->getOperand(1).getReg(), QRI)) ||
- // Check #3.
- (secondRegMatch &&
- PacketSU->getInstr()->modifiesRegister(
- NextMI->getOperand(0).getReg(), QRI))) {
+ if (PI->getOpcode() == Hexagon::S2_allocframe || PI->mayStore() ||
+ HII->isLoopN(PI)) {
+ Dependence = true;
+ break;
+ }
+ // Check #2/#3.
+ const MachineOperand &OpR = secondRegMatch ? NOp0 : NOp1;
+ if (OpR.isReg() && PI->modifiesRegister(OpR.getReg(), HRI)) {
Dependence = true;
break;
}
}
- if (!Dependence)
- GlueToNewValueJump = true;
- else
+
+ if (Dependence)
return false;
+ GlueToNewValueJump = true;
}
- if (SUJ->isSucc(SUI)) {
- for (unsigned i = 0;
- (i < SUJ->Succs.size()) && !FoundSequentialDependence;
- ++i) {
+  // There is no dependency between a prolog instruction and its successor.
+ if (!SUJ->isSucc(SUI))
+ return true;
- if (SUJ->Succs[i].getSUnit() != SUI) {
- continue;
- }
+ for (unsigned i = 0; i < SUJ->Succs.size(); ++i) {
+ if (FoundSequentialDependence)
+ break;
- SDep::Kind DepType = SUJ->Succs[i].getKind();
+ if (SUJ->Succs[i].getSUnit() != SUI)
+ continue;
- // For direct calls:
- // Ignore register dependences for call instructions for
- // packetization purposes except for those due to r31 and
- // predicate registers.
- //
- // For indirect calls:
- // Same as direct calls + check for true dependences to the register
- // used in the indirect call.
- //
- // We completely ignore Order dependences for call instructions
- //
- // For returns:
- // Ignore register dependences for return instructions like jumpr,
- // dealloc return unless we have dependencies on the explicit uses
- // of the registers used by jumpr (like r31) or dealloc return
- // (like r29 or r30).
- //
- // TODO: Currently, jumpr is handling only return of r31. So, the
- // following logic (specificaly IsCallDependent) is working fine.
- // We need to enable jumpr for register other than r31 and then,
- // we need to rework the last part, where it handles indirect call
- // of that (IsCallDependent) function. Bug 6216 is opened for this.
- //
- unsigned DepReg = 0;
- const TargetRegisterClass* RC = nullptr;
- if (DepType == SDep::Data) {
- DepReg = SUJ->Succs[i].getReg();
- RC = QRI->getMinimalPhysRegClass(DepReg);
- }
- if ((MCIDI.isCall() || MCIDI.isReturn()) &&
- (!IsRegDependence(DepType) ||
- !IsCallDependent(I, DepType, SUJ->Succs[i].getReg()))) {
- /* do nothing */
- }
+ SDep::Kind DepType = SUJ->Succs[i].getKind();
+ // For direct calls:
+ // Ignore register dependences for call instructions for packetization
+ // purposes except for those due to r31 and predicate registers.
+ //
+ // For indirect calls:
+ // Same as direct calls + check for true dependences to the register
+ // used in the indirect call.
+ //
+ // We completely ignore Order dependences for call instructions.
+ //
+ // For returns:
+ // Ignore register dependences for return instructions like jumpr,
+ // dealloc return unless we have dependencies on the explicit uses
+ // of the registers used by jumpr (like r31) or dealloc return
+ // (like r29 or r30).
+ //
+ // TODO: Currently, jumpr is handling only return of r31. So, the
+    // following logic (specifically isCallDependent) is working fine.
+    // We need to enable jumpr for registers other than r31, and then
+ // we need to rework the last part, where it handles indirect call
+ // of that (isCallDependent) function. Bug 6216 is opened for this.
+ unsigned DepReg = 0;
+ const TargetRegisterClass *RC = nullptr;
+ if (DepType == SDep::Data) {
+ DepReg = SUJ->Succs[i].getReg();
+ RC = HRI->getMinimalPhysRegClass(DepReg);
+ }
- // For instructions that can be promoted to dot-new, try to promote.
- else if ((DepType == SDep::Data) &&
- CanPromoteToDotNew(I, SUJ, DepReg, MIToSUnit, II, RC) &&
- PromoteToDotNew(I, DepType, II, RC)) {
- PromotedToDotNew = true;
- /* do nothing */
- }
+ if (I->isCall() || I->isReturn()) {
+ if (!isRegDependence(DepType))
+ continue;
+ if (!isCallDependent(I, DepType, SUJ->Succs[i].getReg()))
+ continue;
+ }
- else if ((DepType == SDep::Data) &&
- (QII->isNewValueJump(I))) {
- /* do nothing */
- }
+ if (DepType == SDep::Data) {
+ if (canPromoteToDotCur(J, SUJ, DepReg, II, RC))
+ if (promoteToDotCur(J, DepType, II, RC))
+ continue;
+ }
- // For predicated instructions, if the predicates are complements
- // then there can be no dependence.
- else if (QII->isPredicated(I) &&
- QII->isPredicated(J) &&
- ArePredicatesComplements(I, J, MIToSUnit)) {
- /* do nothing */
+    // Data dependence is OK if we have load.cur.
+ if (DepType == SDep::Data && HII->isDotCurInst(J)) {
+ if (HII->isV60VectorInstruction(I))
+ continue;
+ }
+ // For instructions that can be promoted to dot-new, try to promote.
+ if (DepType == SDep::Data) {
+ if (canPromoteToDotNew(I, SUJ, DepReg, II, RC)) {
+ if (promoteToDotNew(I, DepType, II, RC)) {
+ PromotedToDotNew = true;
+ continue;
+ }
}
- else if (IsDirectJump(I) &&
- !MCIDJ.isBranch() &&
- !MCIDJ.isCall() &&
- (DepType == SDep::Order)) {
- // Ignore Order dependences between unconditional direct branches
- // and non-control-flow instructions
- /* do nothing */
- }
- else if (MCIDI.isConditionalBranch() && (DepType != SDep::Data) &&
- (DepType != SDep::Output)) {
- // Ignore all dependences for jumps except for true and output
- // dependences
- /* do nothing */
- }
-
- // Ignore output dependences due to superregs. We can
- // write to two different subregisters of R1:0 for instance
- // in the same cycle
- //
+ if (HII->isNewValueJump(I))
+ continue;
+ }
+ // For predicated instructions, if the predicates are complements then
+ // there can be no dependence.
+ if (HII->isPredicated(I) && HII->isPredicated(J) &&
+ arePredicatesComplements(I, J)) {
+ // Not always safe to do this translation.
+ // DAG Builder attempts to reduce dependence edges using transitive
+ // nature of dependencies. Here is an example:
//
- // Let the
- // If neither I nor J defines DepReg, then this is a
- // superfluous output dependence. The dependence must be of the
- // form:
- // R0 = ...
- // R1 = ...
- // and there is an output dependence between the two instructions
- // with
- // DepReg = D0
- // We want to ignore these dependences.
- // Ideally, the dependence constructor should annotate such
- // dependences. We can then avoid this relatively expensive check.
+ // r0 = tfr_pt ... (1)
+ // r0 = tfr_pf ... (2)
+ // r0 = tfr_pt ... (3)
//
- else if (DepType == SDep::Output) {
- // DepReg is the register that's responsible for the dependence.
- unsigned DepReg = SUJ->Succs[i].getReg();
+ // There will be an output dependence between (1)->(2) and (2)->(3).
+ // However, there is no dependence edge between (1)->(3). This results
+ // in all 3 instructions going in the same packet. We ignore the
+ // dependence only once to avoid this situation.
+ auto Itr = std::find(IgnoreDepMIs.begin(), IgnoreDepMIs.end(), J);
+ if (Itr != IgnoreDepMIs.end()) {
+ Dependence = true;
+ return false;
+ }
+ IgnoreDepMIs.push_back(I);
+ continue;
+ }
+
+ // Ignore Order dependences between unconditional direct branches
+ // and non-control-flow instructions.
+ if (isDirectJump(I) && !J->isBranch() && !J->isCall() &&
+ DepType == SDep::Order)
+ continue;
+
+ // Ignore all dependences for jumps except for true and output
+ // dependences.
+ if (I->isConditionalBranch() && DepType != SDep::Data &&
+ DepType != SDep::Output)
+ continue;
+
+ // Ignore output dependences due to superregs. We can write to two
+ // different subregisters of R1:0 for instance in the same cycle.
+
+ // If neither I nor J defines DepReg, then this is a superfluous output
+ // dependence. The dependence must be of the form:
+ // R0 = ...
+ // R1 = ...
+ // and there is an output dependence between the two instructions with
+ // DepReg = D0.
+ // We want to ignore these dependences. Ideally, the dependence
+ // constructor should annotate such dependences. We can then avoid this
+ // relatively expensive check.
+ //
+ if (DepType == SDep::Output) {
+ // DepReg is the register that's responsible for the dependence.
+ unsigned DepReg = SUJ->Succs[i].getReg();
+
+ // Check if I or J actually defines DepReg.
+ if (!I->definesRegister(DepReg) && !J->definesRegister(DepReg))
+ continue;
+ FoundSequentialDependence = true;
+ break;
+ }
- // Check if I and J really defines DepReg.
- if (I->definesRegister(DepReg) ||
- J->definesRegister(DepReg)) {
+ // For Order dependences:
+ // 1. On V4 or later, volatile loads/stores can be packetized together,
+ // unless other rules prevent it.
+ // 2. Store followed by a load is not allowed.
+ // 3. Store followed by a store is only valid on V4 or later.
+ // 4. Load followed by any memory operation is allowed.
+ if (DepType == SDep::Order) {
+ if (!PacketizeVolatiles) {
+ bool OrdRefs = I->hasOrderedMemoryRef() || J->hasOrderedMemoryRef();
+ if (OrdRefs) {
FoundSequentialDependence = true;
break;
}
}
-
- // We ignore Order dependences for
- // 1. Two loads unless they are volatile.
- // 2. Two stores in V4 unless they are volatile.
- else if ((DepType == SDep::Order) &&
- !I->hasOrderedMemoryRef() &&
- !J->hasOrderedMemoryRef()) {
- if (MCIDI.mayStore() && MCIDJ.mayStore()) {
- /* do nothing */
- }
- // store followed by store-- not OK on V2
- // store followed by load -- not OK on all (OK if addresses
- // are not aliased)
- // load followed by store -- OK on all
- // load followed by load -- OK on all
- else if ( !MCIDJ.mayStore()) {
- /* do nothing */
- }
- else {
+ // J is first, I is second.
+ bool LoadJ = J->mayLoad(), StoreJ = J->mayStore();
+ bool LoadI = I->mayLoad(), StoreI = I->mayStore();
+ if (StoreJ) {
+ // Two stores are only allowed on V4+. Load following store is never
+ // allowed.
+ if (LoadI) {
FoundSequentialDependence = true;
break;
}
- }
-
- // For V4, special case ALLOCFRAME. Even though there is dependency
- // between ALLOCFRAME and subsequent store, allow it to be
- // packetized in a same packet. This implies that the store is using
- // caller's SP. Hence, offset needs to be updated accordingly.
- else if (DepType == SDep::Data
- && J->getOpcode() == Hexagon::S2_allocframe
- && (I->getOpcode() == Hexagon::S2_storerd_io
- || I->getOpcode() == Hexagon::S2_storeri_io
- || I->getOpcode() == Hexagon::S2_storerb_io)
- && I->getOperand(0).getReg() == QRI->getStackRegister()
- && QII->isValidOffset(I->getOpcode(),
- I->getOperand(1).getImm() -
- (FrameSize + HEXAGON_LRFP_SIZE)))
- {
- GlueAllocframeStore = true;
- // Since this store is to be glued with allocframe in the same
- // packet, it will use SP of the previous stack frame, i.e
- // caller's SP. Therefore, we need to recalculate offset according
- // to this change.
- I->getOperand(1).setImm(I->getOperand(1).getImm() -
- (FrameSize + HEXAGON_LRFP_SIZE));
- }
-
- //
- // Skip over anti-dependences. Two instructions that are
- // anti-dependent can share a packet
- //
- else if (DepType != SDep::Anti) {
+ } else if (!LoadJ || (!LoadI && !StoreI)) {
+ // If J is neither load nor store, assume a dependency.
+ // If J is a load, but I is neither, also assume a dependency.
FoundSequentialDependence = true;
break;
}
+ // Store followed by store: not OK on V2.
+ // Store followed by load: not OK on all.
+ // Load followed by store: OK on all.
+ // Load followed by load: OK on all.
+ continue;
}
- if (FoundSequentialDependence) {
- Dependence = true;
- return false;
+ // For V4, special case ALLOCFRAME. Even though there is a dependency
+ // between ALLOCFRAME and the subsequent store, allow them to be packetized
+ // in the same packet. This implies that the store is using the caller's
+ // SP. Hence, the offset needs to be updated accordingly.
+ if (DepType == SDep::Data && J->getOpcode() == Hexagon::S2_allocframe) {
+ unsigned Opc = I->getOpcode();
+ switch (Opc) {
+ case Hexagon::S2_storerd_io:
+ case Hexagon::S2_storeri_io:
+ case Hexagon::S2_storerh_io:
+ case Hexagon::S2_storerb_io:
+ if (I->getOperand(0).getReg() == HRI->getStackRegister()) {
+ int64_t Imm = I->getOperand(1).getImm();
+ int64_t NewOff = Imm - (FrameSize + HEXAGON_LRFP_SIZE);
+ if (HII->isValidOffset(Opc, NewOff)) {
+ GlueAllocframeStore = true;
+ // Since this store is to be glued with allocframe in the same
+ // packet, it will use SP of the previous stack frame, i.e.
+ // caller's SP. Therefore, we need to recalculate offset
+ // according to this change.
+ I->getOperand(1).setImm(NewOff);
+ continue;
+ }
+ }
+ default:
+ break;
+ }
+ }
+
+ // Skip over anti-dependences. Two instructions that are anti-dependent
+ // can share a packet.
+ if (DepType != SDep::Anti) {
+ FoundSequentialDependence = true;
+ break;
}
}
+ if (FoundSequentialDependence) {
+ Dependence = true;
+ return false;
+ }
+
return true;
}
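
The load/store ordering rules above reduce to a small decision table. A minimal standalone sketch of that table, with J the instruction already in the packet and I the candidate being added; the helper name and the plain bool parameters are illustrative only, not part of the patch:

static bool mayIgnoreOrderDep(bool LoadJ, bool StoreJ, bool LoadI, bool StoreI) {
  if (StoreJ)
    return !LoadI;           // store->store is OK on V4+, store->load never is
  if (LoadJ)
    return LoadI || StoreI;  // a load followed by any memory operation is OK
  return false;              // J is neither load nor store: keep the dependence
}
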
-// isLegalToPruneDependencies
bool HexagonPacketizerList::isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) {
MachineInstr *I = SUI->getInstr();
- assert(I && SUJ->getInstr() && "Unable to packetize null instruction!");
-
- const unsigned FrameSize = MF.getFrameInfo()->getStackSize();
-
- if (Dependence) {
+ MachineInstr *J = SUJ->getInstr();
+ assert(I && J && "Unable to packetize null instruction!");
- // Check if the instruction was promoted to a dot-new. If so, demote it
- // back into a dot-old.
- if (PromotedToDotNew) {
- DemoteToDotOld(I);
- }
+ if (cannotCoexist(I, J))
+ return false;
- // Check if the instruction (must be a store) was glued with an Allocframe
- // instruction. If so, restore its offset to its original value, i.e. use
- // curent SP instead of caller's SP.
- if (GlueAllocframeStore) {
- I->getOperand(1).setImm(I->getOperand(1).getImm() +
- FrameSize + HEXAGON_LRFP_SIZE);
- }
+ if (!Dependence)
+ return true;
- return false;
+ // Check if the instruction was promoted to a dot-new. If so, demote it
+ // back into a dot-old.
+ if (PromotedToDotNew)
+ demoteToDotOld(I);
+
+ cleanUpDotCur();
+ // Check if the instruction (must be a store) was glued with an allocframe
+ // instruction. If so, restore its offset to its original value, i.e. use
+ // current SP instead of caller's SP.
+ if (GlueAllocframeStore) {
+ unsigned FrameSize = MF.getFrameInfo()->getStackSize();
+ MachineOperand &MOff = I->getOperand(1);
+ MOff.setImm(MOff.getImm() + FrameSize + HEXAGON_LRFP_SIZE);
}
- return true;
+ return false;
}
+
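
The allocframe gluing handled above is just an offset rewrite and its inverse; a minimal sketch, assuming HEXAGON_LRFP_SIZE is the 8-byte LR/FP save area (the helper names are illustrative, not part of the patch):

#include <cstdint>

// Applied when the store is glued into the allocframe packet: it then
// addresses the caller's SP, so the new frame plus LR/FP area is subtracted.
static int64_t glueOffset(int64_t Off, uint64_t FrameSize) {
  return Off - (int64_t)(FrameSize + 8 /*HEXAGON_LRFP_SIZE*/);
}

// Applied when the gluing is undone (see isLegalToPruneDependencies above).
static int64_t unglueOffset(int64_t Off, uint64_t FrameSize) {
  return Off + (int64_t)(FrameSize + 8 /*HEXAGON_LRFP_SIZE*/);
}
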
MachineBasicBlock::iterator
HexagonPacketizerList::addToPacket(MachineInstr *MI) {
+ MachineBasicBlock::iterator MII = MI;
+ MachineBasicBlock *MBB = MI->getParent();
+ if (MI->isImplicitDef()) {
+ unsigned R = MI->getOperand(0).getReg();
+ if (Hexagon::IntRegsRegClass.contains(R)) {
+ MCSuperRegIterator S(R, HRI, false);
+ MI->addOperand(MachineOperand::CreateReg(*S, true, true));
+ }
+ return MII;
+ }
+ assert(ResourceTracker->canReserveResources(MI));
+
+ bool ExtMI = HII->isExtended(MI) || HII->isConstExtended(MI);
+ bool Good = true;
+
+ if (GlueToNewValueJump) {
+ MachineInstr *NvjMI = ++MII;
+ // We need to put both instructions in the same packet: MI and NvjMI.
+ // Either of them can require a constant extender. Try to add both to
+ // the current packet, and if that fails, end the packet and start a
+ // new one.
+ ResourceTracker->reserveResources(MI);
+ if (ExtMI)
+ Good = tryAllocateResourcesForConstExt(true);
+
+ bool ExtNvjMI = HII->isExtended(NvjMI) || HII->isConstExtended(NvjMI);
+ if (Good) {
+ if (ResourceTracker->canReserveResources(NvjMI))
+ ResourceTracker->reserveResources(NvjMI);
+ else
+ Good = false;
+ }
+ if (Good && ExtNvjMI)
+ Good = tryAllocateResourcesForConstExt(true);
- MachineBasicBlock::iterator MII = MI;
- MachineBasicBlock *MBB = MI->getParent();
-
- const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
-
- if (GlueToNewValueJump) {
-
- ++MII;
- MachineInstr *nvjMI = MII;
+ if (!Good) {
+ endPacket(MBB, MI);
assert(ResourceTracker->canReserveResources(MI));
ResourceTracker->reserveResources(MI);
- if ((QII->isExtended(MI) || QII->isConstExtended(MI)) &&
- !tryAllocateResourcesForConstExt(MI)) {
- endPacket(MBB, MI);
- ResourceTracker->reserveResources(MI);
- assert(canReserveResourcesForConstExt(MI) &&
- "Ensure that there is a slot");
- reserveResourcesForConstExt(MI);
- // Reserve resources for new value jump constant extender.
- assert(canReserveResourcesForConstExt(MI) &&
- "Ensure that there is a slot");
- reserveResourcesForConstExt(nvjMI);
- assert(ResourceTracker->canReserveResources(nvjMI) &&
- "Ensure that there is a slot");
-
- } else if ( // Extended instruction takes two slots in the packet.
- // Try reserve and allocate 4-byte in the current packet first.
- (QII->isExtended(nvjMI)
- && (!tryAllocateResourcesForConstExt(nvjMI)
- || !ResourceTracker->canReserveResources(nvjMI)))
- || // For non-extended instruction, no need to allocate extra 4 bytes.
- (!QII->isExtended(nvjMI) &&
- !ResourceTracker->canReserveResources(nvjMI)))
- {
- endPacket(MBB, MI);
- // A new and empty packet starts.
- // We are sure that the resources requirements can be satisfied.
- // Therefore, do not need to call "canReserveResources" anymore.
- ResourceTracker->reserveResources(MI);
- if (QII->isExtended(nvjMI))
- reserveResourcesForConstExt(nvjMI);
+ if (ExtMI) {
+ assert(canReserveResourcesForConstExt());
+ tryAllocateResourcesForConstExt(true);
}
- // Here, we are sure that "reserveResources" would succeed.
- ResourceTracker->reserveResources(nvjMI);
- CurrentPacketMIs.push_back(MI);
- CurrentPacketMIs.push_back(nvjMI);
- } else {
- if ( (QII->isExtended(MI) || QII->isConstExtended(MI))
- && ( !tryAllocateResourcesForConstExt(MI)
- || !ResourceTracker->canReserveResources(MI)))
- {
- endPacket(MBB, MI);
- // Check if the instruction was promoted to a dot-new. If so, demote it
- // back into a dot-old
- if (PromotedToDotNew) {
- DemoteToDotOld(MI);
- }
- reserveResourcesForConstExt(MI);
+ assert(ResourceTracker->canReserveResources(NvjMI));
+ ResourceTracker->reserveResources(NvjMI);
+ if (ExtNvjMI) {
+ assert(canReserveResourcesForConstExt());
+ reserveResourcesForConstExt();
}
- // In case that "MI" is not an extended insn,
- // the resource availability has already been checked.
- ResourceTracker->reserveResources(MI);
- CurrentPacketMIs.push_back(MI);
}
+ CurrentPacketMIs.push_back(MI);
+ CurrentPacketMIs.push_back(NvjMI);
return MII;
+ }
+
+ ResourceTracker->reserveResources(MI);
+ if (ExtMI && !tryAllocateResourcesForConstExt(true)) {
+ endPacket(MBB, MI);
+ if (PromotedToDotNew)
+ demoteToDotOld(MI);
+ ResourceTracker->reserveResources(MI);
+ reserveResourcesForConstExt();
+ }
+
+ CurrentPacketMIs.push_back(MI);
+ return MII;
+}
+
+void HexagonPacketizerList::endPacket(MachineBasicBlock *MBB,
+ MachineInstr *MI) {
+ OldPacketMIs = CurrentPacketMIs;
+ VLIWPacketizerList::endPacket(MBB, MI);
+}
+
+bool HexagonPacketizerList::shouldAddToPacket(const MachineInstr *MI) {
+ return !producesStall(MI);
+}
+
+
+// Return true when ConsMI uses a register defined by ProdMI.
+static bool isDependent(const MachineInstr *ProdMI,
+ const MachineInstr *ConsMI) {
+ if (!ProdMI->getOperand(0).isReg())
+ return false;
+ unsigned DstReg = ProdMI->getOperand(0).getReg();
+
+ for (auto &Op : ConsMI->operands())
+ if (Op.isReg() && Op.isUse() && Op.getReg() == DstReg)
+ // The MIs depend on each other.
+ return true;
+
+ return false;
}
+// V60 forward scheduling.
+bool HexagonPacketizerList::producesStall(const MachineInstr *I) {
+ // Check whether the previous packet is in a different loop. If this is the
+ // case, there is little point in trying to avoid a stall because that would
+ // favor the rare case (loop entry) over the common case (loop iteration).
+ //
+ // TODO: We should really be able to check all the incoming edges if this is
+ // the first packet in a basic block, so we can avoid stalls from the loop
+ // backedge.
+ if (!OldPacketMIs.empty()) {
+ auto *OldBB = OldPacketMIs.front()->getParent();
+ auto *ThisBB = I->getParent();
+ if (MLI->getLoopFor(OldBB) != MLI->getLoopFor(ThisBB))
+ return false;
+ }
+
+ // Check for stall between two vector instructions.
+ if (HII->isV60VectorInstruction(I)) {
+ for (auto J : OldPacketMIs) {
+ if (!HII->isV60VectorInstruction(J))
+ continue;
+ if (isDependent(J, I) && !HII->isVecUsableNextPacket(J, I))
+ return true;
+ }
+ return false;
+ }
+
+ // Check for stall between two scalar instructions. First, check that
+ // there is no definition of a use in the current packet, because it
+ // may be a candidate for .new.
+ for (auto J : CurrentPacketMIs)
+ if (!HII->isV60VectorInstruction(J) && isDependent(J, I))
+ return false;
+
+ // Check for stall between I and instructions in the previous packet.
+ if (MF.getSubtarget<HexagonSubtarget>().useBSBScheduling()) {
+ for (auto J : OldPacketMIs) {
+ if (HII->isV60VectorInstruction(J))
+ continue;
+ if (!HII->isLateInstrFeedsEarlyInstr(J, I))
+ continue;
+ if (isDependent(J, I) && !HII->canExecuteInBundle(J, I))
+ return true;
+ }
+ }
+
+ return false;
+}
+
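
A condensed restatement of the stall heuristic above, folding the vector and scalar paths into plain booleans; this is a simplification for illustration only, not the packetizer's API:

static bool producesStallSketch(bool SameLoopAsPrevPacket, bool FedByPrevPacket,
                                bool UsableBackToBack, bool DefinedInCurrentPacket) {
  if (!SameLoopAsPrevPacket)    // crossing a loop boundary: don't chase stalls
    return false;
  if (DefinedInCurrentPacket)   // the use may still become a .new use
    return false;
  return FedByPrevPacket && !UsableBackToBack;
}
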
+
//===----------------------------------------------------------------------===//
// Public Constructor Functions
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/lib/Target/Hexagon/HexagonVLIWPacketizer.h
new file mode 100644
index 000000000000..960cf6ca5bbc
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.h
@@ -0,0 +1,114 @@
+#ifndef HEXAGONVLIWPACKETIZER_H
+#define HEXAGONVLIWPACKETIZER_H
+
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+
+namespace llvm {
+class HexagonPacketizerList : public VLIWPacketizerList {
+ // Vector of instructions assigned to the packet that has just been created.
+ std::vector<MachineInstr*> OldPacketMIs;
+
+ // Has the instruction been promoted to a dot-new instruction.
+ bool PromotedToDotNew;
+
+ // Has the instruction been glued to allocframe.
+ bool GlueAllocframeStore;
+
+ // Has the feeder instruction been glued to new value jump.
+ bool GlueToNewValueJump;
+
+ // Check if there is a dependence between some instruction already in this
+ // packet and this instruction.
+ bool Dependence;
+
+ // Only check for dependence if there are resources available to
+ // schedule this instruction.
+ bool FoundSequentialDependence;
+
+ // Track MIs with ignored dependence.
+ std::vector<MachineInstr*> IgnoreDepMIs;
+
+protected:
+ /// \brief A handle to the branch probability pass.
+ const MachineBranchProbabilityInfo *MBPI;
+ const MachineLoopInfo *MLI;
+
+private:
+ const HexagonInstrInfo *HII;
+ const HexagonRegisterInfo *HRI;
+
+public:
+ // Ctor.
+ HexagonPacketizerList(MachineFunction &MF, MachineLoopInfo &MLI,
+ AliasAnalysis *AA,
+ const MachineBranchProbabilityInfo *MBPI);
+
+ // initPacketizerState - initialize some internal flags.
+ void initPacketizerState() override;
+
+ // ignorePseudoInstruction - Ignore bundling of pseudo instructions.
+ bool ignorePseudoInstruction(const MachineInstr *MI,
+ const MachineBasicBlock *MBB) override;
+
+ // isSoloInstruction - return true if instruction MI cannot be packetized
+ // with any other instruction, which means that MI itself forms a packet.
+ bool isSoloInstruction(const MachineInstr *MI) override;
+
+ // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ
+ // together.
+ bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override;
+
+ // isLegalToPruneDependencies - Is it legal to prune the dependence between
+ // SUI and SUJ?
+ bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override;
+
+ MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override;
+ void endPacket(MachineBasicBlock *MBB, MachineInstr *MI) override;
+ bool shouldAddToPacket(const MachineInstr *MI) override;
+
+ void unpacketizeSoloInstrs(MachineFunction &MF);
+
+protected:
+ bool isCallDependent(const MachineInstr* MI, SDep::Kind DepType,
+ unsigned DepReg);
+ bool promoteToDotCur(MachineInstr* MI, SDep::Kind DepType,
+ MachineBasicBlock::iterator &MII,
+ const TargetRegisterClass* RC);
+ bool canPromoteToDotCur(const MachineInstr* MI, const SUnit* PacketSU,
+ unsigned DepReg, MachineBasicBlock::iterator &MII,
+ const TargetRegisterClass* RC);
+ void cleanUpDotCur();
+
+ bool promoteToDotNew(MachineInstr* MI, SDep::Kind DepType,
+ MachineBasicBlock::iterator &MII,
+ const TargetRegisterClass* RC);
+ bool canPromoteToDotNew(const MachineInstr* MI, const SUnit* PacketSU,
+ unsigned DepReg, MachineBasicBlock::iterator &MII,
+ const TargetRegisterClass* RC);
+ bool canPromoteToNewValue(const MachineInstr* MI, const SUnit* PacketSU,
+ unsigned DepReg, MachineBasicBlock::iterator &MII);
+ bool canPromoteToNewValueStore(const MachineInstr* MI,
+ const MachineInstr* PacketMI, unsigned DepReg);
+ bool demoteToDotOld(MachineInstr* MI);
+ bool arePredicatesComplements(MachineInstr* MI1, MachineInstr* MI2);
+ bool restrictingDepExistInPacket(MachineInstr*, unsigned);
+ bool isNewifiable(const MachineInstr *MI);
+ bool isCurifiable(MachineInstr* MI);
+ bool cannotCoexist(const MachineInstr *MI, const MachineInstr *MJ);
+ inline bool isPromotedToDotNew() const {
+ return PromotedToDotNew;
+ }
+ bool tryAllocateResourcesForConstExt(bool Reserve);
+ bool canReserveResourcesForConstExt();
+ void reserveResourcesForConstExt();
+ bool hasDeadDependence(const MachineInstr *I, const MachineInstr *J);
+ bool hasControlDependence(const MachineInstr *I, const MachineInstr *J);
+ bool hasV4SpecificDependence(const MachineInstr *I, const MachineInstr *J);
+ bool producesStall(const MachineInstr *MI);
+};
+} // namespace llvm
+#endif // HEXAGONVLIWPACKETIZER_H
+
diff --git a/lib/Target/Hexagon/LLVMBuild.txt b/lib/Target/Hexagon/LLVMBuild.txt
index 9d288af0214a..4088cedafd84 100644
--- a/lib/Target/Hexagon/LLVMBuild.txt
+++ b/lib/Target/Hexagon/LLVMBuild.txt
@@ -16,7 +16,7 @@
;===------------------------------------------------------------------------===;
[common]
-subdirectories = Disassembler MCTargetDesc TargetInfo
+subdirectories = AsmParser Disassembler MCTargetDesc TargetInfo
[component_0]
type = TargetGroup
@@ -33,6 +33,7 @@ required_libraries =
AsmPrinter
CodeGen
Core
+ HexagonAsmParser
HexagonDesc
HexagonInfo
MC
diff --git a/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt b/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt
index 5403b106cbbe..2c5d0dab2848 100644
--- a/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt
@@ -3,10 +3,12 @@ add_llvm_library(LLVMHexagonDesc
HexagonELFObjectWriter.cpp
HexagonInstPrinter.cpp
HexagonMCAsmInfo.cpp
+ HexagonMCChecker.cpp
HexagonMCCodeEmitter.cpp
HexagonMCCompound.cpp
HexagonMCDuplexInfo.cpp
HexagonMCELFStreamer.cpp
+ HexagonMCExpr.cpp
HexagonMCInstrInfo.cpp
HexagonMCShuffler.cpp
HexagonMCTargetDesc.cpp
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index 99ea2fabf867..b73af8249cb5 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -13,7 +13,9 @@
#include "MCTargetDesc/HexagonBaseInfo.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -33,14 +35,28 @@ class HexagonAsmBackend : public MCAsmBackend {
mutable uint64_t relaxedCnt;
std::unique_ptr <MCInstrInfo> MCII;
std::unique_ptr <MCInst *> RelaxTarget;
+ MCInst * Extender;
public:
HexagonAsmBackend(Target const &T, uint8_t OSABI, StringRef CPU) :
- OSABI(OSABI), MCII (T.createMCInstrInfo()), RelaxTarget(new MCInst *){}
+ OSABI(OSABI), MCII (T.createMCInstrInfo()), RelaxTarget(new MCInst *),
+ Extender(nullptr) {}
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
return createHexagonELFObjectWriter(OS, OSABI, CPU);
}
+ void setExtender(MCContext &Context) const {
+ if (Extender == nullptr)
+ const_cast<HexagonAsmBackend *>(this)->Extender = new (Context) MCInst;
+ }
+
+ MCInst *takeExtender() const {
+ assert(Extender != nullptr);
+ MCInst * Result = Extender;
+ const_cast<HexagonAsmBackend *>(this)->Extender = nullptr;
+ return Result;
+ }
+
unsigned getNumFixupKinds() const override {
return Hexagon::NumTargetFixupKinds;
}
@@ -222,6 +238,7 @@ public:
if (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_SIZE) {
++relaxedCnt;
*RelaxTarget = &MCI;
+ setExtender(Layout.getAssembler().getContext());
return true;
} else {
return false;
@@ -262,6 +279,7 @@ public:
if (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_SIZE) {
++relaxedCnt;
*RelaxTarget = &MCI;
+ setExtender(Layout.getAssembler().getContext());
return true;
}
}
@@ -276,9 +294,35 @@ public:
llvm_unreachable("Handled by fixupNeedsRelaxationAdvanced");
}
- void relaxInstruction(MCInst const & /*Inst*/,
- MCInst & /*Res*/) const override {
- llvm_unreachable("relaxInstruction() unimplemented");
+ void relaxInstruction(MCInst const & Inst,
+ MCInst & Res) const override {
+ assert(HexagonMCInstrInfo::isBundle(Inst) &&
+ "Hexagon relaxInstruction only works on bundles");
+
+ Res = HexagonMCInstrInfo::createBundle();
+ // Copy the results into the bundle.
+ bool Update = false;
+ for (auto &I : HexagonMCInstrInfo::bundleInstructions(Inst)) {
+ MCInst &CrntHMI = const_cast<MCInst &>(*I.getInst());
+
+ // If an immediate extender is needed, add it in.
+ if (*RelaxTarget == &CrntHMI) {
+ Update = true;
+ assert((HexagonMCInstrInfo::bundleSize(Res) < HEXAGON_PACKET_SIZE) &&
+ "No room to insert extender for relaxation");
+
+ MCInst *HMIx = takeExtender();
+ *HMIx = HexagonMCInstrInfo::deriveExtender(
+ *MCII, CrntHMI,
+ HexagonMCInstrInfo::getExtendableOperand(*MCII, CrntHMI));
+ Res.addOperand(MCOperand::createInst(HMIx));
+ *RelaxTarget = nullptr;
+ }
+ // Now copy over the original instruction (the one we may have extended).
+ Res.addOperand(MCOperand::createInst(I.getInst()));
+ }
+ (void)Update;
+ assert(Update && "Didn't find relaxation target");
}
bool writeNopData(uint64_t Count,
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index f4d162ccf6a8..47a6f8636276 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -44,6 +44,25 @@ namespace HexagonII {
TypeMEMOP = 9,
TypeNV = 10,
TypeDUPLEX = 11,
+ TypeCOMPOUND = 12,
+ TypeCVI_FIRST = 13,
+ TypeCVI_VA = TypeCVI_FIRST,
+ TypeCVI_VA_DV = 14,
+ TypeCVI_VX = 15,
+ TypeCVI_VX_DV = 16,
+ TypeCVI_VP = 17,
+ TypeCVI_VP_VS = 18,
+ TypeCVI_VS = 19,
+ TypeCVI_VINLANESAT = 20,
+ TypeCVI_VM_LD = 21,
+ TypeCVI_VM_TMP_LD = 22,
+ TypeCVI_VM_CUR_LD = 23,
+ TypeCVI_VM_VP_LDU = 24,
+ TypeCVI_VM_ST = 25,
+ TypeCVI_VM_NEW_ST = 26,
+ TypeCVI_VM_STU = 27,
+ TypeCVI_HIST = 28,
+ TypeCVI_LAST = TypeCVI_HIST,
TypePREFIX = 30, // Such as extenders.
TypeENDLOOP = 31 // Such as end of a HW loop.
};
@@ -71,12 +90,16 @@ namespace HexagonII {
PostInc = 6 // Post increment addressing mode
};
+ // MemAccessSize is represented as 1+log2(N) where N is the access size in bytes.
enum class MemAccessSize {
NoMemAccess = 0, // Not a memory access instruction.
ByteAccess = 1, // Byte access instruction (memb).
HalfWordAccess = 2, // Half word access instruction (memh).
WordAccess = 3, // Word access instruction (memw).
- DoubleWordAccess = 4 // Double word access instruction (memd)
+ DoubleWordAccess = 4, // Double word access instruction (memd)
+ // 5, // We do not have a 16 byte vector access.
+ Vector64Access = 7, // 64 Byte vector access instruction (vmem).
+ Vector128Access = 8 // 128 Byte vector access instruction (vmem).
};
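
As a worked example of the 1+log2(N) encoding with N in bytes: 1 byte -> ByteAccess (1), 4 -> WordAccess (3), 64 -> Vector64Access (7), 128 -> Vector128Access (8). A self-contained sketch of the mapping; the helper name is illustrative, not part of the header:

static unsigned encodeMemAccessSize(unsigned Bytes) {
  unsigned Enc = 1;           // 1 + log2(Bytes)
  while (Bytes >>= 1)
    ++Enc;
  return Enc;                 // 1 -> 1, 2 -> 2, 4 -> 3, 8 -> 4, 64 -> 7, 128 -> 8
}
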
// MCInstrDesc TSFlags
@@ -156,7 +179,7 @@ namespace HexagonII {
AddrModeMask = 0x7,
// Access size for load/store instructions.
MemAccessSizePos = 43,
- MemAccesSizeMask = 0x7,
+ MemAccesSizeMask = 0xf,
// Branch predicted taken.
TakenPos = 47,
@@ -164,7 +187,23 @@ namespace HexagonII {
// Floating-point instructions.
FPPos = 48,
- FPMask = 0x1
+ FPMask = 0x1,
+
+ // New-Value producer-2 instructions.
+ hasNewValuePos2 = 50,
+ hasNewValueMask2 = 0x1,
+
+ // Which operand consumes or produces a new value.
+ NewValueOpPos2 = 51,
+ NewValueOpMask2 = 0x7,
+
+ // Accumulator instructions.
+ AccumulatorPos = 54,
+ AccumulatorMask = 0x1,
+
+ // Complex XU instructions; prevent XU competition by preferring slot 3.
+ PrefersSlot3Pos = 55,
+ PrefersSlot3Mask = 0x1,
};
// *** The code above must match HexagonInstrFormat*.td *** //
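
Each field above is read by shifting TSFlags right by its *Pos value and masking with its *Mask value, as the existing accessors in HexagonInstrInfo do; a minimal sketch, assuming the HexagonII enum above is in scope (the helper name is illustrative):

#include <cstdint>

static unsigned getPrefersSlot3(uint64_t TSFlags) {
  return (TSFlags >> HexagonII::PrefersSlot3Pos) & HexagonII::PrefersSlot3Mask;
}
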
@@ -219,6 +258,26 @@ namespace HexagonII {
INST_PARSE_EXTENDER = 0x00000000
};
+ enum InstIClassBits : unsigned {
+ INST_ICLASS_MASK = 0xf0000000,
+ INST_ICLASS_EXTENDER = 0x00000000,
+ INST_ICLASS_J_1 = 0x10000000,
+ INST_ICLASS_J_2 = 0x20000000,
+ INST_ICLASS_LD_ST_1 = 0x30000000,
+ INST_ICLASS_LD_ST_2 = 0x40000000,
+ INST_ICLASS_J_3 = 0x50000000,
+ INST_ICLASS_CR = 0x60000000,
+ INST_ICLASS_ALU32_1 = 0x70000000,
+ INST_ICLASS_XTYPE_1 = 0x80000000,
+ INST_ICLASS_LD = 0x90000000,
+ INST_ICLASS_ST = 0xa0000000,
+ INST_ICLASS_ALU32_2 = 0xb0000000,
+ INST_ICLASS_XTYPE_2 = 0xc0000000,
+ INST_ICLASS_XTYPE_3 = 0xd0000000,
+ INST_ICLASS_XTYPE_4 = 0xe0000000,
+ INST_ICLASS_ALU32_3 = 0xf0000000
+ };
+
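
The iclass occupies the top four bits of the instruction word; a minimal sketch of how these constants are meant to be used (the helper name is illustrative, and EXTENDER is only an example class):

static bool hasIClass(uint32_t Insn, unsigned IClass) {
  return (Insn & HexagonII::INST_ICLASS_MASK) == IClass;
}
// e.g. hasIClass(Word, HexagonII::INST_ICLASS_EXTENDER) tests for an immext word.
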
} // End namespace HexagonII.
} // End namespace llvm.
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
index 36f81465eef6..06ccec532211 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
@@ -12,13 +12,13 @@
//===----------------------------------------------------------------------===//
#include "HexagonAsmPrinter.h"
-#include "Hexagon.h"
#include "HexagonInstPrinter.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
-#include "llvm/ADT/StringExtras.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -28,104 +28,33 @@ using namespace llvm;
#define GET_INSTRUCTION_NAME
#include "HexagonGenAsmWriter.inc"
-HexagonAsmInstPrinter::HexagonAsmInstPrinter(MCInstPrinter *RawPrinter)
- : MCInstPrinter(*RawPrinter), RawPrinter(RawPrinter) {}
-
-void HexagonAsmInstPrinter::printInst(MCInst const *MI, raw_ostream &O,
- StringRef Annot,
- MCSubtargetInfo const &STI) {
- assert(HexagonMCInstrInfo::isBundle(*MI));
- assert(HexagonMCInstrInfo::bundleSize(*MI) <= HEXAGON_PACKET_SIZE);
- std::string Buffer;
- {
- raw_string_ostream TempStream(Buffer);
- RawPrinter->printInst(MI, TempStream, "", STI);
- }
- StringRef Contents(Buffer);
- auto PacketBundle = Contents.rsplit('\n');
- auto HeadTail = PacketBundle.first.split('\n');
- auto Preamble = "\t{\n\t\t";
- auto Separator = "";
- while(!HeadTail.first.empty()) {
- O << Separator;
- StringRef Inst;
- auto Duplex = HeadTail.first.split('\v');
- if(!Duplex.second.empty()){
- O << Duplex.first << "\n";
- Inst = Duplex.second;
- }
- else
- Inst = Duplex.first;
- O << Preamble;
- O << Inst;
- HeadTail = HeadTail.second.split('\n');
- Preamble = "";
- Separator = "\n\t\t";
- }
- O << "\n\t}" << PacketBundle.second;
-}
-
-void HexagonAsmInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const {
- RawPrinter->printRegName(O, RegNo);
-}
-
-// Return the minimum value that a constant extendable operand can have
-// without being extended.
-static int getMinValue(uint64_t TSFlags) {
- unsigned isSigned =
- (TSFlags >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
- unsigned bits =
- (TSFlags >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask;
-
- if (isSigned)
- return -1U << (bits - 1);
-
- return 0;
-}
-
-// Return the maximum value that a constant extendable operand can have
-// without being extended.
-static int getMaxValue(uint64_t TSFlags) {
- unsigned isSigned =
- (TSFlags >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
- unsigned bits =
- (TSFlags >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask;
-
- if (isSigned)
- return ~(-1U << (bits - 1));
-
- return ~(-1U << bits);
-}
-
-// Return true if the instruction must be extended.
-static bool isExtended(uint64_t TSFlags) {
- return (TSFlags >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask;
-}
-
-// Currently just used in an assert statement
-static bool isExtendable(uint64_t TSFlags) LLVM_ATTRIBUTE_UNUSED;
-// Return true if the instruction may be extended based on the operand value.
-static bool isExtendable(uint64_t TSFlags) {
- return (TSFlags >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask;
+HexagonInstPrinter::HexagonInstPrinter(MCAsmInfo const &MAI,
+ MCInstrInfo const &MII,
+ MCRegisterInfo const &MRI)
+ : MCInstPrinter(MAI, MII, MRI), MII(MII), HasExtender(false) {
}
StringRef HexagonInstPrinter::getOpcodeName(unsigned Opcode) const {
return MII.getName(Opcode);
}
-void HexagonInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
- OS << getRegisterName(RegNo);
+void HexagonInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const {
+ O << getRegName(RegNo);
+}
+
+StringRef HexagonInstPrinter::getRegName(unsigned RegNo) const {
+ return getRegisterName(RegNo);
}
void HexagonInstPrinter::setExtender(MCInst const &MCI) {
HasExtender = HexagonMCInstrInfo::isImmext(MCI);
}
-void HexagonInstPrinter::printInst(MCInst const *MI, raw_ostream &OS,
- StringRef Annot,
- MCSubtargetInfo const &STI) {
+void HexagonInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+ StringRef Annot, const MCSubtargetInfo &STI) {
assert(HexagonMCInstrInfo::isBundle(*MI));
assert(HexagonMCInstrInfo::bundleSize(*MI) <= HEXAGON_PACKET_SIZE);
+ assert(HexagonMCInstrInfo::bundleSize(*MI) > 0);
HasExtender = false;
for (auto const &I : HexagonMCInstrInfo::bundleInstructions(*MI)) {
MCInst const &MCI = *I.getInst();
@@ -157,145 +86,148 @@ void HexagonInstPrinter::printInst(MCInst const *MI, raw_ostream &OS,
}
}
-void HexagonInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+void HexagonInstPrinter::printOperand(MCInst const *MI, unsigned OpNo,
raw_ostream &O) const {
- const MCOperand& MO = MI->getOperand(OpNo);
-
+ if (HexagonMCInstrInfo::getExtendableOp(MII, *MI) == OpNo &&
+ (HasExtender || HexagonMCInstrInfo::isConstExtended(MII, *MI)))
+ O << "#";
+ MCOperand const &MO = MI->getOperand(OpNo);
if (MO.isReg()) {
- printRegName(O, MO.getReg());
- } else if(MO.isExpr()) {
- MO.getExpr()->print(O, &MAI);
- } else if(MO.isImm()) {
- printImmOperand(MI, OpNo, O);
- } else {
- llvm_unreachable("Unknown operand");
- }
-}
-
-void HexagonInstPrinter::printImmOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) const {
- const MCOperand& MO = MI->getOperand(OpNo);
-
- if(MO.isExpr()) {
- MO.getExpr()->print(O, &MAI);
- } else if(MO.isImm()) {
- O << MI->getOperand(OpNo).getImm();
+ O << getRegisterName(MO.getReg());
+ } else if (MO.isExpr()) {
+ int64_t Value;
+ if (MO.getExpr()->evaluateAsAbsolute(Value))
+ O << formatImm(Value);
+ else
+ O << *MO.getExpr();
} else {
llvm_unreachable("Unknown operand");
}
}
-void HexagonInstPrinter::printExtOperand(const MCInst *MI, unsigned OpNo,
+void HexagonInstPrinter::printExtOperand(MCInst const *MI, unsigned OpNo,
raw_ostream &O) const {
- const MCOperand &MO = MI->getOperand(OpNo);
- const MCInstrDesc &MII = getMII().get(MI->getOpcode());
-
- assert((isExtendable(MII.TSFlags) || isExtended(MII.TSFlags)) &&
- "Expecting an extendable operand");
-
- if (MO.isExpr() || isExtended(MII.TSFlags)) {
- O << "#";
- } else if (MO.isImm()) {
- int ImmValue = MO.getImm();
- if (ImmValue < getMinValue(MII.TSFlags) ||
- ImmValue > getMaxValue(MII.TSFlags))
- O << "#";
- }
printOperand(MI, OpNo, O);
}
-void HexagonInstPrinter::printUnsignedImmOperand(const MCInst *MI,
- unsigned OpNo, raw_ostream &O) const {
+void HexagonInstPrinter::printUnsignedImmOperand(MCInst const *MI,
+ unsigned OpNo,
+ raw_ostream &O) const {
O << MI->getOperand(OpNo).getImm();
}
-void HexagonInstPrinter::printNegImmOperand(const MCInst *MI, unsigned OpNo,
+void HexagonInstPrinter::printNegImmOperand(MCInst const *MI, unsigned OpNo,
raw_ostream &O) const {
O << -MI->getOperand(OpNo).getImm();
}
-void HexagonInstPrinter::printNOneImmOperand(const MCInst *MI, unsigned OpNo,
+void HexagonInstPrinter::printNOneImmOperand(MCInst const *MI, unsigned OpNo,
raw_ostream &O) const {
O << -1;
}
-void HexagonInstPrinter::printMEMriOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) const {
- const MCOperand& MO0 = MI->getOperand(OpNo);
- const MCOperand& MO1 = MI->getOperand(OpNo + 1);
+void HexagonInstPrinter::prints3_6ImmOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const {
+ int64_t Imm;
+ bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm);
+ Imm = SignExtend64<9>(Imm);
+ assert(Success); (void)Success;
+ assert(((Imm & 0x3f) == 0) && "Lower 6 bits must be ZERO.");
+ O << formatImm(Imm/64);
+}
- printRegName(O, MO0.getReg());
- O << " + #" << MO1.getImm();
+void HexagonInstPrinter::prints3_7ImmOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const {
+ int64_t Imm;
+ bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm);
+ Imm = SignExtend64<10>(Imm);
+ assert(Success); (void)Success;
+ assert(((Imm & 0x7f) == 0) && "Lower 7 bits must be ZERO.");
+ O << formatImm(Imm/128);
}
-void HexagonInstPrinter::printFrameIndexOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) const {
- const MCOperand& MO0 = MI->getOperand(OpNo);
- const MCOperand& MO1 = MI->getOperand(OpNo + 1);
+void HexagonInstPrinter::prints4_6ImmOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const {
+ int64_t Imm;
+ bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm);
+ Imm = SignExtend64<10>(Imm);
+ assert(Success); (void)Success;
+ assert(((Imm & 0x3f) == 0) && "Lower 6 bits must be ZERO.");
+ O << formatImm(Imm/64);
+}
- printRegName(O, MO0.getReg());
- O << ", #" << MO1.getImm();
+void HexagonInstPrinter::prints4_7ImmOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const {
+ int64_t Imm;
+ bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm);
+ Imm = SignExtend64<11>(Imm);
+ assert(Success); (void)Success;
+ assert(((Imm & 0x7f) == 0) && "Lower 7 bits must be ZERO.");
+ O << formatImm(Imm/128);
}
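
The prints[34]_[67]ImmOperand handlers above all follow the same pattern: sign-extend the evaluated value to the operand's bit width, check that the scaled-out low bits are zero, and print the value divided by the scale. A standalone sketch for the s3_6 case (9-bit field, 64-byte scale); the helper name is illustrative:

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

static int64_t scaleS3_6(uint64_t Raw) {
  int64_t Imm = llvm::SignExtend64<9>(Raw);     // 9-bit signed field
  assert((Imm & 0x3f) == 0 && "Lower 6 bits must be zero");
  return Imm / 64;                              // e.g. 192 -> 3, 0x1c0 -> -1
}
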
-void HexagonInstPrinter::printGlobalOperand(const MCInst *MI, unsigned OpNo,
+void HexagonInstPrinter::printGlobalOperand(MCInst const *MI, unsigned OpNo,
raw_ostream &O) const {
- assert(MI->getOperand(OpNo).isExpr() && "Expecting expression");
-
printOperand(MI, OpNo, O);
}
-void HexagonInstPrinter::printJumpTable(const MCInst *MI, unsigned OpNo,
+void HexagonInstPrinter::printJumpTable(MCInst const *MI, unsigned OpNo,
raw_ostream &O) const {
assert(MI->getOperand(OpNo).isExpr() && "Expecting expression");
printOperand(MI, OpNo, O);
}
-void HexagonInstPrinter::printConstantPool(const MCInst *MI, unsigned OpNo,
+void HexagonInstPrinter::printConstantPool(MCInst const *MI, unsigned OpNo,
raw_ostream &O) const {
assert(MI->getOperand(OpNo).isExpr() && "Expecting expression");
printOperand(MI, OpNo, O);
}
-void HexagonInstPrinter::printBranchOperand(const MCInst *MI, unsigned OpNo,
+void HexagonInstPrinter::printBranchOperand(MCInst const *MI, unsigned OpNo,
raw_ostream &O) const {
// Branches can take an immediate operand. This is used by the branch
// selection pass to print $+8, an eight byte displacement from the PC.
llvm_unreachable("Unknown branch operand.");
}
-void HexagonInstPrinter::printCallOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) const {
-}
+void HexagonInstPrinter::printCallOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const {}
-void HexagonInstPrinter::printAbsAddrOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) const {
-}
+void HexagonInstPrinter::printAbsAddrOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const {}
-void HexagonInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) const {
-}
+void HexagonInstPrinter::printPredicateOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const {}
-void HexagonInstPrinter::printSymbol(const MCInst *MI, unsigned OpNo,
+void HexagonInstPrinter::printSymbol(MCInst const *MI, unsigned OpNo,
raw_ostream &O, bool hi) const {
- assert(MI->getOperand(OpNo).isImm() && "Unknown symbol operand");
+ MCOperand const &MO = MI->getOperand(OpNo);
- O << '#' << (hi ? "HI" : "LO") << "(#";
- printOperand(MI, OpNo, O);
+ O << '#' << (hi ? "HI" : "LO") << '(';
+ if (MO.isImm()) {
+ O << '#';
+ printOperand(MI, OpNo, O);
+ } else {
+ assert(MO.isExpr() && "Unknown symbol operand");
+ printOperand(MI, OpNo, O);
+ }
O << ')';
}
-void HexagonInstPrinter::printExtBrtarget(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) const {
- const MCOperand &MO = MI->getOperand(OpNo);
- const MCInstrDesc &MII = getMII().get(MI->getOpcode());
-
- assert((isExtendable(MII.TSFlags) || isExtended(MII.TSFlags)) &&
- "Expecting an extendable operand");
-
- if (MO.isExpr() || isExtended(MII.TSFlags)) {
- O << "##";
+void HexagonInstPrinter::printBrtarget(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const {
+ MCOperand const &MO = MI->getOperand(OpNo);
+ assert (MO.isExpr());
+ MCExpr const &Expr = *MO.getExpr();
+ int64_t Value;
+ if (Expr.evaluateAsAbsolute(Value))
+ O << format("0x%" PRIx64, Value);
+ else {
+ if (HasExtender || HexagonMCInstrInfo::isConstExtended(MII, *MI))
+ if (HexagonMCInstrInfo::getExtendableOp(MII, *MI) == OpNo)
+ O << "##";
+ O << Expr;
}
- printOperand(MI, OpNo, O);
}
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
index 534ac237d635..5f421184b20a 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
@@ -7,7 +7,6 @@
//
//===----------------------------------------------------------------------===//
//
-// This class prints an Hexagon MCInst to a .s file.
//
//===----------------------------------------------------------------------===//
@@ -15,17 +14,8 @@
#define LLVM_LIB_TARGET_HEXAGON_INSTPRINTER_HEXAGONINSTPRINTER_H
#include "llvm/MC/MCInstPrinter.h"
-#include "llvm/MC/MCInstrInfo.h"
namespace llvm {
-class HexagonAsmInstPrinter : public MCInstPrinter {
-public:
- HexagonAsmInstPrinter(MCInstPrinter *RawPrinter);
- void printInst(MCInst const *MI, raw_ostream &O, StringRef Annot,
- MCSubtargetInfo const &STI) override;
- void printRegName(raw_ostream &O, unsigned RegNo) const override;
- std::unique_ptr<MCInstPrinter> RawPrinter;
-};
/// Prints bundles as a newline separated list of individual instructions
/// Duplexes are separated by a vertical tab \v character
/// A trailing line includes bundle properties such as endloop0/1
@@ -33,68 +23,69 @@ public:
/// r0 = add(r1, r2)
/// r0 = #0 \v jump 0x0
/// :endloop0 :endloop1
- class HexagonInstPrinter : public MCInstPrinter {
- public:
- explicit HexagonInstPrinter(MCAsmInfo const &MAI,
- MCInstrInfo const &MII,
- MCRegisterInfo const &MRI)
- : MCInstPrinter(MAI, MII, MRI), MII(MII) {}
-
- void printInst(MCInst const *MI, raw_ostream &O, StringRef Annot,
- const MCSubtargetInfo &STI) override;
- virtual StringRef getOpcodeName(unsigned Opcode) const;
- void printInstruction(const MCInst *MI, raw_ostream &O);
- void printRegName(raw_ostream &OS, unsigned RegNo) const override;
- static const char *getRegisterName(unsigned RegNo);
+class HexagonInstPrinter : public MCInstPrinter {
+public:
+ explicit HexagonInstPrinter(MCAsmInfo const &MAI, MCInstrInfo const &MII,
+ MCRegisterInfo const &MRI);
+ void printInst(MCInst const *MI, raw_ostream &O, StringRef Annot,
+ const MCSubtargetInfo &STI) override;
+ virtual StringRef getOpcodeName(unsigned Opcode) const;
+ void printInstruction(MCInst const *MI, raw_ostream &O);
- void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
- void printImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
- void printExtOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
- void printUnsignedImmOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) const;
- void printNegImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
- const;
- void printNOneImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
- const;
- void printMEMriOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
- const;
- void printFrameIndexOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) const;
- void printBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
- const;
- void printCallOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
- const;
- void printAbsAddrOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
- const;
- void printPredicateOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
- const;
- void printGlobalOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
- const;
- void printJumpTable(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
- void printExtBrtarget(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
+ StringRef getRegName(unsigned RegNo) const;
+ static char const *getRegisterName(unsigned RegNo);
+ void printRegName(raw_ostream &O, unsigned RegNo) const override;
- void printConstantPool(const MCInst *MI, unsigned OpNo,
+ void printOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const;
+ void printExtOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const;
+ void printUnsignedImmOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const;
+ void printNegImmOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const;
+ void printNOneImmOperand(MCInst const *MI, unsigned OpNo,
raw_ostream &O) const;
+ void prints3_6ImmOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const;
+ void prints3_7ImmOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const;
+ void prints4_6ImmOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const;
+ void prints4_7ImmOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const;
+ void printBranchOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const;
+ void printCallOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const;
+ void printAbsAddrOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const;
+ void printPredicateOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const;
+ void printGlobalOperand(MCInst const *MI, unsigned OpNo,
+ raw_ostream &O) const;
+ void printJumpTable(MCInst const *MI, unsigned OpNo, raw_ostream &O) const;
+ void printBrtarget(MCInst const *MI, unsigned OpNo, raw_ostream &O) const;
+
+ void printConstantPool(MCInst const *MI, unsigned OpNo, raw_ostream &O) const;
- void printSymbolHi(const MCInst *MI, unsigned OpNo, raw_ostream &O) const
- { printSymbol(MI, OpNo, O, true); }
- void printSymbolLo(const MCInst *MI, unsigned OpNo, raw_ostream &O) const
- { printSymbol(MI, OpNo, O, false); }
+ void printSymbolHi(MCInst const *MI, unsigned OpNo, raw_ostream &O) const {
+ printSymbol(MI, OpNo, O, true);
+ }
+ void printSymbolLo(MCInst const *MI, unsigned OpNo, raw_ostream &O) const {
+ printSymbol(MI, OpNo, O, false);
+ }
- const MCInstrInfo &getMII() const {
- return MII;
- }
+ MCAsmInfo const &getMAI() const { return MAI; }
+ MCInstrInfo const &getMII() const { return MII; }
- protected:
- void printSymbol(const MCInst *MI, unsigned OpNo, raw_ostream &O, bool hi)
- const;
+protected:
+ void printSymbol(MCInst const *MI, unsigned OpNo, raw_ostream &O,
+ bool hi) const;
- private:
- const MCInstrInfo &MII;
+private:
+ MCInstrInfo const &MII;
- bool HasExtender;
- void setExtender(MCInst const &MCI);
- };
+ bool HasExtender;
+ void setExtender(MCInst const &MCI);
+};
} // end namespace llvm
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
index dc0706994786..a8456b4ead9c 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
@@ -18,13 +18,14 @@
#include "llvm/MC/MCAsmInfoELF.h"
namespace llvm {
- class Triple;
+class Triple;
- class HexagonMCAsmInfo : public MCAsmInfoELF {
- void anchor() override;
- public:
- explicit HexagonMCAsmInfo(const Triple &TT);
- };
+class HexagonMCAsmInfo : public MCAsmInfoELF {
+ void anchor() override;
+
+public:
+ explicit HexagonMCAsmInfo(const Triple &TT);
+};
} // namespace llvm
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
new file mode 100644
index 000000000000..46b7b41fec3b
--- /dev/null
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
@@ -0,0 +1,581 @@
+//===----- HexagonMCChecker.cpp - Instruction bundle checking -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the checking of insns inside a bundle according to the
+// packet constraint rules of the Hexagon ISA.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMCChecker.h"
+
+#include "HexagonBaseInfo.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static cl::opt<bool> RelaxNVChecks("relax-nv-checks", cl::init(false),
+ cl::ZeroOrMore, cl::Hidden, cl::desc("Relax checks of new-value validity"));
+
+const HexagonMCChecker::PredSense
+ HexagonMCChecker::Unconditional(Hexagon::NoRegister, false);
+
+void HexagonMCChecker::init() {
+ // Initialize read-only registers set.
+ ReadOnly.insert(Hexagon::PC);
+
+ // Figure out the loop-registers definitions.
+ if (HexagonMCInstrInfo::isInnerLoop(MCB)) {
+ Defs[Hexagon::SA0].insert(Unconditional); // FIXME: define or change SA0?
+ Defs[Hexagon::LC0].insert(Unconditional);
+ }
+ if (HexagonMCInstrInfo::isOuterLoop(MCB)) {
+ Defs[Hexagon::SA1].insert(Unconditional); // FIXME: define or change SA1?
+ Defs[Hexagon::LC1].insert(Unconditional);
+ }
+
+ if (HexagonMCInstrInfo::isBundle(MCB))
+ // Unfurl a bundle.
+ for (auto const&I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+ init(*I.getInst());
+ }
+ else
+ init(MCB);
+}
+
+void HexagonMCChecker::init(MCInst const& MCI) {
+ const MCInstrDesc& MCID = HexagonMCInstrInfo::getDesc(MCII, MCI);
+ unsigned PredReg = Hexagon::NoRegister;
+ bool isTrue = false;
+
+ // Get used registers.
+ for (unsigned i = MCID.getNumDefs(); i < MCID.getNumOperands(); ++i)
+ if (MCI.getOperand(i).isReg()) {
+ unsigned R = MCI.getOperand(i).getReg();
+
+ if (HexagonMCInstrInfo::isPredicated(MCII, MCI) && isPredicateRegister(R)) {
+ // Note a used predicate register.
+ PredReg = R;
+ isTrue = HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI);
+
+ // Note use of new predicate register.
+ if (HexagonMCInstrInfo::isPredicatedNew(MCII, MCI))
+ NewPreds.insert(PredReg);
+ }
+ else
+ // Note the register use. Super-registers are not tracked directly;
+ // only their components are.
+ for(MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
+ SRI.isValid();
+ ++SRI)
+ if (!MCSubRegIterator(*SRI, &RI).isValid())
+ // Skip super-registers used indirectly.
+ Uses.insert(*SRI);
+ }
+
+ // Get implicit register definitions.
+ if (const MCPhysReg *ImpDef = MCID.getImplicitDefs())
+ for (; *ImpDef; ++ImpDef) {
+ unsigned R = *ImpDef;
+
+ if (Hexagon::R31 != R && MCID.isCall())
+ // Any register other than the LR and the PC is actually a volatile one
+ // as defined by the ABI, not modified implicitly by the call insn.
+ continue;
+ if (Hexagon::PC == R)
+ // Branches are the only insns that can change the PC,
+ // otherwise a read-only register.
+ continue;
+
+ if (Hexagon::USR_OVF == R)
+ // Many insns change the USR implicitly, but only one or another flag.
+ // The instruction table models the USR.OVF flag, which can be implicitly
+ // modified more than once, but cannot be modified in the same packet
+ // with an instruction that modifies it explicitly. Deal with such
+ // situations individually.
+ SoftDefs.insert(R);
+ else if (isPredicateRegister(R) &&
+ HexagonMCInstrInfo::isPredicateLate(MCII, MCI))
+ // Include implicit late predicates.
+ LatePreds.insert(R);
+ else
+ Defs[R].insert(PredSense(PredReg, isTrue));
+ }
+
+ // Figure out explicit register definitions.
+ for (unsigned i = 0; i < MCID.getNumDefs(); ++i) {
+ unsigned R = MCI.getOperand(i).getReg(),
+ S = Hexagon::NoRegister;
+
+ // Note register definitions, direct ones as well as indirect side-effects.
+ // Super-registers are not tracked directly, but their components.
+ for(MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
+ SRI.isValid();
+ ++SRI) {
+ if (MCSubRegIterator(*SRI, &RI).isValid())
+ // Skip super-registers defined indirectly.
+ continue;
+
+ if (R == *SRI) {
+ if (S == R)
+ // Avoid scoring the defined register multiple times.
+ continue;
+ else
+ // Note that the defined register has already been scored.
+ S = R;
+ }
+
+ if (Hexagon::P3_0 != R && Hexagon::P3_0 == *SRI)
+ // P3:0 is a special case, since multiple predicate register definitions
+ // in a packet are allowed as the equivalent of their logical "and".
+ // Only an explicit definition of P3:0 is noted as such; if it is a
+ // side effect, note it as a soft definition.
+ SoftDefs.insert(*SRI);
+ else if (HexagonMCInstrInfo::isPredicateLate(MCII, MCI) && isPredicateRegister(*SRI))
+ // Some insns produce predicates too late to be used in the same packet.
+ LatePreds.insert(*SRI);
+ else if (i == 0 && llvm::HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCVI_VM_CUR_LD)
+ // Current loads should be used in the same packet.
+ // TODO: relies on the impossibility of current and temporary loads
+ // in the same packet.
+ CurDefs.insert(*SRI), Defs[*SRI].insert(PredSense(PredReg, isTrue));
+ else if (i == 0 && llvm::HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCVI_VM_TMP_LD)
+ // Temporary loads should be used in the same packet, but don't commit
+ // results, so it should be disregarded if another insn changes the same
+ // register.
+ // TODO: relies on the impossibility of current and temporary loads
+ // in the same packet.
+ TmpDefs.insert(*SRI);
+ else if (i <= 1 && llvm::HexagonMCInstrInfo::hasNewValue2(MCII, MCI) )
+ // vshuff(Vx, Vy, Rx) <- Vx(0) and Vy(1) are both source and
+ // destination registers with this instruction. Same for vdeal(Vx,Vy,Rx).
+ Uses.insert(*SRI);
+ else
+ Defs[*SRI].insert(PredSense(PredReg, isTrue));
+ }
+ }
+
+ // Figure out register definitions that produce new values.
+ if (HexagonMCInstrInfo::hasNewValue(MCII, MCI)) {
+ unsigned R = HexagonMCInstrInfo::getNewValueOperand(MCII, MCI).getReg();
+
+ if (HexagonMCInstrInfo::isCompound(MCII, MCI))
+ compoundRegisterMap(R); // Compound insns have a limited register range.
+
+ for(MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
+ SRI.isValid();
+ ++SRI)
+ if (!MCSubRegIterator(*SRI, &RI).isValid())
+ // No super-registers defined indirectly.
+ NewDefs[*SRI].push_back(NewSense::Def(PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI),
+ HexagonMCInstrInfo::isFloat(MCII, MCI)));
+
+ // For fairly unique 2-dot-new producers, example:
+ // vdeal(V1, V9, R0) V1.new and V9.new can be used by consumers.
+ if (HexagonMCInstrInfo::hasNewValue2(MCII, MCI)) {
+ unsigned R2 = HexagonMCInstrInfo::getNewValueOperand2(MCII, MCI).getReg();
+
+ for(MCRegAliasIterator SRI(R2, &RI, !MCSubRegIterator(R2, &RI).isValid());
+ SRI.isValid();
+ ++SRI)
+ if (!MCSubRegIterator(*SRI, &RI).isValid())
+ NewDefs[*SRI].push_back(NewSense::Def(PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI),
+ HexagonMCInstrInfo::isFloat(MCII, MCI)));
+ }
+ }
+
+ // Figure out definitions of new predicate registers.
+ if (HexagonMCInstrInfo::isPredicatedNew(MCII, MCI))
+ for (unsigned i = MCID.getNumDefs(); i < MCID.getNumOperands(); ++i)
+ if (MCI.getOperand(i).isReg()) {
+ unsigned P = MCI.getOperand(i).getReg();
+
+ if (isPredicateRegister(P))
+ NewPreds.insert(P);
+ }
+
+ // Figure out uses of new values.
+ if (HexagonMCInstrInfo::isNewValue(MCII, MCI)) {
+ unsigned N = HexagonMCInstrInfo::getNewValueOperand(MCII, MCI).getReg();
+
+ if (!MCSubRegIterator(N, &RI).isValid()) {
+ // Super-registers cannot use new values.
+ if (MCID.isBranch())
+ NewUses[N] = NewSense::Jmp(llvm::HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeNV);
+ else
+ NewUses[N] = NewSense::Use(PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI));
+ }
+ }
+}
+
+HexagonMCChecker::HexagonMCChecker(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, MCInst &mcb, MCInst &mcbdx,
+ MCRegisterInfo const &ri)
+ : MCB(mcb), MCBDX(mcbdx), RI(ri), MCII(MCII), STI(STI),
+ bLoadErrInfo(false) {
+ init();
+}
+
+bool HexagonMCChecker::check() {
+ bool chkB = checkBranches();
+ bool chkP = checkPredicates();
+ bool chkNV = checkNewValues();
+ bool chkR = checkRegisters();
+ bool chkS = checkSolo();
+ bool chkSh = checkShuffle();
+ bool chkSl = checkSlots();
+ bool chk = chkB && chkP && chkNV && chkR && chkS && chkSh && chkSl;
+
+ return chk;
+}
+
+bool HexagonMCChecker::checkSlots() {
+ unsigned slotsUsed = 0;
+ for (auto HMI: HexagonMCInstrInfo::bundleInstructions(MCBDX)) {
+ MCInst const& MCI = *HMI.getInst();
+ if (HexagonMCInstrInfo::isImmext(MCI))
+ continue;
+ if (HexagonMCInstrInfo::isDuplex(MCII, MCI))
+ slotsUsed += 2;
+ else
+ ++slotsUsed;
+ }
+
+ if (slotsUsed > HEXAGON_PACKET_SIZE) {
+ HexagonMCErrInfo errInfo;
+ errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_NOSLOTS);
+ addErrInfo(errInfo);
+ return false;
+ }
+ return true;
+}
+
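
The slot accounting above boils down to: immediate extenders take no slot, a duplex takes two, everything else takes one, and the total may not exceed HEXAGON_PACKET_SIZE (4 on Hexagon). A minimal sketch with illustrative names:

static bool fitsInPacket(unsigned SingleInsns, unsigned Duplexes) {
  const unsigned PacketSize = 4;        // HEXAGON_PACKET_SIZE
  return SingleInsns + 2 * Duplexes <= PacketSize;
}
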
+// Check legal use of branches.
+bool HexagonMCChecker::checkBranches() {
+ HexagonMCErrInfo errInfo;
+ if (HexagonMCInstrInfo::isBundle(MCB)) {
+ bool hasConditional = false;
+ unsigned Branches = 0, Returns = 0, NewIndirectBranches = 0,
+ NewValueBranches = 0, Conditional = HEXAGON_PRESHUFFLE_PACKET_SIZE,
+ Unconditional = HEXAGON_PRESHUFFLE_PACKET_SIZE;
+
+ for (unsigned i = HexagonMCInstrInfo::bundleInstructionsOffset;
+ i < MCB.size(); ++i) {
+ MCInst const &MCI = *MCB.begin()[i].getInst();
+
+ if (HexagonMCInstrInfo::isImmext(MCI))
+ continue;
+ if (HexagonMCInstrInfo::getDesc(MCII, MCI).isBranch() ||
+ HexagonMCInstrInfo::getDesc(MCII, MCI).isCall()) {
+ ++Branches;
+ if (HexagonMCInstrInfo::getDesc(MCII, MCI).isIndirectBranch() &&
+ HexagonMCInstrInfo::isPredicatedNew(MCII, MCI))
+ ++NewIndirectBranches;
+ if (HexagonMCInstrInfo::isNewValue(MCII, MCI))
+ ++NewValueBranches;
+
+ if (HexagonMCInstrInfo::isPredicated(MCII, MCI) ||
+ HexagonMCInstrInfo::isPredicatedNew(MCII, MCI)) {
+ hasConditional = true;
+ Conditional = i; // Record the position of the conditional branch.
+ } else {
+ Unconditional = i; // Record the position of the unconditional branch.
+ }
+ }
+ if (HexagonMCInstrInfo::getDesc(MCII, MCI).isReturn() &&
+ HexagonMCInstrInfo::getDesc(MCII, MCI).mayLoad())
+ ++Returns;
+ }
+
+ if (Branches) // FIXME: should "Defs.count(Hexagon::PC)" be here too?
+ if (HexagonMCInstrInfo::isInnerLoop(MCB) ||
+ HexagonMCInstrInfo::isOuterLoop(MCB)) {
+ // Error out if there's any branch in a loop-end packet.
+ errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_ENDLOOP, Hexagon::PC);
+ addErrInfo(errInfo);
+ return false;
+ }
+ if (Branches > 1)
+ if (!hasConditional || Conditional > Unconditional) {
+ // Error out if more than one unconditional branch or
+ // the conditional branch appears after the unconditional one.
+ errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_BRANCHES);
+ addErrInfo(errInfo);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+// Check legal use of predicate registers.
+bool HexagonMCChecker::checkPredicates() {
+ HexagonMCErrInfo errInfo;
+ // Check for proper use of new predicate registers.
+ for (const auto& I : NewPreds) {
+ unsigned P = I;
+
+ if (!Defs.count(P) || LatePreds.count(P)) {
+ // Error out if the new predicate register is not defined,
+ // or defined "late"
+ // (e.g., "{ if (p3.new)... ; p3 = sp1loop0(#r7:2, Rs) }").
+ errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_NEWP, P);
+ addErrInfo(errInfo);
+ return false;
+ }
+ }
+
+ // Check for proper use of auto-ANDed predicate registers.
+ for (const auto& I : LatePreds) {
+ unsigned P = I;
+
+ if (LatePreds.count(P) > 1 || Defs.count(P)) {
+ // Error out if predicate register defined "late" multiple times or
+ // defined late and regularly defined
+ // (e.g., "{ p3 = sp1loop0(...); p3 = cmp.eq(...) }".
+ errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, P);
+ addErrInfo(errInfo);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+// Check legal use of new values.
+bool HexagonMCChecker::checkNewValues() {
+ HexagonMCErrInfo errInfo;
+ memset(&errInfo, 0, sizeof(errInfo));
+ for (auto& I : NewUses) {
+ unsigned R = I.first;
+ NewSense &US = I.second;
+
+ if (!hasValidNewValueDef(US, NewDefs[R])) {
+ errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_NEWV, R);
+ addErrInfo(errInfo);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+// Check for legal register uses and definitions.
+bool HexagonMCChecker::checkRegisters() {
+ HexagonMCErrInfo errInfo;
+ // Check for proper register definitions.
+ for (const auto& I : Defs) {
+ unsigned R = I.first;
+
+ if (ReadOnly.count(R)) {
+ // Error out for definitions of read-only registers.
+ errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_READONLY, R);
+ addErrInfo(errInfo);
+ return false;
+ }
+ if (isLoopRegister(R) && Defs.count(R) > 1 &&
+ (HexagonMCInstrInfo::isInnerLoop(MCB) ||
+ HexagonMCInstrInfo::isOuterLoop(MCB))) {
+ // Error out for definitions of loop registers at the end of a loop.
+ errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_LOOP, R);
+ addErrInfo(errInfo);
+ return false;
+ }
+ if (SoftDefs.count(R)) {
+ // Error out for explicit changes to registers also weakly defined
+ // (e.g., "{ usr = r0; r0 = sfadd(...) }").
+ unsigned UsrR = Hexagon::USR; // Silence warning about mixed types in ?:.
+ unsigned BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R;
+ errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, BadR);
+ addErrInfo(errInfo);
+ return false;
+ }
+ if (!isPredicateRegister(R) && Defs[R].size() > 1) {
+ // Check for multiple register definitions.
+ PredSet &PM = Defs[R];
+
+ // Check for multiple unconditional register definitions.
+ if (PM.count(Unconditional)) {
+ // Error out on an unconditional change when there are any other
+ // changes, conditional or not.
+ unsigned UsrR = Hexagon::USR;
+ unsigned BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R;
+ errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, BadR);
+ addErrInfo(errInfo);
+ return false;
+ }
+ // Check for multiple conditional register definitions.
+ for (const auto& J : PM) {
+ PredSense P = J;
+
+ // Check for multiple uses of the same condition.
+ if (PM.count(P) > 1) {
+ // Error out on conditional changes based on the same predicate
+ // (e.g., "{ if (!p0) r0 =...; if (!p0) r0 =... }").
+ errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, R);
+ addErrInfo(errInfo);
+ return false;
+ }
+ // Check for the use of the complementary condition.
+ P.second = !P.second;
+ if (PM.count(P) && PM.size() > 2) {
+ // Error out on conditional changes based on the same predicate
+ // multiple times
+ // (e.g., "{ if (p0) r0 =...; if (!p0) r0 =... }; if (!p0) r0 =... }").
+ errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, R);
+ addErrInfo(errInfo);
+ return false;
+ }
+ }
+ }
+ }
+
+ // Check for use of current definitions.
+ for (const auto& I : CurDefs) {
+ unsigned R = I;
+
+ if (!Uses.count(R)) {
+ // Warn on an unused current definition.
+ errInfo.setWarning(HexagonMCErrInfo::CHECK_WARN_CURRENT, R);
+ addErrInfo(errInfo);
+ return true;
+ }
+ }
+
+ // Check for use of temporary definitions.
+ for (const auto& I : TmpDefs) {
+ unsigned R = I;
+
+ if (!Uses.count(R)) {
+ // Special case for vhist.
+ bool vHistFound = false;
+ for (auto const &HMI : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+ if (llvm::HexagonMCInstrInfo::getType(MCII, *HMI.getInst()) ==
+ HexagonII::TypeCVI_HIST) {
+ vHistFound = true; // vhist() implicitly uses ALL REGxx.tmp
+ break;
+ }
+ }
+ // Warn on an unused temporary definition.
+ if (!vHistFound) {
+ errInfo.setWarning(HexagonMCErrInfo::CHECK_WARN_TEMPORARY, R);
+ addErrInfo(errInfo);
+ return true;
+ }
+ }
+ }
+
+ return true;
+}
+
+// Check for legal use of solo insns.
+bool HexagonMCChecker::checkSolo() {
+ HexagonMCErrInfo errInfo;
+ if (HexagonMCInstrInfo::isBundle(MCB) &&
+ HexagonMCInstrInfo::bundleSize(MCB) > 1) {
+ for (auto const&I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+ if (llvm::HexagonMCInstrInfo::isSolo(MCII, *I.getInst())) {
+ errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_SOLO);
+ addErrInfo(errInfo);
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+bool HexagonMCChecker::checkShuffle() {
+ HexagonMCErrInfo errInfo;
+ // Branch info is lost when duplexing. The unduplexed insns must be
+ // checked and only branch errors matter for this case.
+ HexagonMCShuffler MCS(MCII, STI, MCB);
+ if (!MCS.check()) {
+ if (MCS.getError() == HexagonShuffler::SHUFFLE_ERROR_BRANCHES) {
+ errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_SHUFFLE);
+ errInfo.setShuffleError(MCS.getError());
+ addErrInfo(errInfo);
+ return false;
+ }
+ }
+ HexagonMCShuffler MCSDX(MCII, STI, MCBDX);
+ if (!MCSDX.check()) {
+ errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_SHUFFLE);
+ errInfo.setShuffleError(MCSDX.getError());
+ addErrInfo(errInfo);
+ return false;
+ }
+ return true;
+}
+
+void HexagonMCChecker::compoundRegisterMap(unsigned& Register) {
+ switch (Register) {
+ default:
+ break;
+ case Hexagon::R15:
+ Register = Hexagon::R23;
+ break;
+ case Hexagon::R14:
+ Register = Hexagon::R22;
+ break;
+ case Hexagon::R13:
+ Register = Hexagon::R21;
+ break;
+ case Hexagon::R12:
+ Register = Hexagon::R20;
+ break;
+ case Hexagon::R11:
+ Register = Hexagon::R19;
+ break;
+ case Hexagon::R10:
+ Register = Hexagon::R18;
+ break;
+ case Hexagon::R9:
+ Register = Hexagon::R17;
+ break;
+ case Hexagon::R8:
+ Register = Hexagon::R16;
+ break;
+ }
+}
+
+bool HexagonMCChecker::hasValidNewValueDef(const NewSense &Use,
+ const NewSenseList &Defs) const {
+ bool Strict = !RelaxNVChecks;
+
+ for (unsigned i = 0, n = Defs.size(); i < n; ++i) {
+ const NewSense &Def = Defs[i];
+ // NVJ cannot use a new FP value [7.6.1]
+ if (Use.IsNVJ && (Def.IsFloat || Def.PredReg != 0))
+ continue;
+ // If the definition was not predicated, then it does not matter if
+ // the use is.
+ if (Def.PredReg == 0)
+ return true;
+ // With the strict checks, both the definition and the use must be
+ // predicated on the same register and condition.
+ if (Strict) {
+ if (Def.PredReg == Use.PredReg && Def.Cond == Use.Cond)
+ return true;
+ } else {
+ // With the relaxed checks, if the definition was predicated, the only
+ // detectable violation is if the use is predicated on the opposing
+ // condition, otherwise, it's ok.
+ if (Def.PredReg != Use.PredReg || Def.Cond == Use.Cond)
+ return true;
+ }
+ }
+ return false;
+}
+
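For readers skimming the new-value rules above, here is a minimal standalone sketch of the predicate-matching logic in hasValidNewValueDef, using simplified stand-in types rather than the checker's NewSense and omitting the NVJ/floating-point special case (names and types below are illustrative, not part of the patch):

  #include <cassert>

  // Simplified stand-in for NewSense: a predicate register (0 = unpredicated)
  // and the predicate sense (true for "if (p)", false for "if (!p)").
  struct Sense { unsigned PredReg; bool Cond; };

  // Strict rule: an unpredicated def always feeds a new-value use; a
  // predicated def must match the use's predicate register and sense.
  static bool strictMatch(const Sense &Def, const Sense &Use) {
    if (Def.PredReg == 0)
      return true;
    return Def.PredReg == Use.PredReg && Def.Cond == Use.Cond;
  }

  // Relaxed rule: only a provably opposite sense on the same predicate
  // register is rejected.
  static bool relaxedMatch(const Sense &Def, const Sense &Use) {
    if (Def.PredReg == 0)
      return true;
    return Def.PredReg != Use.PredReg || Def.Cond == Use.Cond;
  }

  int main() {
    Sense DefIfP0{1, true}, UseIfNotP0{1, false}, UseIfP1{2, true};
    assert(!strictMatch(DefIfP0, UseIfNotP0));  // opposite sense: rejected
    assert(!relaxedMatch(DefIfP0, UseIfNotP0)); // rejected under both modes
    assert(!strictMatch(DefIfP0, UseIfP1));     // different predicate: strict rejects
    assert(relaxedMatch(DefIfP0, UseIfP1));     // relaxed cannot prove a clash
    return 0;
  }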
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
new file mode 100644
index 000000000000..5fc0bdeaccbb
--- /dev/null
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
@@ -0,0 +1,218 @@
+//===----- HexagonMCChecker.h - Instruction bundle checking ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the checking of insns inside a bundle according to the
+// packet constraint rules of the Hexagon ISA.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONMCCHECKER_H
+#define HEXAGONMCCHECKER_H
+
+#include <map>
+#include <set>
+#include <queue>
+#include "MCTargetDesc/HexagonMCShuffler.h"
+
+using namespace llvm;
+
+namespace llvm {
+class MCOperandInfo;
+
+struct ErrInfo_T {
+ unsigned Error, Warning, ShuffleError;
+ unsigned Register;
+};
+
+class HexagonMCErrInfo {
+public:
+ enum {
+ CHECK_SUCCESS = 0,
+ // Errors.
+ CHECK_ERROR_BRANCHES = 0x00001,
+ CHECK_ERROR_NEWP = 0x00002,
+ CHECK_ERROR_NEWV = 0x00004,
+ CHECK_ERROR_REGISTERS = 0x00008,
+ CHECK_ERROR_READONLY = 0x00010,
+ CHECK_ERROR_LOOP = 0x00020,
+ CHECK_ERROR_ENDLOOP = 0x00040,
+ CHECK_ERROR_SOLO = 0x00080,
+ CHECK_ERROR_SHUFFLE = 0x00100,
+ CHECK_ERROR_NOSLOTS = 0x00200,
+ CHECK_ERROR_UNKNOWN = 0x00400,
+ // Warnings.
+ CHECK_WARN_CURRENT = 0x10000,
+ CHECK_WARN_TEMPORARY = 0x20000
+ };
+ ErrInfo_T s;
+
+ void reset() {
+ s.Error = CHECK_SUCCESS;
+ s.Warning = CHECK_SUCCESS;
+ s.ShuffleError = HexagonShuffler::SHUFFLE_SUCCESS;
+ s.Register = Hexagon::NoRegister;
+ };
+ HexagonMCErrInfo() {
+ reset();
+ };
+
+ void setError(unsigned e, unsigned r = Hexagon::NoRegister)
+ { s.Error = e; s.Register = r; };
+ void setWarning(unsigned w, unsigned r = Hexagon::NoRegister)
+ { s.Warning = w; s.Register = r; };
+ void setShuffleError(unsigned e) { s.ShuffleError = e; };
+};
+
+/// Check for a valid bundle.
+class HexagonMCChecker {
+ /// Insn bundle.
+ MCInst& MCB;
+ MCInst& MCBDX;
+ const MCRegisterInfo& RI;
+ MCInstrInfo const &MCII;
+ MCSubtargetInfo const &STI;
+ bool bLoadErrInfo;
+
+ /// Set of definitions: register #, if predicated, if predicated true.
+ typedef std::pair<unsigned, bool> PredSense;
+ static const PredSense Unconditional;
+ typedef std::multiset<PredSense> PredSet;
+ typedef std::multiset<PredSense>::iterator PredSetIterator;
+
+ typedef llvm::DenseMap<unsigned, PredSet>::iterator DefsIterator;
+ llvm::DenseMap<unsigned, PredSet> Defs;
+
+ /// Information about how a new-value register is defined or used:
+ /// PredReg = predicate register, 0 if use/def not predicated,
+ /// Cond = true/false for if(PredReg)/if(!PredReg) respectively,
+ /// IsFloat = true if definition produces a floating point value
+ /// (not valid for uses),
+ /// IsNVJ = true if the use is a new-value branch (not valid for
+ /// definitions).
+ struct NewSense {
+ unsigned PredReg;
+ bool IsFloat, IsNVJ, Cond;
+ // The special-case "constructors":
+ static NewSense Jmp(bool isNVJ) {
+ NewSense NS = { /*PredReg=*/ 0, /*IsFloat=*/ false, /*IsNVJ=*/ isNVJ,
+ /*Cond=*/ false };
+ return NS;
+ }
+ static NewSense Use(unsigned PR, bool True) {
+ NewSense NS = { /*PredReg=*/ PR, /*IsFloat=*/ false, /*IsNVJ=*/ false,
+ /*Cond=*/ True };
+ return NS;
+ }
+ static NewSense Def(unsigned PR, bool True, bool Float) {
+ NewSense NS = { /*PredReg=*/ PR, /*IsFloat=*/ Float, /*IsNVJ=*/ false,
+ /*Cond=*/ True };
+ return NS;
+ }
+ };
+ /// Set of definitions that produce a new value:
+ typedef llvm::SmallVector<NewSense,2> NewSenseList;
+ typedef llvm::DenseMap<unsigned, NewSenseList>::iterator NewDefsIterator;
+ llvm::DenseMap<unsigned, NewSenseList> NewDefs;
+
+ /// Set of weak definitions whose clashes should be enforced selectively.
+ typedef std::set<unsigned>::iterator SoftDefsIterator;
+ std::set<unsigned> SoftDefs;
+
+ /// Set of current definitions committed to the register file.
+ typedef std::set<unsigned>::iterator CurDefsIterator;
+ std::set<unsigned> CurDefs;
+
+ /// Set of temporary definitions not committed to the register file.
+ typedef std::set<unsigned>::iterator TmpDefsIterator;
+ std::set<unsigned> TmpDefs;
+
+ /// Set of new predicates used.
+ typedef std::set<unsigned>::iterator NewPredsIterator;
+ std::set<unsigned> NewPreds;
+
+ /// Set of predicates defined late.
+ typedef std::multiset<unsigned>::iterator LatePredsIterator;
+ std::multiset<unsigned> LatePreds;
+
+ /// Set of uses.
+ typedef std::set<unsigned>::iterator UsesIterator;
+ std::set<unsigned> Uses;
+
+ /// Set of new values used: new register, if new-value jump.
+ typedef llvm::DenseMap<unsigned, NewSense>::iterator NewUsesIterator;
+ llvm::DenseMap<unsigned, NewSense> NewUses;
+
+ /// Pre-defined set of read-only registers.
+ typedef std::set<unsigned>::iterator ReadOnlyIterator;
+ std::set<unsigned> ReadOnly;
+
+ std::queue<ErrInfo_T> ErrInfoQ;
+ HexagonMCErrInfo CrntErrInfo;
+
+ void getErrInfo() {
+ if (bLoadErrInfo) {
+ if (ErrInfoQ.empty()) {
+ CrntErrInfo.reset();
+ } else {
+ CrntErrInfo.s = ErrInfoQ.front();
+ ErrInfoQ.pop();
+ }
+ }
+ bLoadErrInfo = false;
+ }
+
+ void init();
+ void init(MCInst const&);
+
+ // Checks performed.
+ bool checkBranches();
+ bool checkPredicates();
+ bool checkNewValues();
+ bool checkRegisters();
+ bool checkSolo();
+ bool checkShuffle();
+ bool checkSlots();
+
+ static void compoundRegisterMap(unsigned&);
+
+ bool isPredicateRegister(unsigned R) const {
+ return (Hexagon::P0 == R || Hexagon::P1 == R ||
+ Hexagon::P2 == R || Hexagon::P3 == R);
+ };
+ bool isLoopRegister(unsigned R) const {
+ return (Hexagon::SA0 == R || Hexagon::LC0 == R ||
+ Hexagon::SA1 == R || Hexagon::LC1 == R);
+ };
+
+ bool hasValidNewValueDef(const NewSense &Use,
+ const NewSenseList &Defs) const;
+
+ public:
+ explicit HexagonMCChecker(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, MCInst& mcb, MCInst &mcbdx,
+ const MCRegisterInfo& ri);
+
+ bool check();
+
+ /// Add a new error/warning.
+ void addErrInfo(HexagonMCErrInfo &err) { ErrInfoQ.push(err.s); };
+
+ /// Return the error code for the last operation in the insn bundle.
+ unsigned getError() { getErrInfo(); return CrntErrInfo.s.Error; };
+ unsigned getWarning() { getErrInfo(); return CrntErrInfo.s.Warning; };
+ unsigned getShuffleError() { getErrInfo(); return CrntErrInfo.s.ShuffleError; };
+ unsigned getErrRegister() { getErrInfo(); return CrntErrInfo.s.Register; };
+ bool getNextErrInfo() {
+ bLoadErrInfo = true;
+ return (ErrInfoQ.empty()) ? false : (getErrInfo(), true);
+ }
+};
+
+}
+
+#endif // HEXAGONMCCHECKER_H
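A sketch of how the error queue declared above might be drained after check() has run (a sketch only, not part of the patch): it uses just the accessors from this header, assumes the checker was constructed and run elsewhere, and the include path assumes the file sits next to the Hexagon MC sources.

  #include "HexagonMCChecker.h"

  // Walk every queued entry; getNextErrInfo() loads the next entry (if any)
  // so the subsequent getError()/getWarning()/getErrRegister() calls refer
  // to it.  Returns true if a hard error was queued.
  static bool reportCheckerErrors(llvm::HexagonMCChecker &Checker) {
    bool SawError = false;
    while (Checker.getNextErrInfo()) {
      if (Checker.getError() != llvm::HexagonMCErrInfo::CHECK_SUCCESS) {
        SawError = true;
        (void)Checker.getErrRegister();  // offending register, if any
        (void)Checker.getShuffleError(); // shuffler sub-code for CHECK_ERROR_SHUFFLE
      }
      // Warnings such as CHECK_WARN_CURRENT do not invalidate the packet.
    }
    return SawError;
  }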
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
index 9fc4e2aeaba6..c2c6275e7e8d 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
@@ -96,6 +96,12 @@ void HexagonMCCodeEmitter::EncodeSingleInstruction(
assert(!HexagonMCInstrInfo::isBundle(HMB));
uint64_t Binary;
+ // Compound instructions are limited to using registers 0-7 and 16-23,
+ // so map registers 16-23 down to 8-15 here so they can be encoded correctly.
+ static unsigned RegMap[8] = {Hexagon::R8, Hexagon::R9, Hexagon::R10,
+ Hexagon::R11, Hexagon::R12, Hexagon::R13,
+ Hexagon::R14, Hexagon::R15};
+
// Pseudo instructions don't get encoded and shouldn't be here
// in the first place!
assert(!HexagonMCInstrInfo::getDesc(MCII, HMB).isPseudo() &&
@@ -104,6 +110,16 @@ void HexagonMCCodeEmitter::EncodeSingleInstruction(
" `" << HexagonMCInstrInfo::getName(MCII, HMB) << "'"
"\n");
+ if (llvm::HexagonMCInstrInfo::getType(MCII, HMB) == HexagonII::TypeCOMPOUND) {
+ for (unsigned i = 0; i < HMB.getNumOperands(); ++i)
+ if (HMB.getOperand(i).isReg()) {
+ unsigned Reg =
+ MCT.getRegisterInfo()->getEncodingValue(HMB.getOperand(i).getReg());
+ if ((Reg <= 23) && (Reg >= 16))
+ HMB.getOperand(i).setReg(RegMap[Reg - 16]);
+ }
+ }
+
if (HexagonMCInstrInfo::isNewValue(MCII, HMB)) {
// Calculate the new value distance to the associated producer
MCOperand &MCO =
@@ -318,7 +334,7 @@ static Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI,
// The only relocs left should be GP relative:
default:
if (MCID.mayStore() || MCID.mayLoad()) {
- for (const uint16_t *ImpUses = MCID.getImplicitUses(); *ImpUses;
+ for (const MCPhysReg *ImpUses = MCID.getImplicitUses(); *ImpUses;
++ImpUses) {
if (*ImpUses == Hexagon::GP) {
switch (HexagonMCInstrInfo::getAccessSize(MCII, MI)) {
@@ -389,10 +405,8 @@ unsigned HexagonMCCodeEmitter::getExprOpValue(const MCInst &MI,
return cast<MCConstantExpr>(ME)->getValue();
}
if (MK == MCExpr::Binary) {
- unsigned Res;
- Res = getExprOpValue(MI, MO, cast<MCBinaryExpr>(ME)->getLHS(), Fixups, STI);
- Res +=
- getExprOpValue(MI, MO, cast<MCBinaryExpr>(ME)->getRHS(), Fixups, STI);
+ getExprOpValue(MI, MO, cast<MCBinaryExpr>(ME)->getLHS(), Fixups, STI);
+ getExprOpValue(MI, MO, cast<MCBinaryExpr>(ME)->getRHS(), Fixups, STI);
return 0;
}
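The compound-register remapping added to EncodeSingleInstruction above can be illustrated with a standalone sketch of the arithmetic (not part of the patch); it assumes the usual encoding where Rn has encoding value n, which is how getEncodingValue is used in that loop. Note that compoundRegisterMap in HexagonMCChecker.cpp applies the inverse mapping (R8-R15 back to R16-R23) on the checker side.

  #include <cassert>

  // Compound insns may only name r0-r7 and r16-r23; the patch rewrites the
  // high group to R8-R15 so the encoded field becomes a contiguous 0-15 range.
  static unsigned remapCompoundRegEncoding(unsigned Enc) {
    assert(Enc <= 7 || (Enc >= 16 && Enc <= 23));
    return Enc >= 16 ? Enc - 8 : Enc;
  }

  int main() {
    assert(remapCompoundRegEncoding(3) == 3);   // r3 is left alone
    assert(remapCompoundRegEncoding(16) == 8);  // r16 -> code 8 (RegMap[0])
    assert(remapCompoundRegEncoding(23) == 15); // r23 -> code 15 (RegMap[7])
    return 0;
  }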
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
index 886f8db3bc63..d194bea3d8dc 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
@@ -115,8 +115,8 @@ unsigned getCompoundCandidateGroup(MCInst const &MI, bool IsExtended) {
SrcReg = MI.getOperand(1).getReg();
if ((Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) &&
HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
- MI.getOperand(2).isImm() && ((isUInt<5>(MI.getOperand(2).getImm())) ||
- (MI.getOperand(2).getImm() == -1)))
+ (HexagonMCInstrInfo::inRange<5>(MI, 2) ||
+ HexagonMCInstrInfo::minConstant(MI, 2) == -1))
return HexagonII::HCG_A;
break;
case Hexagon::A2_tfr:
@@ -134,8 +134,8 @@ unsigned getCompoundCandidateGroup(MCInst const &MI, bool IsExtended) {
return false;
// Rd = #u6
DstReg = MI.getOperand(0).getReg();
- if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() <= 63 &&
- MI.getOperand(1).getImm() >= 0 &&
+ if (HexagonMCInstrInfo::minConstant(MI, 1) <= 63 &&
+ HexagonMCInstrInfo::minConstant(MI, 1) >= 0 &&
HexagonMCInstrInfo::isIntRegForSubInst(DstReg))
return HexagonII::HCG_A;
break;
@@ -145,9 +145,8 @@ unsigned getCompoundCandidateGroup(MCInst const &MI, bool IsExtended) {
DstReg = MI.getOperand(0).getReg();
Src1Reg = MI.getOperand(1).getReg();
if ((Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) &&
- MI.getOperand(2).isImm() &&
HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
- (MI.getOperand(2).getImm() == 0))
+ HexagonMCInstrInfo::minConstant(MI, 2) == 0)
return HexagonII::HCG_A;
break;
// The fact that .new form is used pretty much guarantees
@@ -206,6 +205,8 @@ MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, MCInst const &R) {
MCInst *CompoundInsn = 0;
unsigned compoundOpcode;
MCOperand Rs, Rt;
+ int64_t Value;
+ bool Success;
switch (L.getOpcode()) {
default:
@@ -277,7 +278,10 @@ MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, MCInst const &R) {
case Hexagon::C2_cmpeqi:
DEBUG(dbgs() << "CX: C2_cmpeqi\n");
- if (L.getOperand(2).getImm() == -1)
+ Success = L.getOperand(2).getExpr()->evaluateAsAbsolute(Value);
+ (void)Success;
+ assert(Success);
+ if (Value == -1)
compoundOpcode = cmpeqn1BitOpcode[getCompoundOp(R)];
else
compoundOpcode = cmpeqiBitOpcode[getCompoundOp(R)];
@@ -286,14 +290,17 @@ MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, MCInst const &R) {
CompoundInsn = new (Context) MCInst;
CompoundInsn->setOpcode(compoundOpcode);
CompoundInsn->addOperand(Rs);
- if (L.getOperand(2).getImm() != -1)
+ if (Value != -1)
CompoundInsn->addOperand(L.getOperand(2));
CompoundInsn->addOperand(R.getOperand(1));
break;
case Hexagon::C2_cmpgti:
DEBUG(dbgs() << "CX: C2_cmpgti\n");
- if (L.getOperand(2).getImm() == -1)
+ Success = L.getOperand(2).getExpr()->evaluateAsAbsolute(Value);
+ (void)Success;
+ assert(Success);
+ if (Value == -1)
compoundOpcode = cmpgtn1BitOpcode[getCompoundOp(R)];
else
compoundOpcode = cmpgtiBitOpcode[getCompoundOp(R)];
@@ -302,7 +309,7 @@ MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, MCInst const &R) {
CompoundInsn = new (Context) MCInst;
CompoundInsn->setOpcode(compoundOpcode);
CompoundInsn->addOperand(Rs);
- if (L.getOperand(2).getImm() != -1)
+ if (Value != -1)
CompoundInsn->addOperand(L.getOperand(2));
CompoundInsn->addOperand(R.getOperand(1));
break;
@@ -404,7 +411,7 @@ bool lookForCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI) {
/// additional slot.
void HexagonMCInstrInfo::tryCompound(MCInstrInfo const &MCII,
MCContext &Context, MCInst &MCI) {
- assert(MCI.getOpcode() == Hexagon::BUNDLE &&
+ assert(HexagonMCInstrInfo::isBundle(MCI) &&
"Non-Bundle where Bundle expected");
// By definition a compound must have 2 insn.
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
index 7e9247cef6ad..e6194f61a6ba 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
@@ -26,7 +26,7 @@ using namespace Hexagon;
#define DEBUG_TYPE "hexagon-mcduplex-info"
// pair table of subInstructions with opcodes
-static std::pair<unsigned, unsigned> opcodeData[] = {
+static const std::pair<unsigned, unsigned> opcodeData[] = {
std::make_pair((unsigned)V4_SA1_addi, 0),
std::make_pair((unsigned)V4_SA1_addrx, 6144),
std::make_pair((unsigned)V4_SA1_addsp, 3072),
@@ -81,8 +81,7 @@ static std::pair<unsigned, unsigned> opcodeData[] = {
std::make_pair((unsigned)V4_SS2_storewi1, 4352)};
static std::map<unsigned, unsigned>
- subinstOpcodeMap(opcodeData,
- opcodeData + sizeof(opcodeData) / sizeof(opcodeData[0]));
+ subinstOpcodeMap(std::begin(opcodeData), std::end(opcodeData));
bool HexagonMCInstrInfo::isDuplexPairMatch(unsigned Ga, unsigned Gb) {
switch (Ga) {
@@ -195,15 +194,13 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
// Special case this one from Group L2.
// Rd = memw(r29+#u5:2)
if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) {
- if (HexagonMCInstrInfo::isIntReg(SrcReg) && Hexagon::R29 == SrcReg &&
- MCI.getOperand(2).isImm() &&
- isShiftedUInt<5, 2>(MCI.getOperand(2).getImm())) {
+ if (HexagonMCInstrInfo::isIntReg(SrcReg) &&
+ Hexagon::R29 == SrcReg && inRange<5, 2>(MCI, 2)) {
return HexagonII::HSIG_L2;
}
// Rd = memw(Rs+#u4:2)
if (HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
- (MCI.getOperand(2).isImm() &&
- isShiftedUInt<4, 2>(MCI.getOperand(2).getImm()))) {
+ inRange<4, 2>(MCI, 2)) {
return HexagonII::HSIG_L1;
}
}
@@ -214,7 +211,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
SrcReg = MCI.getOperand(1).getReg();
if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
- MCI.getOperand(2).isImm() && isUInt<4>(MCI.getOperand(2).getImm())) {
+ inRange<4>(MCI, 2)) {
return HexagonII::HSIG_L1;
}
break;
@@ -235,8 +232,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
SrcReg = MCI.getOperand(1).getReg();
if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
- MCI.getOperand(2).isImm() &&
- isShiftedUInt<3, 1>(MCI.getOperand(2).getImm())) {
+ inRange<3, 1>(MCI, 2)) {
return HexagonII::HSIG_L2;
}
break;
@@ -246,7 +242,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
SrcReg = MCI.getOperand(1).getReg();
if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
- MCI.getOperand(2).isImm() && isUInt<3>(MCI.getOperand(2).getImm())) {
+ inRange<3>(MCI, 2)) {
return HexagonII::HSIG_L2;
}
break;
@@ -256,8 +252,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
SrcReg = MCI.getOperand(1).getReg();
if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) &&
HexagonMCInstrInfo::isIntReg(SrcReg) && Hexagon::R29 == SrcReg &&
- MCI.getOperand(2).isImm() &&
- isShiftedUInt<5, 3>(MCI.getOperand(2).getImm())) {
+ inRange<5, 3>(MCI, 2)) {
return HexagonII::HSIG_L2;
}
break;
@@ -326,15 +321,13 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
Src2Reg = MCI.getOperand(2).getReg();
if (HexagonMCInstrInfo::isIntReg(Src1Reg) &&
HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) &&
- Hexagon::R29 == Src1Reg && MCI.getOperand(1).isImm() &&
- isShiftedUInt<5, 2>(MCI.getOperand(1).getImm())) {
+ Hexagon::R29 == Src1Reg && inRange<5, 2>(MCI, 1)) {
return HexagonII::HSIG_S2;
}
// memw(Rs+#u4:2) = Rt
if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) &&
- MCI.getOperand(1).isImm() &&
- isShiftedUInt<4, 2>(MCI.getOperand(1).getImm())) {
+ inRange<4, 2>(MCI, 1)) {
return HexagonII::HSIG_S1;
}
break;
@@ -344,7 +337,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
Src2Reg = MCI.getOperand(2).getReg();
if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) &&
- MCI.getOperand(1).isImm() && isUInt<4>(MCI.getOperand(1).getImm())) {
+ inRange<4>(MCI, 1)) {
return HexagonII::HSIG_S1;
}
break;
@@ -363,8 +356,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
Src2Reg = MCI.getOperand(2).getReg();
if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) &&
- MCI.getOperand(1).isImm() &&
- isShiftedUInt<3, 1>(MCI.getOperand(1).getImm())) {
+ inRange<3, 1>(MCI, 1)) {
return HexagonII::HSIG_S2;
}
break;
@@ -374,8 +366,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
Src2Reg = MCI.getOperand(2).getReg();
if (HexagonMCInstrInfo::isDblRegForSubInst(Src2Reg) &&
HexagonMCInstrInfo::isIntReg(Src1Reg) && Hexagon::R29 == Src1Reg &&
- MCI.getOperand(1).isImm() &&
- isShiftedInt<6, 3>(MCI.getOperand(1).getImm())) {
+ inSRange<6, 3>(MCI, 1)) {
return HexagonII::HSIG_S2;
}
break;
@@ -383,9 +374,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
// memw(Rs+#u4:2) = #U1
Src1Reg = MCI.getOperand(0).getReg();
if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
- MCI.getOperand(1).isImm() &&
- isShiftedUInt<4, 2>(MCI.getOperand(1).getImm()) &&
- MCI.getOperand(2).isImm() && isUInt<1>(MCI.getOperand(2).getImm())) {
+ inRange<4, 2>(MCI, 1) && inRange<1>(MCI, 2)) {
return HexagonII::HSIG_S2;
}
break;
@@ -393,16 +382,13 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
// memb(Rs+#u4) = #U1
Src1Reg = MCI.getOperand(0).getReg();
if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
- MCI.getOperand(1).isImm() && isUInt<4>(MCI.getOperand(1).getImm()) &&
- MCI.getOperand(2).isImm() && isUInt<1>(MCI.getOperand(2).getImm())) {
+ inRange<4>(MCI, 1) && inRange<1>(MCI, 2)) {
return HexagonII::HSIG_S2;
}
break;
case Hexagon::S2_allocframe:
- if (MCI.getOperand(0).isImm() &&
- isShiftedUInt<5, 3>(MCI.getOperand(0).getImm())) {
+ if (inRange<5, 3>(MCI, 0))
return HexagonII::HSIG_S2;
- }
break;
//
// Group A:
@@ -428,8 +414,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) {
// Rd = add(r29,#u6:2)
if (HexagonMCInstrInfo::isIntReg(SrcReg) && Hexagon::R29 == SrcReg &&
- MCI.getOperand(2).isImm() &&
- isShiftedUInt<6, 2>(MCI.getOperand(2).getImm())) {
+ inRange<6, 2>(MCI, 2)) {
return HexagonII::HSIG_A;
}
// Rx = add(Rx,#s7)
@@ -439,8 +424,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
// Rd = add(Rs,#1)
// Rd = add(Rs,#-1)
if (HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
- MCI.getOperand(2).isImm() && ((MCI.getOperand(2).getImm() == 1) ||
- (MCI.getOperand(2).getImm() == -1))) {
+ (minConstant(MCI, 2) == 1 || minConstant(MCI, 2) == -1)) {
return HexagonII::HSIG_A;
}
}
@@ -460,8 +444,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
SrcReg = MCI.getOperand(1).getReg();
if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
- MCI.getOperand(2).isImm() && ((MCI.getOperand(2).getImm() == 1) ||
- (MCI.getOperand(2).getImm() == 255))) {
+ (minConstant(MCI, 2) == 1 || minConstant(MCI, 2) == 255)) {
return HexagonII::HSIG_A;
}
break;
@@ -491,8 +474,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
DstReg = MCI.getOperand(0).getReg(); // Rd
PredReg = MCI.getOperand(1).getReg(); // P0
if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
- Hexagon::P0 == PredReg && MCI.getOperand(2).isImm() &&
- MCI.getOperand(2).getImm() == 0) {
+ Hexagon::P0 == PredReg && minConstant(MCI, 2) == 0) {
return HexagonII::HSIG_A;
}
break;
@@ -502,7 +484,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
SrcReg = MCI.getOperand(1).getReg();
if (Hexagon::P0 == DstReg &&
HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
- MCI.getOperand(2).isImm() && isUInt<2>(MCI.getOperand(2).getImm())) {
+ inRange<2>(MCI, 2)) {
return HexagonII::HSIG_A;
}
break;
@@ -511,10 +493,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
// Rdd = combine(#u2,#U2)
DstReg = MCI.getOperand(0).getReg();
if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) &&
- // TODO: Handle Globals/Symbols
- (MCI.getOperand(1).isImm() && isUInt<2>(MCI.getOperand(1).getImm())) &&
- ((MCI.getOperand(2).isImm() &&
- isUInt<2>(MCI.getOperand(2).getImm())))) {
+ inRange<2>(MCI, 1) && inRange<2>(MCI, 2)) {
return HexagonII::HSIG_A;
}
break;
@@ -524,7 +503,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
SrcReg = MCI.getOperand(1).getReg();
if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) &&
HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
- (MCI.getOperand(2).isImm() && MCI.getOperand(2).getImm() == 0)) {
+ minConstant(MCI, 2) == 0) {
return HexagonII::HSIG_A;
}
break;
@@ -534,7 +513,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
SrcReg = MCI.getOperand(2).getReg();
if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) &&
HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
- (MCI.getOperand(1).isImm() && MCI.getOperand(1).getImm() == 0)) {
+ minConstant(MCI, 1) == 0) {
return HexagonII::HSIG_A;
}
break;
@@ -556,19 +535,17 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
}
bool HexagonMCInstrInfo::subInstWouldBeExtended(MCInst const &potentialDuplex) {
-
unsigned DstReg, SrcReg;
-
switch (potentialDuplex.getOpcode()) {
case Hexagon::A2_addi:
// testing for case of: Rx = add(Rx,#s7)
DstReg = potentialDuplex.getOperand(0).getReg();
SrcReg = potentialDuplex.getOperand(1).getReg();
if (DstReg == SrcReg && HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) {
- if (potentialDuplex.getOperand(2).isExpr())
+ int64_t Value;
+ if (!potentialDuplex.getOperand(2).getExpr()->evaluateAsAbsolute(Value))
return true;
- if (potentialDuplex.getOperand(2).isImm() &&
- !(isShiftedInt<7, 0>(potentialDuplex.getOperand(2).getImm())))
+ if (!isShiftedInt<7, 0>(Value))
return true;
}
break;
@@ -576,15 +553,14 @@ bool HexagonMCInstrInfo::subInstWouldBeExtended(MCInst const &potentialDuplex) {
DstReg = potentialDuplex.getOperand(0).getReg();
if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) {
- if (potentialDuplex.getOperand(1).isExpr())
+ int64_t Value;
+ if (!potentialDuplex.getOperand(1).getExpr()->evaluateAsAbsolute(Value))
return true;
// Check for case of Rd = #-1.
- if (potentialDuplex.getOperand(1).isImm() &&
- (potentialDuplex.getOperand(1).getImm() == -1))
+ if (Value == -1)
return false;
// Check for case of Rd = #u6.
- if (potentialDuplex.getOperand(1).isImm() &&
- !isShiftedUInt<6, 0>(potentialDuplex.getOperand(1).getImm()))
+ if (!isShiftedUInt<6, 0>(Value))
return true;
}
break;
@@ -712,19 +688,23 @@ inline static void addOps(MCInst &subInstPtr, MCInst const &Inst,
MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
MCInst Result;
+ bool Absolute;
+ int64_t Value;
switch (Inst.getOpcode()) {
default:
// dbgs() << "opcode: "<< Inst->getOpcode() << "\n";
llvm_unreachable("Unimplemented subinstruction \n");
break;
case Hexagon::A2_addi:
- if (Inst.getOperand(2).isImm() && Inst.getOperand(2).getImm() == 1) {
+ Absolute = Inst.getOperand(2).getExpr()->evaluateAsAbsolute(Value);
+ assert(Absolute); (void)Absolute;
+ if (Value == 1) {
Result.setOpcode(Hexagon::V4_SA1_inc);
addOps(Result, Inst, 0);
addOps(Result, Inst, 1);
break;
} // 1,2 SUBInst $Rd = add($Rs, #1)
- else if (Inst.getOperand(2).isImm() && Inst.getOperand(2).getImm() == -1) {
+ else if (Value == -1) {
Result.setOpcode(Hexagon::V4_SA1_dec);
addOps(Result, Inst, 0);
addOps(Result, Inst, 1);
@@ -754,7 +734,7 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
addOps(Result, Inst, 0);
break; // 1 SUBInst allocframe(#$u5_3)
case Hexagon::A2_andir:
- if (Inst.getOperand(2).getImm() == 255) {
+ if (minConstant(Inst, 2) == 255) {
Result.setOpcode(Hexagon::V4_SA1_zxtb);
addOps(Result, Inst, 0);
addOps(Result, Inst, 1);
@@ -772,26 +752,27 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
break; // 2,3 SUBInst p0 = cmp.eq($Rs, #$u2)
case Hexagon::A4_combineii:
case Hexagon::A2_combineii:
- if (Inst.getOperand(1).getImm() == 1) {
+ Absolute = Inst.getOperand(1).getExpr()->evaluateAsAbsolute(Value);
+ assert(Absolute); (void)Absolute;
+ if (Value == 1) {
Result.setOpcode(Hexagon::V4_SA1_combine1i);
addOps(Result, Inst, 0);
addOps(Result, Inst, 2);
break; // 1,3 SUBInst $Rdd = combine(#1, #$u2)
}
-
- if (Inst.getOperand(1).getImm() == 3) {
+ if (Value == 3) {
Result.setOpcode(Hexagon::V4_SA1_combine3i);
addOps(Result, Inst, 0);
addOps(Result, Inst, 2);
break; // 1,3 SUBInst $Rdd = combine(#3, #$u2)
}
- if (Inst.getOperand(1).getImm() == 0) {
+ if (Value == 0) {
Result.setOpcode(Hexagon::V4_SA1_combine0i);
addOps(Result, Inst, 0);
addOps(Result, Inst, 2);
break; // 1,3 SUBInst $Rdd = combine(#0, #$u2)
}
- if (Inst.getOperand(1).getImm() == 2) {
+ if (Value == 2) {
Result.setOpcode(Hexagon::V4_SA1_combine2i);
addOps(Result, Inst, 0);
addOps(Result, Inst, 2);
@@ -894,12 +875,14 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
break; // 1,2,3 SUBInst $Rd = memw($Rs + #$u4_2)
}
case Hexagon::S4_storeirb_io:
- if (Inst.getOperand(2).getImm() == 0) {
+ Absolute = Inst.getOperand(2).getExpr()->evaluateAsAbsolute(Value);
+ assert(Absolute); (void)Absolute;
+ if (Value == 0) {
Result.setOpcode(Hexagon::V4_SS2_storebi0);
addOps(Result, Inst, 0);
addOps(Result, Inst, 1);
break; // 1,2 SUBInst memb($Rs + #$u4_0)=#0
- } else if (Inst.getOperand(2).getImm() == 1) {
+ } else if (Value == 1) {
Result.setOpcode(Hexagon::V4_SS2_storebi1);
addOps(Result, Inst, 0);
addOps(Result, Inst, 1);
@@ -923,12 +906,14 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
addOps(Result, Inst, 2);
break; // 1,2,3 SUBInst memb($Rs + #$u4_0) = $Rt
case Hexagon::S4_storeiri_io:
- if (Inst.getOperand(2).getImm() == 0) {
+ Absolute = Inst.getOperand(2).getExpr()->evaluateAsAbsolute(Value);
+ assert(Absolute); (void)Absolute;
+ if (Value == 0) {
Result.setOpcode(Hexagon::V4_SS2_storewi0);
addOps(Result, Inst, 0);
addOps(Result, Inst, 1);
break; // 3 1,2 SUBInst memw($Rs + #$u4_2)=#0
- } else if (Inst.getOperand(2).getImm() == 1) {
+ } else if (Value == 1) {
Result.setOpcode(Hexagon::V4_SS2_storewi1);
addOps(Result, Inst, 0);
addOps(Result, Inst, 1);
@@ -983,7 +968,8 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
addOps(Result, Inst, 0);
break; // 2 SUBInst if (p0) $Rd = #0
case Hexagon::A2_tfrsi:
- if (Inst.getOperand(1).isImm() && Inst.getOperand(1).getImm() == -1) {
+ Absolute = Inst.getOperand(1).getExpr()->evaluateAsAbsolute(Value);
+ if (Absolute && Value == -1) {
Result.setOpcode(Hexagon::V4_SA1_setin1);
addOps(Result, Inst, 0);
break; // 2 1 SUBInst $Rd = #-1
@@ -1044,6 +1030,8 @@ HexagonMCInstrInfo::getDuplexPossibilties(MCInstrInfo const &MCII,
<< "\n");
bisReversable = false;
}
+ if (HexagonMCInstrInfo::isMemReorderDisabled(MCB)) // }:mem_noshuf
+ bisReversable = false;
// Try in order.
if (isOrderedDuplexPair(
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
index bf51c3515e95..eaa3550d07f6 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
@@ -37,9 +37,7 @@ static cl::opt<unsigned>
void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCK,
const MCSubtargetInfo &STI) {
- MCInst HMI;
- HMI.setOpcode(Hexagon::BUNDLE);
- HMI.addOperand(MCOperand::createImm(0));
+ MCInst HMI = HexagonMCInstrInfo::createBundle();
MCInst *MCB;
if (MCK.getOpcode() != Hexagon::BUNDLE) {
@@ -50,7 +48,7 @@ void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCK,
// Examines packet and pad the packet, if needed, when an
// end-loop is in the bundle.
- HexagonMCInstrInfo::padEndloop(*MCB);
+ HexagonMCInstrInfo::padEndloop(getContext(), *MCB);
HexagonMCShuffle(*MCII, STI, *MCB);
assert(HexagonMCInstrInfo::bundleSize(*MCB) <= HEXAGON_PACKET_SIZE);
@@ -60,9 +58,9 @@ void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCK,
if (Extended) {
if (HexagonMCInstrInfo::isDuplex(*MCII, *MCI)) {
MCInst *SubInst = const_cast<MCInst *>(MCI->getOperand(1).getInst());
- HexagonMCInstrInfo::clampExtended(*MCII, *SubInst);
+ HexagonMCInstrInfo::clampExtended(*MCII, getContext(), *SubInst);
} else {
- HexagonMCInstrInfo::clampExtended(*MCII, *MCI);
+ HexagonMCInstrInfo::clampExtended(*MCII, getContext(), *MCI);
}
Extended = false;
} else {
@@ -114,7 +112,7 @@ void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol,
MCSection *Section = getAssembler().getContext().getELFSection(
SectionName, ELF::SHT_NOBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
SwitchSection(Section);
- AssignSection(Symbol, Section);
+ AssignFragment(Symbol, getCurrentFragment());
MCELFStreamer::EmitCommonSymbol(Symbol, Size, ByteAlignment);
SwitchSection(CrntSection);
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
new file mode 100644
index 000000000000..fc6262657514
--- /dev/null
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
@@ -0,0 +1,49 @@
+//===-- HexagonMCExpr.cpp - Hexagon specific MC expression classes -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-mcexpr"
+
+HexagonNoExtendOperand *HexagonNoExtendOperand::Create(MCExpr const *Expr,
+ MCContext &Ctx) {
+ return new (Ctx) HexagonNoExtendOperand(Expr);
+}
+
+bool HexagonNoExtendOperand::evaluateAsRelocatableImpl(
+ MCValue &Res, MCAsmLayout const *Layout, MCFixup const *Fixup) const {
+ return Expr->evaluateAsRelocatable(Res, Layout, Fixup);
+}
+
+void HexagonNoExtendOperand::visitUsedExpr(MCStreamer &Streamer) const {}
+
+MCFragment *llvm::HexagonNoExtendOperand::findAssociatedFragment() const {
+ return Expr->findAssociatedFragment();
+}
+
+void HexagonNoExtendOperand::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {}
+
+MCExpr const *HexagonNoExtendOperand::getExpr() const { return Expr; }
+
+bool HexagonNoExtendOperand::classof(MCExpr const *E) {
+ return E->getKind() == MCExpr::Target;
+}
+
+HexagonNoExtendOperand::HexagonNoExtendOperand(MCExpr const *Expr)
+ : Expr(Expr) {}
+
+void HexagonNoExtendOperand::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+ Expr->print(OS, MAI);
+}
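A brief usage sketch for the wrapper above (hedged: the intent, marking an operand that should not be constant-extended, is inferred from the class name; the calls themselves are the ones defined in this file and in LLVM's MC layer, and the include path assumes the file sits next to the Hexagon MC sources):

  #include "HexagonMCExpr.h"
  #include "llvm/MC/MCContext.h"
  #include "llvm/MC/MCExpr.h"
  #include "llvm/MC/MCInst.h"

  // Wrap an absolute expression so later passes can recognise it as an
  // operand that is not to be constant-extended.
  static llvm::MCOperand makeNoExtendOperand(int64_t Imm, llvm::MCContext &Ctx) {
    const llvm::MCExpr *Inner = llvm::MCConstantExpr::create(Imm, Ctx);
    llvm::HexagonNoExtendOperand *Wrapped =
        llvm::HexagonNoExtendOperand::Create(Inner, Ctx);
    // classof() keys off MCExpr::Target, so isa/dyn_cast work on the wrapper.
    return llvm::MCOperand::createExpr(Wrapped);
  }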
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h
new file mode 100644
index 000000000000..60f180fb2bc4
--- /dev/null
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h
@@ -0,0 +1,35 @@
+//==- HexagonMCExpr.h - Hexagon specific MC expression classes --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONMCEXPR_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONMCEXPR_H
+
+#include "llvm/MC/MCExpr.h"
+
+namespace llvm {
+class MCInst;
+class HexagonNoExtendOperand : public MCTargetExpr {
+public:
+ static HexagonNoExtendOperand *Create(MCExpr const *Expr, MCContext &Ctx);
+ void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+ bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const override;
+ void visitUsedExpr(MCStreamer &Streamer) const override;
+ MCFragment *findAssociatedFragment() const override;
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
+ static bool classof(MCExpr const *E);
+ MCExpr const *getExpr() const;
+
+private:
+ HexagonNoExtendOperand(MCExpr const *Expr);
+ MCExpr const *Expr;
+};
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONMCEXPR_H
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
index 48b15f85a783..e6842076db2a 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
@@ -15,17 +15,37 @@
#include "Hexagon.h"
#include "HexagonBaseInfo.h"
+#include "HexagonMCChecker.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
namespace llvm {
+void HexagonMCInstrInfo::addConstant(MCInst &MI, uint64_t Value,
+ MCContext &Context) {
+ MI.addOperand(MCOperand::createExpr(MCConstantExpr::create(Value, Context)));
+}
+
+void HexagonMCInstrInfo::addConstExtender(MCContext &Context,
+ MCInstrInfo const &MCII, MCInst &MCB,
+ MCInst const &MCI) {
+ assert(HexagonMCInstrInfo::isBundle(MCB));
+ MCOperand const &exOp =
+ MCI.getOperand(HexagonMCInstrInfo::getExtendableOp(MCII, MCI));
+
+ // Create the extender.
+ MCInst *XMCI =
+ new (Context) MCInst(HexagonMCInstrInfo::deriveExtender(MCII, MCI, exOp));
+
+ MCB.addOperand(MCOperand::createInst(XMCI));
+}
+
iterator_range<MCInst::const_iterator>
HexagonMCInstrInfo::bundleInstructions(MCInst const &MCI) {
assert(isBundle(MCI));
- return iterator_range<MCInst::const_iterator>(
- MCI.begin() + bundleInstructionsOffset, MCI.end());
+ return make_range(MCI.begin() + bundleInstructionsOffset, MCI.end());
}
size_t HexagonMCInstrInfo::bundleSize(MCInst const &MCI) {
@@ -35,7 +55,40 @@ size_t HexagonMCInstrInfo::bundleSize(MCInst const &MCI) {
return (1);
}
-void HexagonMCInstrInfo::clampExtended(MCInstrInfo const &MCII, MCInst &MCI) {
+bool HexagonMCInstrInfo::canonicalizePacket(MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI,
+ MCContext &Context, MCInst &MCB,
+ HexagonMCChecker *Check) {
+ // Examine the packet and convert pairs of instructions to compound
+ // instructions when possible.
+ if (!HexagonDisableCompound)
+ HexagonMCInstrInfo::tryCompound(MCII, Context, MCB);
+ // Check the bundle for errors.
+ bool CheckOk = Check ? Check->check() : true;
+ if (!CheckOk)
+ return false;
+ HexagonMCShuffle(MCII, STI, MCB);
+ // Examine the packet and convert pairs of instructions to duplex
+ // instructions when possible.
+ MCInst InstBundlePreDuplex = MCInst(MCB);
+ if (!HexagonDisableDuplex) {
+ SmallVector<DuplexCandidate, 8> possibleDuplexes;
+ possibleDuplexes = HexagonMCInstrInfo::getDuplexPossibilties(MCII, MCB);
+ HexagonMCShuffle(MCII, STI, Context, MCB, possibleDuplexes);
+ }
+ // Examine the packet and pad it, if needed, when an end-loop is in the
+ // bundle.
+ HexagonMCInstrInfo::padEndloop(Context, MCB);
+ // If compounding and duplexing didn't reduce the size to at most four
+ // instructions, the packet is too big.
+ if (HexagonMCInstrInfo::bundleSize(MCB) > HEXAGON_PACKET_SIZE)
+ return false;
+ HexagonMCShuffle(MCII, STI, MCB);
+ return true;
+}
+
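A usage sketch for canonicalizePacket (a sketch only, not part of the patch): it assumes an MCContext, MCInstrInfo, MCSubtargetInfo and the packet's instructions are supplied by the caller, and it passes a null checker, which the null-check in the function above permits. The include paths assume the file sits next to the Hexagon MC sources.

  #include "HexagonMCInstrInfo.h"
  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/MC/MCContext.h"
  #include "llvm/MC/MCInst.h"
  #include "llvm/MC/MCInstrInfo.h"
  #include "llvm/MC/MCSubtargetInfo.h"

  // Build a bundle, append each instruction, then compound/duplex/pad/shuffle.
  static bool emitPacket(llvm::MCInstrInfo const &MCII,
                         llvm::MCSubtargetInfo const &STI,
                         llvm::MCContext &Context,
                         llvm::ArrayRef<llvm::MCInst> Insts) {
    llvm::MCInst Bundle = llvm::HexagonMCInstrInfo::createBundle();
    for (llvm::MCInst const &I : Insts)
      Bundle.addOperand(
          llvm::MCOperand::createInst(new (Context) llvm::MCInst(I)));
    // A HexagonMCChecker could be passed instead of nullptr to reject
    // ill-formed packets before shuffling.
    return llvm::HexagonMCInstrInfo::canonicalizePacket(MCII, STI, Context,
                                                        Bundle, nullptr);
  }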
+void HexagonMCInstrInfo::clampExtended(MCInstrInfo const &MCII,
+ MCContext &Context, MCInst &MCI) {
assert(HexagonMCInstrInfo::isExtendable(MCII, MCI) ||
HexagonMCInstrInfo::isExtended(MCII, MCI));
MCOperand &exOp =
@@ -43,13 +96,20 @@ void HexagonMCInstrInfo::clampExtended(MCInstrInfo const &MCII, MCInst &MCI) {
// If the extended value is a constant, then use it for the extended and
// for the extender instructions, masking off the lower 6 bits and
// including the assumed bits.
- if (exOp.isImm()) {
+ int64_t Value;
+ if (exOp.getExpr()->evaluateAsAbsolute(Value)) {
unsigned Shift = HexagonMCInstrInfo::getExtentAlignment(MCII, MCI);
- int64_t Bits = exOp.getImm();
- exOp.setImm((Bits & 0x3f) << Shift);
+ exOp.setExpr(MCConstantExpr::create((Value & 0x3f) << Shift, Context));
}
}
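A worked example of the 6-bit split that clampExtended (above) and deriveExtender perform on an absolute extended value (a standalone sketch of the arithmetic only; the masks are the ones used in those functions, the sample value and the zero alignment shift are arbitrary choices for illustration):

  #include <cassert>
  #include <cstdint>

  int main() {
    const int64_t Value = 0x12345;  // full extended immediate
    const unsigned Shift = 0;       // getExtentAlignment() for an unscaled operand

    // deriveExtender keeps everything above the low 6 bits in the A4_ext insn.
    const int64_t ExtenderBits = Value & ~int64_t(0x3f);
    // clampExtended leaves only the low 6 bits (shifted back into place) in
    // the extended instruction itself.
    const int64_t ExtendedBits = (Value & 0x3f) << Shift;

    assert(ExtenderBits == 0x12340);
    assert(ExtendedBits == 0x05);
    assert((ExtenderBits | ExtendedBits) == Value); // together they restore it
    return 0;
  }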
+MCInst HexagonMCInstrInfo::createBundle() {
+ MCInst Result;
+ Result.setOpcode(Hexagon::BUNDLE);
+ Result.addOperand(MCOperand::createImm(0));
+ return Result;
+}
+
MCInst *HexagonMCInstrInfo::deriveDuplex(MCContext &Context, unsigned iClass,
MCInst const &inst0,
MCInst const &inst1) {
@@ -64,6 +124,27 @@ MCInst *HexagonMCInstrInfo::deriveDuplex(MCContext &Context, unsigned iClass,
return duplexInst;
}
+MCInst HexagonMCInstrInfo::deriveExtender(MCInstrInfo const &MCII,
+ MCInst const &Inst,
+ MCOperand const &MO) {
+ assert(HexagonMCInstrInfo::isExtendable(MCII, Inst) ||
+ HexagonMCInstrInfo::isExtended(MCII, Inst));
+
+ MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, Inst);
+ MCInst XMI;
+ XMI.setOpcode((Desc.isBranch() || Desc.isCall() ||
+ HexagonMCInstrInfo::getType(MCII, Inst) == HexagonII::TypeCR)
+ ? Hexagon::A4_ext_b
+ : Hexagon::A4_ext);
+ if (MO.isImm())
+ XMI.addOperand(MCOperand::createImm(MO.getImm() & (~0x3f)));
+ else if (MO.isExpr())
+ XMI.addOperand(MCOperand::createExpr(MO.getExpr()));
+ else
+ llvm_unreachable("invalid extendable operand");
+ return XMI;
+}
+
MCInst const *HexagonMCInstrInfo::extenderForIndex(MCInst const &MCB,
size_t Index) {
assert(Index <= bundleSize(MCB));
@@ -76,6 +157,13 @@ MCInst const *HexagonMCInstrInfo::extenderForIndex(MCInst const &MCB,
return nullptr;
}
+void HexagonMCInstrInfo::extendIfNeeded(MCContext &Context,
+ MCInstrInfo const &MCII, MCInst &MCB,
+ MCInst const &MCI, bool MustExtend) {
+ if (isConstExtended(MCII, MCI) || MustExtend)
+ addConstExtender(Context, MCII, MCB, MCI);
+}
+
HexagonII::MemAccessSize
HexagonMCInstrInfo::getAccessSize(MCInstrInfo const &MCII, MCInst const &MCI) {
const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
@@ -186,6 +274,25 @@ MCOperand const &HexagonMCInstrInfo::getNewValueOperand(MCInstrInfo const &MCII,
return (MCO);
}
+/// Return the index of the operand that carries the second new value.
+unsigned short HexagonMCInstrInfo::getNewValueOp2(MCInstrInfo const &MCII,
+ MCInst const &MCI) {
+ const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+ return ((F >> HexagonII::NewValueOpPos2) & HexagonII::NewValueOpMask2);
+}
+
+MCOperand const &
+HexagonMCInstrInfo::getNewValueOperand2(MCInstrInfo const &MCII,
+ MCInst const &MCI) {
+ unsigned O = HexagonMCInstrInfo::getNewValueOp2(MCII, MCI);
+ MCOperand const &MCO = MCI.getOperand(O);
+
+ assert((HexagonMCInstrInfo::isNewValue(MCII, MCI) ||
+ HexagonMCInstrInfo::hasNewValue2(MCII, MCI)) &&
+ MCO.isReg());
+ return (MCO);
+}
+
int HexagonMCInstrInfo::getSubTarget(MCInstrInfo const &MCII,
MCInst const &MCI) {
const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
@@ -242,6 +349,13 @@ bool HexagonMCInstrInfo::hasNewValue(MCInstrInfo const &MCII,
return ((F >> HexagonII::hasNewValuePos) & HexagonII::hasNewValueMask);
}
+/// Return whether the insn produces a second value.
+bool HexagonMCInstrInfo::hasNewValue2(MCInstrInfo const &MCII,
+ MCInst const &MCI) {
+ const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+ return ((F >> HexagonII::hasNewValuePos2) & HexagonII::hasNewValueMask2);
+}
+
MCInst const &HexagonMCInstrInfo::instruction(MCInst const &MCB, size_t Index) {
assert(isBundle(MCB));
assert(Index < HEXAGON_PACKET_SIZE);
@@ -261,6 +375,11 @@ bool HexagonMCInstrInfo::isCanon(MCInstrInfo const &MCII, MCInst const &MCI) {
HexagonMCInstrInfo::getType(MCII, MCI) != HexagonII::TypeENDLOOP);
}
+bool HexagonMCInstrInfo::isCompound(MCInstrInfo const &MCII,
+ MCInst const &MCI) {
+ return (getType(MCII, MCI) == HexagonII::TypeCOMPOUND);
+}
+
bool HexagonMCInstrInfo::isDblRegForSubInst(unsigned Reg) {
return ((Reg >= Hexagon::D0 && Reg <= Hexagon::D3) ||
(Reg >= Hexagon::D8 && Reg <= Hexagon::D11));
@@ -282,14 +401,21 @@ bool HexagonMCInstrInfo::isConstExtended(MCInstrInfo const &MCII,
MCInst const &MCI) {
if (HexagonMCInstrInfo::isExtended(MCII, MCI))
return true;
-
- if (!HexagonMCInstrInfo::isExtendable(MCII, MCI))
+ // Branch insns are handled as necessary by relaxation.
+ if ((HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeJ) ||
+ (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCOMPOUND &&
+ HexagonMCInstrInfo::getDesc(MCII, MCI).isBranch()) ||
+ (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeNV &&
+ HexagonMCInstrInfo::getDesc(MCII, MCI).isBranch()))
+ return false;
+ // Otherwise loop instructions and other CR insts are handled by relaxation
+ else if ((HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCR) &&
+ (MCI.getOpcode() != Hexagon::C4_addipc))
+ return false;
+ else if (!HexagonMCInstrInfo::isExtendable(MCII, MCI))
return false;
- short ExtOpNum = HexagonMCInstrInfo::getCExtOpNum(MCII, MCI);
- int MinValue = HexagonMCInstrInfo::getMinValue(MCII, MCI);
- int MaxValue = HexagonMCInstrInfo::getMaxValue(MCII, MCI);
- MCOperand const &MO = MCI.getOperand(ExtOpNum);
+ MCOperand const &MO = HexagonMCInstrInfo::getExtendableOperand(MCII, MCI);
// We could be using an instruction with an extendable immediate and shoehorn
// a global address into it. If it is a global address it will be constant
@@ -297,15 +423,13 @@ bool HexagonMCInstrInfo::isConstExtended(MCInstrInfo const &MCII,
// We currently only handle isGlobal() because it is the only kind of
// object we are going to end up with here for now.
// In the future we probably should add isSymbol(), etc.
- if (MO.isExpr())
+ assert(!MO.isImm());
+ int64_t Value;
+ if (!MO.getExpr()->evaluateAsAbsolute(Value))
return true;
-
- // If the extendable operand is not 'Immediate' type, the instruction should
- // have 'isExtended' flag set.
- assert(MO.isImm() && "Extendable operand must be Immediate type");
-
- int ImmValue = MO.getImm();
- return (ImmValue < MinValue || ImmValue > MaxValue);
+ int MinValue = HexagonMCInstrInfo::getMinValue(MCII, MCI);
+ int MaxValue = HexagonMCInstrInfo::getMaxValue(MCII, MCI);
+ return (MinValue > Value || Value > MaxValue);
}
bool HexagonMCInstrInfo::isExtendable(MCInstrInfo const &MCII,
@@ -374,6 +498,19 @@ bool HexagonMCInstrInfo::isPredicated(MCInstrInfo const &MCII,
return ((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask);
}
+bool HexagonMCInstrInfo::isPredicateLate(MCInstrInfo const &MCII,
+ MCInst const &MCI) {
+ const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+ return (F >> HexagonII::PredicateLatePos & HexagonII::PredicateLateMask);
+}
+
+/// Return whether the insn is predicated on a .new predicate value.
+bool HexagonMCInstrInfo::isPredicatedNew(MCInstrInfo const &MCII,
+ MCInst const &MCI) {
+ const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+ return ((F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask);
+}
+
bool HexagonMCInstrInfo::isPredicatedTrue(MCInstrInfo const &MCII,
MCInst const &MCI) {
const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
@@ -394,6 +531,18 @@ bool HexagonMCInstrInfo::isSolo(MCInstrInfo const &MCII, MCInst const &MCI) {
return ((F >> HexagonII::SoloPos) & HexagonII::SoloMask);
}
+bool HexagonMCInstrInfo::isMemReorderDisabled(MCInst const &MCI) {
+ assert(isBundle(MCI));
+ auto Flags = MCI.getOperand(0).getImm();
+ return (Flags & memReorderDisabledMask) != 0;
+}
+
+bool HexagonMCInstrInfo::isMemStoreReorderEnabled(MCInst const &MCI) {
+ assert(isBundle(MCI));
+ auto Flags = MCI.getOperand(0).getImm();
+ return (Flags & memStoreReorderEnabledMask) != 0;
+}
+
bool HexagonMCInstrInfo::isSoloAX(MCInstrInfo const &MCII, MCInst const &MCI) {
const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
return ((F >> HexagonII::SoloAXPos) & HexagonII::SoloAXMask);
@@ -405,7 +554,28 @@ bool HexagonMCInstrInfo::isSoloAin1(MCInstrInfo const &MCII,
return ((F >> HexagonII::SoloAin1Pos) & HexagonII::SoloAin1Mask);
}
-void HexagonMCInstrInfo::padEndloop(MCInst &MCB) {
+bool HexagonMCInstrInfo::isVector(MCInstrInfo const &MCII, MCInst const &MCI) {
+ if ((getType(MCII, MCI) <= HexagonII::TypeCVI_LAST) &&
+ (getType(MCII, MCI) >= HexagonII::TypeCVI_FIRST))
+ return true;
+ return false;
+}
+
+int64_t HexagonMCInstrInfo::minConstant(MCInst const &MCI, size_t Index) {
+ auto Sentinel = static_cast<int64_t>(std::numeric_limits<uint32_t>::max())
+ << 8;
+ if (MCI.size() <= Index)
+ return Sentinel;
+ MCOperand const &MCO = MCI.getOperand(Index);
+ if (!MCO.isExpr())
+ return Sentinel;
+ int64_t Value;
+ if (!MCO.getExpr()->evaluateAsAbsolute(Value))
+ return Sentinel;
+ return Value;
+}
+
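The sentinel returned by minConstant above deserves a short note: it is (UINT32_MAX << 8), a value that no comparison against a real Hexagon immediate (== 0, == 1, == 255, or the isUInt/isShiftedUInt range checks used by the inRange<> templates) will ever match, so callers need no separate "operand is a constant" test. A standalone sketch of that arithmetic (not part of the patch):

  #include <cassert>
  #include <cstdint>
  #include <limits>

  int main() {
    // Same expression as in minConstant(): well outside any encodable
    // immediate for the small bit-widths checked by callers.
    const int64_t Sentinel =
        static_cast<int64_t>(std::numeric_limits<uint32_t>::max()) << 8;

    assert(Sentinel == 0xFFFFFFFF00LL);
    assert(Sentinel != 0 && Sentinel != 1 && Sentinel != -1 && Sentinel != 255);
    // isUInt<N>-style range checks also reject it, e.g. for N = 6:
    assert(!(Sentinel >= 0 && Sentinel < (int64_t(1) << 6)));
    return 0;
  }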
+void HexagonMCInstrInfo::padEndloop(MCContext &Context, MCInst &MCB) {
MCInst Nop;
Nop.setOpcode(Hexagon::A2_nop);
assert(isBundle(MCB));
@@ -413,7 +583,7 @@ void HexagonMCInstrInfo::padEndloop(MCInst &MCB) {
(HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_INNER_SIZE)) ||
((HexagonMCInstrInfo::isOuterLoop(MCB) &&
(HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_OUTER_SIZE))))
- MCB.addOperand(MCOperand::createInst(new MCInst(Nop)));
+ MCB.addOperand(MCOperand::createInst(new (Context) MCInst(Nop)));
}
bool HexagonMCInstrInfo::prefersSlot3(MCInstrInfo const &MCII,
@@ -456,6 +626,20 @@ void HexagonMCInstrInfo::setInnerLoop(MCInst &MCI) {
Operand.setImm(Operand.getImm() | innerLoopMask);
}
+void HexagonMCInstrInfo::setMemReorderDisabled(MCInst &MCI) {
+ assert(isBundle(MCI));
+ MCOperand &Operand = MCI.getOperand(0);
+ Operand.setImm(Operand.getImm() | memReorderDisabledMask);
+ assert(isMemReorderDisabled(MCI));
+}
+
+void HexagonMCInstrInfo::setMemStoreReorderEnabled(MCInst &MCI) {
+ assert(isBundle(MCI));
+ MCOperand &Operand = MCI.getOperand(0);
+ Operand.setImm(Operand.getImm() | memStoreReorderEnabledMask);
+ assert(isMemStoreReorderEnabled(MCI));
+}
+
void HexagonMCInstrInfo::setOuterLoop(MCInst &MCI) {
assert(isBundle(MCI));
MCOperand &Operand = MCI.getOperand(0);
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
index 32d61a4a7be5..0237b2884a3b 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
@@ -14,9 +14,11 @@
#ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCINSTRINFO_H
#define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCINSTRINFO_H
+#include "HexagonMCExpr.h"
#include "llvm/MC/MCInst.h"
namespace llvm {
+class HexagonMCChecker;
class MCContext;
class MCInstrDesc;
class MCInstrInfo;
@@ -39,20 +41,47 @@ int64_t const innerLoopMask = 1 << innerLoopOffset;
size_t const outerLoopOffset = 1;
int64_t const outerLoopMask = 1 << outerLoopOffset;
+// When set, memory loads/stores in this packet must not be reordered;
+// by default they may be.
+size_t const memReorderDisabledOffset = 2;
+int64_t const memReorderDisabledMask = 1 << memReorderDisabledOffset;
+
+// When set, memory stores in this packet may be reordered; by default they cannot.
+size_t const memStoreReorderEnabledOffset = 3;
+int64_t const memStoreReorderEnabledMask = 1 << memStoreReorderEnabledOffset;
+
size_t const bundleInstructionsOffset = 1;
+void addConstant(MCInst &MI, uint64_t Value, MCContext &Context);
+void addConstExtender(MCContext &Context, MCInstrInfo const &MCII, MCInst &MCB,
+ MCInst const &MCI);
+
+// Returns an iterator range of the instructions in this bundle
iterator_range<MCInst::const_iterator> bundleInstructions(MCInst const &MCI);
// Returns the number of instructions in the bundle
size_t bundleSize(MCInst const &MCI);
+// Put the packet into canonical form: compound, duplex, pad, and shuffle.
+bool canonicalizePacket(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+ MCContext &Context, MCInst &MCB,
+ HexagonMCChecker *Checker);
+
// Clamp off upper 26 bits of extendable operand for emission
-void clampExtended(MCInstrInfo const &MCII, MCInst &MCI);
+void clampExtended(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI);
+
+MCInst createBundle();
+
+// Return the extender for instruction at Index or nullptr if none
+MCInst const *extenderForIndex(MCInst const &MCB, size_t Index);
+void extendIfNeeded(MCContext &Context, MCInstrInfo const &MCII, MCInst &MCB,
+ MCInst const &MCI, bool MustExtend);
// Create a duplex instruction given the two subinsts
MCInst *deriveDuplex(MCContext &Context, unsigned iClass, MCInst const &inst0,
MCInst const &inst1);
+MCInst deriveExtender(MCInstrInfo const &MCII, MCInst const &Inst,
+ MCOperand const &MO);
// Convert this instruction into a duplex subinst
MCInst deriveSubInst(MCInst const &Inst);
@@ -108,6 +137,9 @@ unsigned short getNewValueOp(MCInstrInfo const &MCII, MCInst const &MCI);
// Return the operand that consumes or produces a new value.
MCOperand const &getNewValueOperand(MCInstrInfo const &MCII, MCInst const &MCI);
+unsigned short getNewValueOp2(MCInstrInfo const &MCII, MCInst const &MCI);
+MCOperand const &getNewValueOperand2(MCInstrInfo const &MCII,
+ MCInst const &MCI);
int getSubTarget(MCInstrInfo const &MCII, MCInst const &MCI);
@@ -125,6 +157,7 @@ bool hasImmExt(MCInst const &MCI);
// Return whether the instruction is a legal new-value producer.
bool hasNewValue(MCInstrInfo const &MCII, MCInst const &MCI);
+bool hasNewValue2(MCInstrInfo const &MCII, MCInst const &MCI);
// Return the instruction at Index
MCInst const &instruction(MCInst const &MCB, size_t Index);
@@ -134,10 +167,24 @@ bool isBundle(MCInst const &MCI);
// Return whether the insn is an actual insn.
bool isCanon(MCInstrInfo const &MCII, MCInst const &MCI);
+bool isCompound(MCInstrInfo const &MCII, MCInst const &MCI);
// Return the duplex iclass given the two duplex classes
unsigned iClassOfDuplexPair(unsigned Ga, unsigned Gb);
+int64_t minConstant(MCInst const &MCI, size_t Index);
+template <unsigned N, unsigned S>
+bool inRange(MCInst const &MCI, size_t Index) {
+ return isShiftedUInt<N, S>(minConstant(MCI, Index));
+}
+template <unsigned N, unsigned S>
+bool inSRange(MCInst const &MCI, size_t Index) {
+ return isShiftedInt<N, S>(minConstant(MCI, Index));
+}
+template <unsigned N> bool inRange(MCInst const &MCI, size_t Index) {
+ return isUInt<N>(minConstant(MCI, Index));
+}
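The sentinel returned by minConstant() above, UINT32_MAX shifted left by 8 (0xFFFFFFFF00), is deliberately too large for the small bit widths these templates are instantiated with, so a missing or non-constant operand simply fails the check. A rough illustration, with a hypothetical instruction MCI whose operand 1 is the constant 12 and which has no operand 7:

  bool InR = HexagonMCInstrInfo::inRange<4>(MCI, 1); // minConstant == 12, isUInt<4>(12) -> true
  bool Out = HexagonMCInstrInfo::inRange<4>(MCI, 7); // minConstant == sentinel -> false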
+
// Return whether the instruction needs to be constant extended.
bool isConstExtended(MCInstrInfo const &MCII, MCInst const &MCI);
@@ -173,6 +220,8 @@ bool isIntReg(unsigned Reg);
// Is this register suitable for use in a duplex subinst
bool isIntRegForSubInst(unsigned Reg);
+bool isMemReorderDisabled(MCInst const &MCI);
+bool isMemStoreReorderEnabled(MCInst const &MCI);
// Return whether the insn is a new-value consumer.
bool isNewValue(MCInstrInfo const &MCII, MCInst const &MCI);
@@ -191,6 +240,8 @@ bool isOuterLoop(MCInst const &MCI);
// Return whether this instruction is predicated
bool isPredicated(MCInstrInfo const &MCII, MCInst const &MCI);
+bool isPredicateLate(MCInstrInfo const &MCII, MCInst const &MCI);
+bool isPredicatedNew(MCInstrInfo const &MCII, MCInst const &MCI);
// Return whether the predicate sense is true
bool isPredicatedTrue(MCInstrInfo const &MCII, MCInst const &MCI);
@@ -209,9 +260,10 @@ bool isSoloAX(MCInstrInfo const &MCII, MCInst const &MCI);
/// Return whether the insn can be packaged only with an A-type insn in slot #1.
bool isSoloAin1(MCInstrInfo const &MCII, MCInst const &MCI);
+bool isVector(MCInstrInfo const &MCII, MCInst const &MCI);
// Pad the bundle with nops to satisfy endloop requirements
-void padEndloop(MCInst &MCI);
+void padEndloop(MCContext &Context, MCInst &MCI);
bool prefersSlot3(MCInstrInfo const &MCII, MCInst const &MCI);
@@ -220,6 +272,8 @@ void replaceDuplex(MCContext &Context, MCInst &MCB, DuplexCandidate Candidate);
// Marks a bundle as endloop0
void setInnerLoop(MCInst &MCI);
+void setMemReorderDisabled(MCInst &MCI);
+void setMemStoreReorderEnabled(MCInst &MCI);
// Marks a bundle as endloop1
void setOuterLoop(MCInst &MCI);
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 53305d85fd80..9a292577a8f3 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -40,6 +40,20 @@ using namespace llvm;
#define GET_REGINFO_MC_DESC
#include "HexagonGenRegisterInfo.inc"
+cl::opt<bool> llvm::HexagonDisableCompound
+ ("mno-compound",
+ cl::desc("Disable looking for compound instructions for Hexagon"));
+
+cl::opt<bool> llvm::HexagonDisableDuplex
+ ("mno-pairing",
+ cl::desc("Disable looking for duplex instructions for Hexagon"));
+
+StringRef HEXAGON_MC::selectHexagonCPU(const Triple &TT, StringRef CPU) {
+ if (CPU.empty())
+ CPU = "hexagonv60";
+ return CPU;
+}
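As a small usage sketch (the triple string is hypothetical), an empty CPU string resolves to the v60 default, while a non-empty one is passed through unchanged:

  Triple TT("hexagon-unknown-elf");
  StringRef CPU = HEXAGON_MC::selectHexagonCPU(TT, "");  // yields "hexagonv60"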
+
MCInstrInfo *llvm::createHexagonMCInstrInfo() {
MCInstrInfo *X = new MCInstrInfo();
InitHexagonMCInstrInfo(X);
@@ -54,6 +68,7 @@ static MCRegisterInfo *createHexagonMCRegisterInfo(const Triple &TT) {
static MCSubtargetInfo *
createHexagonMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
+ CPU = HEXAGON_MC::selectHexagonCPU(TT, CPU);
return createHexagonMCSubtargetInfoImpl(TT, CPU, FS);
}
@@ -76,28 +91,23 @@ public:
StringRef Contents(Buffer);
auto PacketBundle = Contents.rsplit('\n');
auto HeadTail = PacketBundle.first.split('\n');
- auto Preamble = "\t{\n\t\t";
- auto Separator = "";
- while(!HeadTail.first.empty()) {
- OS << Separator;
- StringRef Inst;
+ StringRef Separator = "\n";
+ StringRef Indent = "\t\t";
+ OS << "\t{\n";
+ while (!HeadTail.first.empty()) {
+ StringRef InstTxt;
auto Duplex = HeadTail.first.split('\v');
- if(!Duplex.second.empty()){
- OS << Duplex.first << "\n";
- Inst = Duplex.second;
- }
- else {
- if(!HeadTail.first.startswith("immext"))
- Inst = Duplex.first;
+ if (!Duplex.second.empty()) {
+ OS << Indent << Duplex.first << Separator;
+ InstTxt = Duplex.second;
+ } else if (!HeadTail.first.trim().startswith("immext")) {
+ InstTxt = Duplex.first;
}
- OS << Preamble;
- OS << Inst;
+ if (!InstTxt.empty())
+ OS << Indent << InstTxt << Separator;
HeadTail = HeadTail.second.split('\n');
- Preamble = "";
- Separator = "\n\t\t";
}
- if(HexagonMCInstrInfo::bundleSize(Inst) != 0)
- OS << "\n\t}" << PacketBundle.second;
+ OS << "\t}" << PacketBundle.second;
}
};
}
@@ -154,9 +164,9 @@ static MCCodeGenInfo *createHexagonMCCodeGenInfo(const Triple &TT,
CodeModel::Model CM,
CodeGenOpt::Level OL) {
MCCodeGenInfo *X = new MCCodeGenInfo();
- // For the time being, use static relocations, since there's really no
- // support for PIC yet.
- X->initMCCodeGenInfo(Reloc::Static, CM, OL);
+ if (RM == Reloc::Default)
+ RM = Reloc::Static;
+ X->initMCCodeGenInfo(RM, CM, OL);
return X;
}
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
index cb626503313f..a005a014416b 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
@@ -16,6 +16,8 @@
#include <cstdint>
+#include "llvm/Support/CommandLine.h"
+
namespace llvm {
struct InstrItinerary;
struct InstrStage;
@@ -33,22 +35,27 @@ class raw_ostream;
class raw_pwrite_stream;
extern Target TheHexagonTarget;
-
+extern cl::opt<bool> HexagonDisableCompound;
+extern cl::opt<bool> HexagonDisableDuplex;
extern const InstrStage HexagonStages[];
MCInstrInfo *createHexagonMCInstrInfo();
-MCCodeEmitter *createHexagonMCCodeEmitter(MCInstrInfo const &MCII,
- MCRegisterInfo const &MRI,
+MCCodeEmitter *createHexagonMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
MCContext &MCT);
-MCAsmBackend *createHexagonAsmBackend(Target const &T,
- MCRegisterInfo const &MRI,
+MCAsmBackend *createHexagonAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
const Triple &TT, StringRef CPU);
MCObjectWriter *createHexagonELFObjectWriter(raw_pwrite_stream &OS,
uint8_t OSABI, StringRef CPU);
+namespace HEXAGON_MC {
+ StringRef selectHexagonCPU(const Triple &TT, StringRef CPU);
+}
+
} // End llvm namespace
// Define symbolic names for Hexagon registers. This defines a mapping from
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
index 41112ac0b46e..6ceb848ba20c 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
@@ -27,6 +27,7 @@
using namespace llvm;
+namespace {
// Insn shuffling priority.
class HexagonBid {
// The priority is directly proportional to how restricted the insn is based
@@ -75,6 +76,7 @@ public:
return false;
};
};
+} // end anonymous namespace
unsigned HexagonResource::setWeight(unsigned s) {
const unsigned SlotWeight = 8;
@@ -93,6 +95,60 @@ unsigned HexagonResource::setWeight(unsigned s) {
return (Weight);
}
+HexagonCVIResource::TypeUnitsAndLanes *HexagonCVIResource::TUL;
+
+bool HexagonCVIResource::SetUp = HexagonCVIResource::setup();
+
+bool HexagonCVIResource::setup() {
+ assert(!TUL);
+ TUL = new (TypeUnitsAndLanes);
+
+ (*TUL)[HexagonII::TypeCVI_VA] =
+ UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
+ (*TUL)[HexagonII::TypeCVI_VA_DV] = UnitsAndLanes(CVI_XLANE | CVI_MPY0, 2);
+ (*TUL)[HexagonII::TypeCVI_VX] = UnitsAndLanes(CVI_MPY0 | CVI_MPY1, 1);
+ (*TUL)[HexagonII::TypeCVI_VX_DV] = UnitsAndLanes(CVI_MPY0, 2);
+ (*TUL)[HexagonII::TypeCVI_VP] = UnitsAndLanes(CVI_XLANE, 1);
+ (*TUL)[HexagonII::TypeCVI_VP_VS] = UnitsAndLanes(CVI_XLANE, 2);
+ (*TUL)[HexagonII::TypeCVI_VS] = UnitsAndLanes(CVI_SHIFT, 1);
+ (*TUL)[HexagonII::TypeCVI_VINLANESAT] = UnitsAndLanes(CVI_SHIFT, 1);
+ (*TUL)[HexagonII::TypeCVI_VM_LD] =
+ UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
+ (*TUL)[HexagonII::TypeCVI_VM_TMP_LD] = UnitsAndLanes(CVI_NONE, 0);
+ (*TUL)[HexagonII::TypeCVI_VM_CUR_LD] =
+ UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
+ (*TUL)[HexagonII::TypeCVI_VM_VP_LDU] = UnitsAndLanes(CVI_XLANE, 1);
+ (*TUL)[HexagonII::TypeCVI_VM_ST] =
+ UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
+ (*TUL)[HexagonII::TypeCVI_VM_NEW_ST] = UnitsAndLanes(CVI_NONE, 0);
+ (*TUL)[HexagonII::TypeCVI_VM_STU] = UnitsAndLanes(CVI_XLANE, 1);
+ (*TUL)[HexagonII::TypeCVI_HIST] = UnitsAndLanes(CVI_XLANE, 4);
+
+ return true;
+}
+
+HexagonCVIResource::HexagonCVIResource(MCInstrInfo const &MCII, unsigned s,
+ MCInst const *id)
+ : HexagonResource(s) {
+ unsigned T = HexagonMCInstrInfo::getType(MCII, *id);
+
+ if (TUL->count(T)) {
+ // For an HVX insn.
+ Valid = true;
+ setUnits((*TUL)[T].first);
+ setLanes((*TUL)[T].second);
+ setLoad(HexagonMCInstrInfo::getDesc(MCII, *id).mayLoad());
+ setStore(HexagonMCInstrInfo::getDesc(MCII, *id).mayStore());
+ } else {
+ // For core insns.
+ Valid = false;
+ setUnits(0);
+ setLanes(0);
+ setLoad(false);
+ setStore(false);
+ }
+}
+
HexagonShuffler::HexagonShuffler(MCInstrInfo const &MCII,
MCSubtargetInfo const &STI)
: MCII(MCII), STI(STI) {
@@ -107,7 +163,7 @@ void HexagonShuffler::reset() {
void HexagonShuffler::append(MCInst const *ID, MCInst const *Extender,
unsigned S, bool X) {
- HexagonInstr PI(ID, Extender, S, X);
+ HexagonInstr PI(MCII, ID, Extender, S, X);
Packet.push_back(PI);
}
@@ -126,6 +182,8 @@ bool HexagonShuffler::check() {
// Number of memory operations, loads, solo loads, stores, solo stores, single
// stores.
unsigned memory = 0, loads = 0, load0 = 0, stores = 0, store0 = 0, store1 = 0;
+ // Number of HVX loads, HVX stores.
+ unsigned CVIloads = 0, CVIstores = 0;
// Number of duplex insns, solo insns.
unsigned duplex = 0, solo = 0;
// Number of insns restricting other insns in the packet to A and X types,
@@ -168,6 +226,12 @@ bool HexagonShuffler::check() {
case HexagonII::TypeJ:
++jumps;
break;
+ case HexagonII::TypeCVI_VM_VP_LDU:
+ ++onlyNo1;
+ case HexagonII::TypeCVI_VM_LD:
+ case HexagonII::TypeCVI_VM_TMP_LD:
+ case HexagonII::TypeCVI_VM_CUR_LD:
+ ++CVIloads;
case HexagonII::TypeLD:
++loads;
++memory;
@@ -176,6 +240,11 @@ bool HexagonShuffler::check() {
if (HexagonMCInstrInfo::getDesc(MCII, *ID).isReturn())
++jumps, ++jump1; // DEALLOC_RETURN is of type LD.
break;
+ case HexagonII::TypeCVI_VM_STU:
+ ++onlyNo1;
+ case HexagonII::TypeCVI_VM_ST:
+ case HexagonII::TypeCVI_VM_NEW_ST:
+ ++CVIstores;
case HexagonII::TypeST:
++stores;
++memory;
@@ -203,9 +272,9 @@ bool HexagonShuffler::check() {
}
// Check if the packet is legal.
- if ((load0 > 1 || store0 > 1) || (duplex > 1 || (duplex && memory)) ||
- (solo && size() > 1) || (onlyAX && neitherAnorX > 1) ||
- (onlyAX && xtypeFloat)) {
+ if ((load0 > 1 || store0 > 1 || CVIloads > 1 || CVIstores > 1) ||
+ (duplex > 1 || (duplex && memory)) || (solo && size() > 1) ||
+ (onlyAX && neitherAnorX > 1) || (onlyAX && xtypeFloat)) {
Error = SHUFFLE_ERROR_INVALID;
return false;
}
@@ -336,6 +405,19 @@ bool HexagonShuffler::check() {
return false;
}
}
+ // Verify the CVI slot subscriptions.
+ {
+ HexagonUnitAuction AuctionCVI;
+
+ std::sort(begin(), end(), HexagonInstr::lessCVI);
+
+ for (iterator I = begin(); I != end(); ++I)
+ for (unsigned i = 0; i < I->CVI.getLanes(); ++i) // TODO: I->CVI.isValid?
+ if (!AuctionCVI.bid(I->CVI.getUnits() << i)) {
+ Error = SHUFFLE_ERROR_SLOTS;
+ return false;
+ }
+ }
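  // Rough illustration of the CVI auction above (hypothetical packet; the exact
  // bid semantics live in HexagonUnitAuction): a TypeCVI_VX_DV insn carries
  // Units == CVI_MPY0 and Lanes == 2, so it bids CVI_MPY0 << 0 and
  // CVI_MPY0 << 1, claiming both multiply units. A second insn that also needs
  // a multiply unit then has nothing left to bid on, and the packet is
  // rejected with SHUFFLE_ERROR_SLOTS.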
Error = SHUFFLE_SUCCESS;
return true;
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
index 8b6c72ee25e6..174f10fb2580 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
@@ -51,6 +51,44 @@ public:
};
};
+// HVX insn resources.
+class HexagonCVIResource : public HexagonResource {
+ typedef std::pair<unsigned, unsigned> UnitsAndLanes;
+ typedef llvm::DenseMap<unsigned, UnitsAndLanes> TypeUnitsAndLanes;
+
+ // Available HVX slots.
+ enum {
+ CVI_NONE = 0,
+ CVI_XLANE = 1 << 0,
+ CVI_SHIFT = 1 << 1,
+ CVI_MPY0 = 1 << 2,
+ CVI_MPY1 = 1 << 3
+ };
+
+ static bool SetUp;
+ static bool setup();
+ static TypeUnitsAndLanes *TUL;
+
+ // Count of adjacent slots that the insn requires to be executed.
+ unsigned Lanes;
+ // Flag whether the insn is a load or a store.
+ bool Load, Store;
+ // Flag whether the HVX resources are valid.
+ bool Valid;
+
+ void setLanes(unsigned l) { Lanes = l; };
+ void setLoad(bool f = true) { Load = f; };
+ void setStore(bool f = true) { Store = f; };
+
+public:
+ HexagonCVIResource(MCInstrInfo const &MCII, unsigned s, MCInst const *id);
+
+ bool isValid() const { return (Valid); };
+ unsigned getLanes() const { return (Lanes); };
+ bool mayLoad() const { return (Load); };
+ bool mayStore() const { return (Store); };
+};
+
// Handle to an insn used by the shuffling algorithm.
class HexagonInstr {
friend class HexagonShuffler;
@@ -58,12 +96,14 @@ class HexagonInstr {
MCInst const *ID;
MCInst const *Extender;
HexagonResource Core;
+ HexagonCVIResource CVI;
bool SoloException;
public:
- HexagonInstr(MCInst const *id, MCInst const *Extender, unsigned s,
- bool x = false)
- : ID(id), Extender(Extender), Core(s), SoloException(x){};
+ HexagonInstr(MCInstrInfo const &MCII, MCInst const *id,
+ MCInst const *Extender, unsigned s, bool x = false)
+ : ID(id), Extender(Extender), Core(s), CVI(MCII, s, id),
+ SoloException(x){};
MCInst const *getDesc() const { return (ID); };
@@ -79,6 +119,10 @@ public:
static bool lessCore(const HexagonInstr &A, const HexagonInstr &B) {
return (HexagonResource::lessUnits(A.Core, B.Core));
};
+ // Check if the handles are in ascending order by HVX slots.
+ static bool lessCVI(const HexagonInstr &A, const HexagonInstr &B) {
+ return (HexagonResource::lessUnits(A.CVI, B.CVI));
+ };
};
// Bundle shuffler.
@@ -108,6 +152,8 @@ public:
SHUFFLE_ERROR_BRANCHES, ///< No free slots for branch insns.
SHUFFLE_ERROR_NOSLOTS, ///< No free slots for other insns.
SHUFFLE_ERROR_SLOTS, ///< Over-subscribed slots.
+ SHUFFLE_ERROR_ERRATA2, ///< Errata violation (v60).
+ SHUFFLE_ERROR_STORE_LOAD_CONFLICT, ///< Store/load conflict.
SHUFFLE_ERROR_UNKNOWN ///< Unknown error.
};
diff --git a/lib/Target/Hexagon/Makefile b/lib/Target/Hexagon/Makefile
index 04b41e5986ac..c53b8e56aafc 100644
--- a/lib/Target/Hexagon/Makefile
+++ b/lib/Target/Hexagon/Makefile
@@ -13,6 +13,7 @@ TARGET = Hexagon
# Make sure that tblgen is run, first thing.
BUILT_SOURCES = HexagonGenRegisterInfo.inc \
HexagonGenInstrInfo.inc \
+ HexagonGenAsmMatcher.inc \
HexagonGenAsmWriter.inc \
HexagonGenDAGISel.inc HexagonGenSubtargetInfo.inc \
HexagonGenCallingConv.inc \
@@ -20,6 +21,6 @@ BUILT_SOURCES = HexagonGenRegisterInfo.inc \
HexagonGenMCCodeEmitter.inc \
HexagonGenDisassemblerTables.inc
-DIRS = TargetInfo MCTargetDesc Disassembler
+DIRS = TargetInfo MCTargetDesc Disassembler AsmParser
include $(LEVEL)/Makefile.common
diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt
index f05d7a465252..eb794ebc7216 100644
--- a/lib/Target/LLVMBuild.txt
+++ b/lib/Target/LLVMBuild.txt
@@ -22,6 +22,7 @@ subdirectories =
AMDGPU
ARM
AArch64
+ AVR
BPF
CppBackend
Hexagon
diff --git a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
index 70141a998e4a..72afec18becb 100644
--- a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
+++ b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
@@ -17,8 +17,6 @@
#include "llvm/MC/MCInstPrinter.h"
namespace llvm {
- class MCOperand;
-
class MSP430InstPrinter : public MCInstPrinter {
public:
MSP430InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
index ff5b0b6d858c..183dee36a047 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
@@ -17,13 +17,14 @@
#include "llvm/MC/MCAsmInfoELF.h"
namespace llvm {
- class Triple;
+class Triple;
- class MSP430MCAsmInfo : public MCAsmInfoELF {
- void anchor() override;
- public:
- explicit MSP430MCAsmInfo(const Triple &TT);
- };
+class MSP430MCAsmInfo : public MCAsmInfoELF {
+ void anchor() override;
+
+public:
+ explicit MSP430MCAsmInfo(const Triple &TT);
+};
} // namespace llvm
diff --git a/lib/Target/MSP430/MSP430BranchSelector.cpp b/lib/Target/MSP430/MSP430BranchSelector.cpp
index ffcf22216d4f..606abc250d98 100644
--- a/lib/Target/MSP430/MSP430BranchSelector.cpp
+++ b/lib/Target/MSP430/MSP430BranchSelector.cpp
@@ -64,7 +64,7 @@ bool MSP430BSel::runOnMachineFunction(MachineFunction &Fn) {
unsigned FuncSize = 0;
for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
++MFI) {
- MachineBasicBlock *MBB = MFI;
+ MachineBasicBlock *MBB = &*MFI;
unsigned BlockSize = 0;
for (MachineBasicBlock::iterator MBBI = MBB->begin(), EE = MBB->end();
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index 29bc8b33988a..18f38b7e90da 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -69,10 +69,6 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
computeRegisterProperties(STI.getRegisterInfo());
// Provide all sorts of operation actions
-
- // Division is expensive
- setIntDivIsCheap(false);
-
setStackPointerRegisterToSaveRestore(MSP430::SP);
setBooleanContents(ZeroOrOneBooleanContent);
setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
@@ -508,9 +504,10 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain,
// Create the SelectionDAG nodes corresponding to a load
//from this parameter
SDValue FIN = DAG.getFrameIndex(FI, MVT::i16);
- InVal = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
- MachinePointerInfo::getFixedStack(FI),
- false, false, false, 0);
+ InVal = DAG.getLoad(
+ VA.getLocVT(), dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
+ false, false, false, 0);
}
InVals.push_back(InVal);
@@ -1231,8 +1228,7 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr *MI,
}
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator I = BB;
- ++I;
+ MachineFunction::iterator I = ++BB->getIterator();
// Create loop block
MachineBasicBlock *LoopBB = F->CreateMachineBasicBlock(LLVM_BB);
@@ -1320,8 +1316,7 @@ MSP430TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// to set, the condition code register to branch on, the true/false values to
// select between, and a branch opcode to use.
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator I = BB;
- ++I;
+ MachineFunction::iterator I = ++BB->getIterator();
// thisMBB:
// ...
diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp
index 72b1780fd1ce..d4f82bda1ec9 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -44,11 +44,10 @@ void MSP430InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = *MF.getFrameInfo();
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
- MachineMemOperand::MOStore,
- MFI.getObjectSize(FrameIdx),
- MFI.getObjectAlignment(FrameIdx));
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FrameIdx),
+ MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx),
+ MFI.getObjectAlignment(FrameIdx));
if (RC == &MSP430::GR16RegClass)
BuildMI(MBB, MI, DL, get(MSP430::MOV16mr))
@@ -72,11 +71,10 @@ void MSP430InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = *MF.getFrameInfo();
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
- MachineMemOperand::MOLoad,
- MFI.getObjectSize(FrameIdx),
- MFI.getObjectAlignment(FrameIdx));
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FrameIdx),
+ MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
+ MFI.getObjectAlignment(FrameIdx));
if (RC == &MSP430::GR16RegClass)
BuildMI(MBB, MI, DL, get(MSP430::MOV16rm))
diff --git a/lib/Target/MSP430/MSP430MCInstLower.cpp b/lib/Target/MSP430/MSP430MCInstLower.cpp
index 54154a8afac1..47b0e270c5b3 100644
--- a/lib/Target/MSP430/MSP430MCInstLower.cpp
+++ b/lib/Target/MSP430/MSP430MCInstLower.cpp
@@ -50,9 +50,9 @@ GetExternalSymbolSymbol(const MachineOperand &MO) const {
MCSymbol *MSP430MCInstLower::
GetJumpTableSymbol(const MachineOperand &MO) const {
- const DataLayout *DL = Printer.TM.getDataLayout();
+ const DataLayout &DL = Printer.getDataLayout();
SmallString<256> Name;
- raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "JTI"
+ raw_svector_ostream(Name) << DL.getPrivateGlobalPrefix() << "JTI"
<< Printer.getFunctionNumber() << '_'
<< MO.getIndex();
@@ -67,9 +67,9 @@ GetJumpTableSymbol(const MachineOperand &MO) const {
MCSymbol *MSP430MCInstLower::
GetConstantPoolIndexSymbol(const MachineOperand &MO) const {
- const DataLayout *DL = Printer.TM.getDataLayout();
+ const DataLayout &DL = Printer.getDataLayout();
SmallString<256> Name;
- raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "CPI"
+ raw_svector_ostream(Name) << DL.getPrivateGlobalPrefix() << "CPI"
<< Printer.getFunctionNumber() << '_'
<< MO.getIndex();
diff --git a/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp b/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp
index 0f7539908458..b442fc03b257 100644
--- a/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp
+++ b/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp
@@ -1,4 +1,4 @@
-//===-- MSP430MachineFuctionInfo.cpp - MSP430 machine function info -------===//
+//===-- MSP430MachineFunctionInfo.cpp - MSP430 machine function info ------===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Target/MSP430/MSP430MachineFunctionInfo.h b/lib/Target/MSP430/MSP430MachineFunctionInfo.h
index fcc5f5b88600..2d937318c7e5 100644
--- a/lib/Target/MSP430/MSP430MachineFunctionInfo.h
+++ b/lib/Target/MSP430/MSP430MachineFunctionInfo.h
@@ -1,4 +1,4 @@
-//===- MSP430MachineFuctionInfo.h - MSP430 machine function info -*- C++ -*-==//
+//=== MSP430MachineFunctionInfo.h - MSP430 machine function info -*- C++ -*-==//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Target/MSP430/README.txt b/lib/Target/MSP430/README.txt
index e9899247fb5e..458f3f8b4c7e 100644
--- a/lib/Target/MSP430/README.txt
+++ b/lib/Target/MSP430/README.txt
@@ -2,7 +2,7 @@
// MSP430 backend.
//===---------------------------------------------------------------------===//
-DISCLAIMER: Thid backend should be considered as highly experimental. I never
+DISCLAIMER: This backend should be considered as highly experimental. I never
seen nor worked with this MCU, all information was gathered from datasheet
only. The original intention of making this backend was to write documentation
of form "How to write backend for dummies" :) Thes notes hopefully will be
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 5107d2ae58c3..d4e061f00d3a 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -11,6 +11,7 @@
#include "MCTargetDesc/MipsMCExpr.h"
#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "MipsRegisterInfo.h"
+#include "MipsTargetObjectFile.h"
#include "MipsTargetStreamer.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
@@ -106,7 +107,6 @@ class MipsAsmParser : public MCTargetAsmParser {
return static_cast<MipsTargetStreamer &>(TS);
}
- MCSubtargetInfo &STI;
MipsABIInfo ABI;
SmallVector<std::unique_ptr<MipsAssemblerOptions>, 2> AssemblerOptions;
MCSymbol *CurrentFn; // Pointer to the function being parsed. It may be a
@@ -114,6 +114,12 @@ class MipsAsmParser : public MCTargetAsmParser {
// selected. This usually happens after an '.end func'
// directive.
bool IsLittleEndian;
+ bool IsPicEnabled;
+ bool IsCpRestoreSet;
+ int CpRestoreOffset;
+ unsigned CpSaveLocation;
+ /// If true, then CpSaveLocation is a register, otherwise it's an offset.
+ bool CpSaveLocationIsRegister;
// Print a warning along with its fix-it message at the given range.
void printWarningWithFixIt(const Twine &Msg, const Twine &FixMsg,
@@ -141,50 +147,41 @@ class MipsAsmParser : public MCTargetAsmParser {
bool ParseDirective(AsmToken DirectiveID) override;
- MipsAsmParser::OperandMatchResultTy parseMemOperand(OperandVector &Operands);
-
- MipsAsmParser::OperandMatchResultTy
+ OperandMatchResultTy parseMemOperand(OperandVector &Operands);
+ OperandMatchResultTy
matchAnyRegisterNameWithoutDollar(OperandVector &Operands,
StringRef Identifier, SMLoc S);
-
- MipsAsmParser::OperandMatchResultTy
- matchAnyRegisterWithoutDollar(OperandVector &Operands, SMLoc S);
-
- MipsAsmParser::OperandMatchResultTy parseAnyRegister(OperandVector &Operands);
-
- MipsAsmParser::OperandMatchResultTy parseImm(OperandVector &Operands);
-
- MipsAsmParser::OperandMatchResultTy parseJumpTarget(OperandVector &Operands);
-
- MipsAsmParser::OperandMatchResultTy parseInvNum(OperandVector &Operands);
-
- MipsAsmParser::OperandMatchResultTy parseLSAImm(OperandVector &Operands);
-
- MipsAsmParser::OperandMatchResultTy
- parseRegisterPair (OperandVector &Operands);
-
- MipsAsmParser::OperandMatchResultTy
- parseMovePRegPair(OperandVector &Operands);
-
- MipsAsmParser::OperandMatchResultTy
- parseRegisterList (OperandVector &Operands);
+ OperandMatchResultTy matchAnyRegisterWithoutDollar(OperandVector &Operands,
+ SMLoc S);
+ OperandMatchResultTy parseAnyRegister(OperandVector &Operands);
+ OperandMatchResultTy parseImm(OperandVector &Operands);
+ OperandMatchResultTy parseJumpTarget(OperandVector &Operands);
+ OperandMatchResultTy parseInvNum(OperandVector &Operands);
+ OperandMatchResultTy parseLSAImm(OperandVector &Operands);
+ OperandMatchResultTy parseRegisterPair(OperandVector &Operands);
+ OperandMatchResultTy parseMovePRegPair(OperandVector &Operands);
+ OperandMatchResultTy parseRegisterList(OperandVector &Operands);
bool searchSymbolAlias(OperandVector &Operands);
bool parseOperand(OperandVector &, StringRef Mnemonic);
- bool needsExpansion(MCInst &Inst);
+ enum MacroExpanderResultTy {
+ MER_NotAMacro,
+ MER_Success,
+ MER_Fail,
+ };
// Expands assembly pseudo instructions.
- // Returns false on success, true otherwise.
- bool expandInstruction(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions);
+ MacroExpanderResultTy
+ tryExpandInstruction(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions);
bool expandJalWithRegs(MCInst &Inst, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions);
bool loadImmediate(int64_t ImmValue, unsigned DstReg, unsigned SrcReg,
- bool Is32BitImm, SMLoc IDLoc,
+ bool Is32BitImm, bool IsAddress, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions);
bool loadAndAddSymbolAddress(const MCExpr *SymExpr, unsigned DstReg,
@@ -194,11 +191,10 @@ class MipsAsmParser : public MCTargetAsmParser {
bool expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions);
- bool expandLoadAddressImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions);
+ bool expandLoadAddress(unsigned DstReg, unsigned BaseReg,
+ const MCOperand &Offset, bool Is32BitAddress,
+ SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions);
- bool expandLoadAddressReg(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions);
bool expandUncondBranchMMPseudo(MCInst &Inst, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions);
@@ -209,24 +205,43 @@ class MipsAsmParser : public MCTargetAsmParser {
bool expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions);
+ bool expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions);
+
bool expandBranchImm(MCInst &Inst, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions);
bool expandCondBranches(MCInst &Inst, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions);
- bool expandUlhu(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions);
+ bool expandDiv(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions, const bool IsMips64,
+ const bool Signed);
+
+ bool expandUlh(MCInst &Inst, bool Signed, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions);
bool expandUlw(MCInst &Inst, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions);
+ bool expandRotation(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions);
+ bool expandRotationImm(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions);
+ bool expandDRotation(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions);
+ bool expandDRotationImm(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions);
+
void createNop(bool hasShortDelaySlot, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions);
void createAddu(unsigned DstReg, unsigned SrcReg, unsigned TrgReg,
bool Is64Bit, SmallVectorImpl<MCInst> &Instructions);
+ void createCpRestoreMemOp(bool IsLoad, int StackOffset, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions);
+
bool reportParseError(Twine ErrorMsg);
bool reportParseError(SMLoc Loc, Twine ErrorMsg);
@@ -239,8 +254,11 @@ class MipsAsmParser : public MCTargetAsmParser {
bool parseSetMips0Directive();
bool parseSetArchDirective();
bool parseSetFeature(uint64_t Feature);
+ bool isPicAndNotNxxAbi(); // Used by .cpload, .cprestore, and .cpsetup.
bool parseDirectiveCpLoad(SMLoc Loc);
+ bool parseDirectiveCpRestore(SMLoc Loc);
bool parseDirectiveCPSetup();
+ bool parseDirectiveCPReturn();
bool parseDirectiveNaN();
bool parseDirectiveSet();
bool parseDirectiveOption();
@@ -337,6 +355,7 @@ class MipsAsmParser : public MCTargetAsmParser {
// FeatureMipsGP64 | FeatureMips1)
// Clearing Mips3 is equivalent to clear (FeatureMips3 | FeatureMips4).
void selectArch(StringRef ArchFeature) {
+ MCSubtargetInfo &STI = copySTI();
FeatureBitset FeatureBits = STI.getFeatureBits();
FeatureBits &= ~MipsAssemblerOptions::AllArchRelatedMask;
STI.setFeatureBits(FeatureBits);
@@ -346,7 +365,8 @@ class MipsAsmParser : public MCTargetAsmParser {
}
void setFeatureBits(uint64_t Feature, StringRef FeatureString) {
- if (!(STI.getFeatureBits()[Feature])) {
+ if (!(getSTI().getFeatureBits()[Feature])) {
+ MCSubtargetInfo &STI = copySTI();
setAvailableFeatures(
ComputeAvailableFeatures(STI.ToggleFeature(FeatureString)));
AssemblerOptions.back()->setFeatures(STI.getFeatureBits());
@@ -354,7 +374,8 @@ class MipsAsmParser : public MCTargetAsmParser {
}
void clearFeatureBits(uint64_t Feature, StringRef FeatureString) {
- if (STI.getFeatureBits()[Feature]) {
+ if (getSTI().getFeatureBits()[Feature]) {
+ MCSubtargetInfo &STI = copySTI();
setAvailableFeatures(
ComputeAvailableFeatures(STI.ToggleFeature(FeatureString)));
AssemblerOptions.back()->setFeatures(STI.getFeatureBits());
@@ -363,26 +384,25 @@ class MipsAsmParser : public MCTargetAsmParser {
void setModuleFeatureBits(uint64_t Feature, StringRef FeatureString) {
setFeatureBits(Feature, FeatureString);
- AssemblerOptions.front()->setFeatures(STI.getFeatureBits());
+ AssemblerOptions.front()->setFeatures(getSTI().getFeatureBits());
}
void clearModuleFeatureBits(uint64_t Feature, StringRef FeatureString) {
clearFeatureBits(Feature, FeatureString);
- AssemblerOptions.front()->setFeatures(STI.getFeatureBits());
+ AssemblerOptions.front()->setFeatures(getSTI().getFeatureBits());
}
public:
enum MipsMatchResultTy {
- Match_RequiresDifferentSrcAndDst = FIRST_TARGET_MATCH_RESULT_TY
+ Match_RequiresDifferentSrcAndDst = FIRST_TARGET_MATCH_RESULT_TY,
#define GET_OPERAND_DIAGNOSTIC_TYPES
#include "MipsGenAsmMatcher.inc"
#undef GET_OPERAND_DIAGNOSTIC_TYPES
-
};
- MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
+ MipsAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser,
const MCInstrInfo &MII, const MCTargetOptions &Options)
- : MCTargetAsmParser(), STI(sti),
+ : MCTargetAsmParser(Options, sti),
ABI(MipsABIInfo::computeTargetABI(Triple(sti.getTargetTriple()),
sti.getCPU(), Options)) {
MCAsmParserExtension::Initialize(parser);
@@ -390,15 +410,15 @@ public:
parser.addAliasForDirective(".asciiz", ".asciz");
// Initialize the set of available features.
- setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
-
+ setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+
// Remember the initial assembler options. The user can not modify these.
AssemblerOptions.push_back(
- llvm::make_unique<MipsAssemblerOptions>(STI.getFeatureBits()));
-
+ llvm::make_unique<MipsAssemblerOptions>(getSTI().getFeatureBits()));
+
// Create an assembler options environment for the user to modify.
AssemblerOptions.push_back(
- llvm::make_unique<MipsAssemblerOptions>(STI.getFeatureBits()));
+ llvm::make_unique<MipsAssemblerOptions>(getSTI().getFeatureBits()));
getTargetStreamer().updateABIInfo(*this);
@@ -407,6 +427,12 @@ public:
CurrentFn = nullptr;
+ IsPicEnabled =
+ (getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_);
+
+ IsCpRestoreSet = false;
+ CpRestoreOffset = -1;
+
Triple TheTriple(sti.getTargetTriple());
if ((TheTriple.getArch() == Triple::mips) ||
(TheTriple.getArch() == Triple::mips64))
@@ -418,70 +444,103 @@ public:
/// True if all of $fcc0 - $fcc7 exist for the current ISA.
bool hasEightFccRegisters() const { return hasMips4() || hasMips32(); }
- bool isGP64bit() const { return STI.getFeatureBits()[Mips::FeatureGP64Bit]; }
- bool isFP64bit() const { return STI.getFeatureBits()[Mips::FeatureFP64Bit]; }
+ bool isGP64bit() const {
+ return getSTI().getFeatureBits()[Mips::FeatureGP64Bit];
+ }
+ bool isFP64bit() const {
+ return getSTI().getFeatureBits()[Mips::FeatureFP64Bit];
+ }
const MipsABIInfo &getABI() const { return ABI; }
bool isABI_N32() const { return ABI.IsN32(); }
bool isABI_N64() const { return ABI.IsN64(); }
bool isABI_O32() const { return ABI.IsO32(); }
- bool isABI_FPXX() const { return STI.getFeatureBits()[Mips::FeatureFPXX]; }
+ bool isABI_FPXX() const {
+ return getSTI().getFeatureBits()[Mips::FeatureFPXX];
+ }
bool useOddSPReg() const {
- return !(STI.getFeatureBits()[Mips::FeatureNoOddSPReg]);
+ return !(getSTI().getFeatureBits()[Mips::FeatureNoOddSPReg]);
}
bool inMicroMipsMode() const {
- return STI.getFeatureBits()[Mips::FeatureMicroMips];
+ return getSTI().getFeatureBits()[Mips::FeatureMicroMips];
+ }
+ bool hasMips1() const {
+ return getSTI().getFeatureBits()[Mips::FeatureMips1];
+ }
+ bool hasMips2() const {
+ return getSTI().getFeatureBits()[Mips::FeatureMips2];
+ }
+ bool hasMips3() const {
+ return getSTI().getFeatureBits()[Mips::FeatureMips3];
+ }
+ bool hasMips4() const {
+ return getSTI().getFeatureBits()[Mips::FeatureMips4];
+ }
+ bool hasMips5() const {
+ return getSTI().getFeatureBits()[Mips::FeatureMips5];
}
- bool hasMips1() const { return STI.getFeatureBits()[Mips::FeatureMips1]; }
- bool hasMips2() const { return STI.getFeatureBits()[Mips::FeatureMips2]; }
- bool hasMips3() const { return STI.getFeatureBits()[Mips::FeatureMips3]; }
- bool hasMips4() const { return STI.getFeatureBits()[Mips::FeatureMips4]; }
- bool hasMips5() const { return STI.getFeatureBits()[Mips::FeatureMips5]; }
bool hasMips32() const {
- return STI.getFeatureBits()[Mips::FeatureMips32];
+ return getSTI().getFeatureBits()[Mips::FeatureMips32];
}
bool hasMips64() const {
- return STI.getFeatureBits()[Mips::FeatureMips64];
+ return getSTI().getFeatureBits()[Mips::FeatureMips64];
}
bool hasMips32r2() const {
- return STI.getFeatureBits()[Mips::FeatureMips32r2];
+ return getSTI().getFeatureBits()[Mips::FeatureMips32r2];
}
bool hasMips64r2() const {
- return STI.getFeatureBits()[Mips::FeatureMips64r2];
+ return getSTI().getFeatureBits()[Mips::FeatureMips64r2];
}
bool hasMips32r3() const {
- return (STI.getFeatureBits()[Mips::FeatureMips32r3]);
+ return (getSTI().getFeatureBits()[Mips::FeatureMips32r3]);
}
bool hasMips64r3() const {
- return (STI.getFeatureBits()[Mips::FeatureMips64r3]);
+ return (getSTI().getFeatureBits()[Mips::FeatureMips64r3]);
}
bool hasMips32r5() const {
- return (STI.getFeatureBits()[Mips::FeatureMips32r5]);
+ return (getSTI().getFeatureBits()[Mips::FeatureMips32r5]);
}
bool hasMips64r5() const {
- return (STI.getFeatureBits()[Mips::FeatureMips64r5]);
+ return (getSTI().getFeatureBits()[Mips::FeatureMips64r5]);
}
bool hasMips32r6() const {
- return STI.getFeatureBits()[Mips::FeatureMips32r6];
+ return getSTI().getFeatureBits()[Mips::FeatureMips32r6];
}
bool hasMips64r6() const {
- return STI.getFeatureBits()[Mips::FeatureMips64r6];
+ return getSTI().getFeatureBits()[Mips::FeatureMips64r6];
}
- bool hasDSP() const { return STI.getFeatureBits()[Mips::FeatureDSP]; }
- bool hasDSPR2() const { return STI.getFeatureBits()[Mips::FeatureDSPR2]; }
- bool hasMSA() const { return STI.getFeatureBits()[Mips::FeatureMSA]; }
+ bool hasDSP() const {
+ return getSTI().getFeatureBits()[Mips::FeatureDSP];
+ }
+ bool hasDSPR2() const {
+ return getSTI().getFeatureBits()[Mips::FeatureDSPR2];
+ }
+ bool hasDSPR3() const {
+ return getSTI().getFeatureBits()[Mips::FeatureDSPR3];
+ }
+ bool hasMSA() const {
+ return getSTI().getFeatureBits()[Mips::FeatureMSA];
+ }
bool hasCnMips() const {
- return (STI.getFeatureBits()[Mips::FeatureCnMips]);
+ return (getSTI().getFeatureBits()[Mips::FeatureCnMips]);
+ }
+
+ bool inPicMode() {
+ return IsPicEnabled;
}
bool inMips16Mode() const {
- return STI.getFeatureBits()[Mips::FeatureMips16];
+ return getSTI().getFeatureBits()[Mips::FeatureMips16];
+ }
+
+ bool useTraps() const {
+ return getSTI().getFeatureBits()[Mips::FeatureUseTCCInDIV];
}
bool useSoftFloat() const {
- return STI.getFeatureBits()[Mips::FeatureSoftFloat];
+ return getSTI().getFeatureBits()[Mips::FeatureSoftFloat];
}
/// Warn if RegIndex is the same as the current AT.
@@ -869,6 +928,16 @@ public:
Inst.addOperand(MCOperand::createReg(getHWRegsReg()));
}
+ template <unsigned Bits, int Offset = 0, int AdjustOffset = 0>
+ void addConstantUImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ uint64_t Imm = getConstantImm() - Offset;
+ Imm &= (1 << Bits) - 1;
+ Imm += Offset;
+ Imm += AdjustOffset;
+ Inst.addOperand(MCOperand::createImm(Imm));
+ }
+
void addImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
const MCExpr *Expr = getImm();
@@ -878,7 +947,9 @@ public:
void addMemOperands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::createReg(getMemBase()->getGPR32Reg()));
+ Inst.addOperand(MCOperand::createReg(AsmParser.getABI().ArePtrs64bit()
+ ? getMemBase()->getGPR64Reg()
+ : getMemBase()->getGPR32Reg()));
const MCExpr *Expr = getMemOff();
addExpr(Inst, Expr);
@@ -924,10 +995,16 @@ public:
bool isRegIdx() const { return Kind == k_RegisterIndex; }
bool isImm() const override { return Kind == k_Immediate; }
bool isConstantImm() const {
- return isImm() && dyn_cast<MCConstantExpr>(getImm());
+ return isImm() && isa<MCConstantExpr>(getImm());
+ }
+ bool isConstantImmz() const {
+ return isConstantImm() && getConstantImm() == 0;
}
- template <unsigned Bits> bool isUImm() const {
- return isImm() && isConstantImm() && isUInt<Bits>(getConstantImm());
+ template <unsigned Bits, int Offset = 0> bool isConstantUImm() const {
+ return isConstantImm() && isUInt<Bits>(getConstantImm() - Offset);
+ }
+ template <unsigned Bits> bool isConstantSImm() const {
+ return isConstantImm() && isInt<Bits>(getConstantImm());
}
bool isToken() const override {
// Note: It's not possible to pretend that other operand kinds are tokens.
@@ -936,10 +1013,15 @@ public:
}
bool isMem() const override { return Kind == k_Memory; }
bool isConstantMemOff() const {
- return isMem() && dyn_cast<MCConstantExpr>(getMemOff());
+ return isMem() && isa<MCConstantExpr>(getMemOff());
}
template <unsigned Bits> bool isMemWithSimmOffset() const {
- return isMem() && isConstantMemOff() && isInt<Bits>(getConstantMemOff());
+ return isMem() && isConstantMemOff() && isInt<Bits>(getConstantMemOff())
+ && getMemBase()->isGPRAsmReg();
+ }
+ template <unsigned Bits> bool isMemWithSimmOffsetGPR() const {
+ return isMem() && isConstantMemOff() && isInt<Bits>(getConstantMemOff()) &&
+ getMemBase()->isGPRAsmReg();
}
bool isMemWithGRPMM16Base() const {
return isMem() && getMemBase()->isMM16AsmReg();
@@ -953,13 +1035,23 @@ public:
&& (getConstantMemOff() % 4 == 0) && getMemBase()->isRegIdx()
&& (getMemBase()->getGPR32Reg() == Mips::SP);
}
+ template <unsigned Bits, unsigned ShiftLeftAmount>
+ bool isScaledUImm() const {
+ return isConstantImm() &&
+ isShiftedUInt<Bits, ShiftLeftAmount>(getConstantImm());
+ }
bool isRegList16() const {
if (!isRegList())
return false;
int Size = RegList.List->size();
- if (Size < 2 || Size > 5 || *RegList.List->begin() != Mips::S0 ||
- RegList.List->back() != Mips::RA)
+ if (Size < 2 || Size > 5)
+ return false;
+
+ unsigned R0 = RegList.List->front();
+ unsigned R1 = RegList.List->back();
+ if (!((R0 == Mips::S0 && R1 == Mips::RA) ||
+ (R0 == Mips::S0_64 && R1 == Mips::RA_64)))
return false;
int PrevReg = *RegList.List->begin();
@@ -1304,9 +1396,123 @@ static bool hasShortDelaySlot(unsigned Opcode) {
}
}
+static const MCSymbol *getSingleMCSymbol(const MCExpr *Expr) {
+ if (const MCSymbolRefExpr *SRExpr = dyn_cast<MCSymbolRefExpr>(Expr)) {
+ return &SRExpr->getSymbol();
+ }
+
+ if (const MCBinaryExpr *BExpr = dyn_cast<MCBinaryExpr>(Expr)) {
+ const MCSymbol *LHSSym = getSingleMCSymbol(BExpr->getLHS());
+ const MCSymbol *RHSSym = getSingleMCSymbol(BExpr->getRHS());
+
+ if (LHSSym)
+ return LHSSym;
+
+ if (RHSSym)
+ return RHSSym;
+
+ return nullptr;
+ }
+
+ if (const MCUnaryExpr *UExpr = dyn_cast<MCUnaryExpr>(Expr))
+ return getSingleMCSymbol(UExpr->getSubExpr());
+
+ return nullptr;
+}
+
+static unsigned countMCSymbolRefExpr(const MCExpr *Expr) {
+ if (isa<MCSymbolRefExpr>(Expr))
+ return 1;
+
+ if (const MCBinaryExpr *BExpr = dyn_cast<MCBinaryExpr>(Expr))
+ return countMCSymbolRefExpr(BExpr->getLHS()) +
+ countMCSymbolRefExpr(BExpr->getRHS());
+
+ if (const MCUnaryExpr *UExpr = dyn_cast<MCUnaryExpr>(Expr))
+ return countMCSymbolRefExpr(UExpr->getSubExpr());
+
+ return 0;
+}
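Informally, for a few hypothetical operand expressions, the two walkers above behave as follows:

  //   foo + 4    -> countMCSymbolRefExpr == 1, getSingleMCSymbol returns foo
  //   foo - bar  -> countMCSymbolRefExpr == 2 (the JAL expansion below rejects this)
  //   16         -> countMCSymbolRefExpr == 0, getSingleMCSymbol returns nullptr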
+
+namespace {
+void emitRX(unsigned Opcode, unsigned Reg0, MCOperand Op1, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions) {
+ MCInst tmpInst;
+ tmpInst.setOpcode(Opcode);
+ tmpInst.addOperand(MCOperand::createReg(Reg0));
+ tmpInst.addOperand(Op1);
+ tmpInst.setLoc(IDLoc);
+ Instructions.push_back(tmpInst);
+}
+
+void emitRI(unsigned Opcode, unsigned Reg0, int32_t Imm, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions) {
+ emitRX(Opcode, Reg0, MCOperand::createImm(Imm), IDLoc, Instructions);
+}
+
+void emitRR(unsigned Opcode, unsigned Reg0, unsigned Reg1, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions) {
+ emitRX(Opcode, Reg0, MCOperand::createReg(Reg1), IDLoc, Instructions);
+}
+
+void emitII(unsigned Opcode, int16_t Imm1, int16_t Imm2, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions) {
+ MCInst tmpInst;
+ tmpInst.setOpcode(Opcode);
+ tmpInst.addOperand(MCOperand::createImm(Imm1));
+ tmpInst.addOperand(MCOperand::createImm(Imm2));
+ tmpInst.setLoc(IDLoc);
+ Instructions.push_back(tmpInst);
+}
+
+void emitR(unsigned Opcode, unsigned Reg0, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions) {
+ MCInst tmpInst;
+ tmpInst.setOpcode(Opcode);
+ tmpInst.addOperand(MCOperand::createReg(Reg0));
+ tmpInst.setLoc(IDLoc);
+ Instructions.push_back(tmpInst);
+}
+
+void emitRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1, MCOperand Op2,
+ SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) {
+ MCInst tmpInst;
+ tmpInst.setOpcode(Opcode);
+ tmpInst.addOperand(MCOperand::createReg(Reg0));
+ tmpInst.addOperand(MCOperand::createReg(Reg1));
+ tmpInst.addOperand(Op2);
+ tmpInst.setLoc(IDLoc);
+ Instructions.push_back(tmpInst);
+}
+
+void emitRRR(unsigned Opcode, unsigned Reg0, unsigned Reg1, unsigned Reg2,
+ SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) {
+ emitRRX(Opcode, Reg0, Reg1, MCOperand::createReg(Reg2), IDLoc,
+ Instructions);
+}
+
+void emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm,
+ SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) {
+ emitRRX(Opcode, Reg0, Reg1, MCOperand::createImm(Imm), IDLoc,
+ Instructions);
+}
+
+void emitAppropriateDSLL(unsigned DstReg, unsigned SrcReg, int16_t ShiftAmount,
+ SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) {
+ if (ShiftAmount >= 32) {
+ emitRRI(Mips::DSLL32, DstReg, SrcReg, ShiftAmount - 32, IDLoc,
+ Instructions);
+ return;
+ }
+
+ emitRRI(Mips::DSLL, DstReg, SrcReg, ShiftAmount, IDLoc, Instructions);
+}
+} // end anonymous namespace.
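As a usage sketch (the registers and the stack offset are illustrative, not taken from any particular expansion), an expansion routine builds its replacement sequence through these helpers, for example loading $25 from the stack and calling through it:

  SmallVector<MCInst, 2> Insts;
  emitRRI(Mips::LW, Mips::T9, Mips::SP, 16, IDLoc, Insts);  // lw $25, 16($sp)
  emitRR(Mips::JALR, Mips::RA, Mips::T9, IDLoc, Insts);     // jalr $25

Here IDLoc is the source location of the pseudo being expanded, and Insts is the output list the caller hands back to the streamer.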
+
bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions) {
const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode());
+ bool ExpandedJalSym = false;
Inst.setLoc(IDLoc);
@@ -1365,12 +1571,14 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
return Error(IDLoc, "branch to misaligned address");
break;
case Mips::BEQZ16_MM:
+ case Mips::BEQZC16_MMR6:
case Mips::BNEZ16_MM:
+ case Mips::BNEZC16_MMR6:
assert(MCID.getNumOperands() == 2 && "unexpected number of operands");
Offset = Inst.getOperand(1);
if (!Offset.isImm())
break; // We'll deal with this situation later on when applying fixups.
- if (!isIntN(8, Offset.getImm()))
+ if (!isInt<8>(Offset.getImm()))
return Error(IDLoc, "branch target out of range");
if (OffsetToAlignment(Offset.getImm(), 2LL))
return Error(IDLoc, "branch to misaligned address");
@@ -1415,32 +1623,6 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
}
break;
- case Mips::CINS:
- case Mips::CINS32:
- case Mips::EXTS:
- case Mips::EXTS32:
- assert(MCID.getNumOperands() == 4 && "unexpected number of operands");
- // Check length
- Opnd = Inst.getOperand(3);
- if (!Opnd.isImm())
- return Error(IDLoc, "expected immediate operand kind");
- Imm = Opnd.getImm();
- if (Imm < 0 || Imm > 31)
- return Error(IDLoc, "immediate operand value out of range");
- // Check position
- Opnd = Inst.getOperand(2);
- if (!Opnd.isImm())
- return Error(IDLoc, "expected immediate operand kind");
- Imm = Opnd.getImm();
- if (Imm < 0 || Imm > (Opcode == Mips::CINS ||
- Opcode == Mips::EXTS ? 63 : 31))
- return Error(IDLoc, "immediate operand value out of range");
- if (Imm > 31) {
- Inst.setOpcode(Opcode == Mips::CINS ? Mips::CINS32 : Mips::EXTS32);
- Inst.getOperand(2).setImm(Imm - 32);
- }
- break;
-
case Mips::SEQi:
case Mips::SNEi:
assert(MCID.getNumOperands() == 3 && "unexpected number of operands");
@@ -1454,6 +1636,81 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
}
}
+ // This expansion is not in a function called by tryExpandInstruction()
+ // because the pseudo-instruction doesn't have a distinct opcode.
+ if ((Inst.getOpcode() == Mips::JAL || Inst.getOpcode() == Mips::JAL_MM) &&
+ inPicMode()) {
+ warnIfNoMacro(IDLoc);
+
+ const MCExpr *JalExpr = Inst.getOperand(0).getExpr();
+
+ // We can do this expansion if there's only 1 symbol in the argument
+ // expression.
+ if (countMCSymbolRefExpr(JalExpr) > 1)
+ return Error(IDLoc, "jal doesn't support multiple symbols in PIC mode");
+
+ // FIXME: This checks that the expression can be handled by the later stages
+ // of the assembler. We ought to leave it to those later stages but
+ // we can't do that until we stop evaluateRelocExpr() rewriting the
+ // expressions into non-equivalent forms.
+ const MCSymbol *JalSym = getSingleMCSymbol(JalExpr);
+
+ // FIXME: Add support for label+offset operands (currently causes an error).
+ // FIXME: Add support for forward-declared local symbols.
+ // FIXME: Add expansion for when the LargeGOT option is enabled.
+ if (JalSym->isInSection() || JalSym->isTemporary()) {
+ if (isABI_O32()) {
+ // If it's a local symbol and the O32 ABI is being used, we expand to:
+ // lw $25, 0($gp)
+ // R_(MICRO)MIPS_GOT16 label
+ // addiu $25, $25, 0
+ // R_(MICRO)MIPS_LO16 label
+ // jalr $25
+ const MCExpr *Got16RelocExpr = evaluateRelocExpr(JalExpr, "got");
+ const MCExpr *Lo16RelocExpr = evaluateRelocExpr(JalExpr, "lo");
+
+ emitRRX(Mips::LW, Mips::T9, Mips::GP,
+ MCOperand::createExpr(Got16RelocExpr), IDLoc, Instructions);
+ emitRRX(Mips::ADDiu, Mips::T9, Mips::T9,
+ MCOperand::createExpr(Lo16RelocExpr), IDLoc, Instructions);
+ } else if (isABI_N32() || isABI_N64()) {
+ // If it's a local symbol and the N32/N64 ABIs are being used,
+ // we expand to:
+ // lw/ld $25, 0($gp)
+ // R_(MICRO)MIPS_GOT_DISP label
+ // jalr $25
+ const MCExpr *GotDispRelocExpr = evaluateRelocExpr(JalExpr, "got_disp");
+
+ emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9, Mips::GP,
+ MCOperand::createExpr(GotDispRelocExpr), IDLoc, Instructions);
+ }
+ } else {
+ // If it's an external/weak symbol, we expand to:
+ // lw/ld $25, 0($gp)
+ // R_(MICRO)MIPS_CALL16 label
+ // jalr $25
+ const MCExpr *Call16RelocExpr = evaluateRelocExpr(JalExpr, "call16");
+
+ emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9, Mips::GP,
+ MCOperand::createExpr(Call16RelocExpr), IDLoc, Instructions);
+ }
+
+ MCInst JalrInst;
+ if (IsCpRestoreSet && inMicroMipsMode())
+ JalrInst.setOpcode(Mips::JALRS_MM);
+ else
+ JalrInst.setOpcode(inMicroMipsMode() ? Mips::JALR_MM : Mips::JALR);
+ JalrInst.addOperand(MCOperand::createReg(Mips::RA));
+ JalrInst.addOperand(MCOperand::createReg(Mips::T9));
+
+ // FIXME: Add an R_(MICRO)MIPS_JALR relocation after the JALR.
+ // This relocation is supposed to be an optimization hint for the linker
+ // and is not necessary for correctness.
+
+ Inst = JalrInst;
+ ExpandedJalSym = true;
+ }
+
if (MCID.mayLoad() || MCID.mayStore()) {
// Check the offset of memory operand, if it is a symbol
// reference or immediate we may have to expand instructions.
@@ -1500,17 +1757,14 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
int MemOffset = Op.getImm();
MCOperand &DstReg = Inst.getOperand(0);
MCOperand &BaseReg = Inst.getOperand(1);
- if (isIntN(9, MemOffset) && (MemOffset % 4 == 0) &&
+ if (isInt<9>(MemOffset) && (MemOffset % 4 == 0) &&
getContext().getRegisterInfo()->getRegClass(
Mips::GPRMM16RegClassID).contains(DstReg.getReg()) &&
- BaseReg.getReg() == Mips::GP) {
- MCInst TmpInst;
- TmpInst.setLoc(IDLoc);
- TmpInst.setOpcode(Mips::LWGP_MM);
- TmpInst.addOperand(MCOperand::createReg(DstReg.getReg()));
- TmpInst.addOperand(MCOperand::createReg(Mips::GP));
- TmpInst.addOperand(MCOperand::createImm(MemOffset));
- Instructions.push_back(TmpInst);
+ (BaseReg.getReg() == Mips::GP ||
+ BaseReg.getReg() == Mips::GP_64)) {
+
+ emitRRI(Mips::LWGP_MM, DstReg.getReg(), Mips::GP, MemOffset,
+ IDLoc, Instructions);
return false;
}
}
@@ -1597,7 +1851,14 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
if (Imm < -1 || Imm > 14)
return Error(IDLoc, "immediate operand value out of range");
break;
+ case Mips::TEQ_MM:
+ case Mips::TGE_MM:
+ case Mips::TGEU_MM:
+ case Mips::TLT_MM:
+ case Mips::TLTU_MM:
+ case Mips::TNE_MM:
case Mips::SB16_MM:
+ case Mips::SB16_MMR6:
Opnd = Inst.getOperand(2);
if (!Opnd.isImm())
return Error(IDLoc, "expected immediate operand kind");
@@ -1607,6 +1868,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
break;
case Mips::LHU16_MM:
case Mips::SH16_MM:
+ case Mips::SH16_MMR6:
Opnd = Inst.getOperand(2);
if (!Opnd.isImm())
return Error(IDLoc, "expected immediate operand kind");
@@ -1616,6 +1878,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
break;
case Mips::LW16_MM:
case Mips::SW16_MM:
+ case Mips::SW16_MMR6:
Opnd = Inst.getOperand(2);
if (!Opnd.isImm())
return Error(IDLoc, "expected immediate operand kind");
@@ -1623,93 +1886,111 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
if (Imm < 0 || Imm > 60 || (Imm % 4 != 0))
return Error(IDLoc, "immediate operand value out of range");
break;
- case Mips::CACHE:
- case Mips::PREF:
- Opnd = Inst.getOperand(2);
- if (!Opnd.isImm())
- return Error(IDLoc, "expected immediate operand kind");
- Imm = Opnd.getImm();
- if (!isUInt<5>(Imm))
- return Error(IDLoc, "immediate operand value out of range");
- break;
case Mips::ADDIUPC_MM:
MCOperand Opnd = Inst.getOperand(1);
if (!Opnd.isImm())
return Error(IDLoc, "expected immediate operand kind");
int Imm = Opnd.getImm();
- if ((Imm % 4 != 0) || !isIntN(25, Imm))
+ if ((Imm % 4 != 0) || !isInt<25>(Imm))
return Error(IDLoc, "immediate operand value out of range");
break;
}
}
- if (needsExpansion(Inst)) {
- if (expandInstruction(Inst, IDLoc, Instructions))
- return true;
- } else
+ MacroExpanderResultTy ExpandResult =
+ tryExpandInstruction(Inst, IDLoc, Instructions);
+ switch (ExpandResult) {
+ case MER_NotAMacro:
Instructions.push_back(Inst);
+ break;
+ case MER_Success:
+ break;
+ case MER_Fail:
+ return true;
+ }
// If this instruction has a delay slot and .set reorder is active,
// emit a NOP after it.
if (MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder())
createNop(hasShortDelaySlot(Inst.getOpcode()), IDLoc, Instructions);
- return false;
-}
+ if ((Inst.getOpcode() == Mips::JalOneReg ||
+ Inst.getOpcode() == Mips::JalTwoReg || ExpandedJalSym) &&
+ isPicAndNotNxxAbi()) {
+ if (IsCpRestoreSet) {
+ // We need a NOP between the JALR and the LW:
+ // If .set reorder has been used, we've already emitted a NOP.
+ // If .set noreorder has been used, we need to emit a NOP at this point.
+ if (!AssemblerOptions.back()->isReorder())
+ createNop(hasShortDelaySlot(Inst.getOpcode()), IDLoc, Instructions);
-bool MipsAsmParser::needsExpansion(MCInst &Inst) {
+ // Load the $gp from the stack.
+ SmallVector<MCInst, 3> LoadInsts;
+ createCpRestoreMemOp(true /*IsLoad*/, CpRestoreOffset /*StackOffset*/,
+ IDLoc, LoadInsts);
- switch (Inst.getOpcode()) {
- case Mips::LoadImm32:
- case Mips::LoadImm64:
- case Mips::LoadAddrImm32:
- case Mips::LoadAddrReg32:
- case Mips::B_MM_Pseudo:
- case Mips::LWM_MM:
- case Mips::SWM_MM:
- case Mips::JalOneReg:
- case Mips::JalTwoReg:
- case Mips::BneImm:
- case Mips::BeqImm:
- case Mips::BLT:
- case Mips::BLE:
- case Mips::BGE:
- case Mips::BGT:
- case Mips::BLTU:
- case Mips::BLEU:
- case Mips::BGEU:
- case Mips::BGTU:
- case Mips::Ulhu:
- case Mips::Ulw:
- return true;
- default:
- return false;
+ for (const MCInst &Inst : LoadInsts)
+ Instructions.push_back(Inst);
+
+ } else
+ Warning(IDLoc, "no .cprestore used in PIC mode");
}
+
+ return false;
}
-bool MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
+MipsAsmParser::MacroExpanderResultTy
+MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions) {
switch (Inst.getOpcode()) {
- default: llvm_unreachable("unimplemented expansion");
+ default:
+ return MER_NotAMacro;
case Mips::LoadImm32:
- return expandLoadImm(Inst, true, IDLoc, Instructions);
+ return expandLoadImm(Inst, true, IDLoc, Instructions) ? MER_Fail
+ : MER_Success;
case Mips::LoadImm64:
- return expandLoadImm(Inst, false, IDLoc, Instructions);
+ return expandLoadImm(Inst, false, IDLoc, Instructions) ? MER_Fail
+ : MER_Success;
case Mips::LoadAddrImm32:
- return expandLoadAddressImm(Inst, true, IDLoc, Instructions);
+ case Mips::LoadAddrImm64:
+ assert(Inst.getOperand(0).isReg() && "expected register operand kind");
+ assert((Inst.getOperand(1).isImm() || Inst.getOperand(1).isExpr()) &&
+ "expected immediate operand kind");
+
+ return expandLoadAddress(Inst.getOperand(0).getReg(), Mips::NoRegister,
+ Inst.getOperand(1),
+ Inst.getOpcode() == Mips::LoadAddrImm32, IDLoc,
+ Instructions)
+ ? MER_Fail
+ : MER_Success;
case Mips::LoadAddrReg32:
- return expandLoadAddressReg(Inst, true, IDLoc, Instructions);
+ case Mips::LoadAddrReg64:
+ assert(Inst.getOperand(0).isReg() && "expected register operand kind");
+ assert(Inst.getOperand(1).isReg() && "expected register operand kind");
+ assert((Inst.getOperand(2).isImm() || Inst.getOperand(2).isExpr()) &&
+ "expected immediate operand kind");
+
+ return expandLoadAddress(Inst.getOperand(0).getReg(),
+ Inst.getOperand(1).getReg(), Inst.getOperand(2),
+ Inst.getOpcode() == Mips::LoadAddrReg32, IDLoc,
+ Instructions)
+ ? MER_Fail
+ : MER_Success;
case Mips::B_MM_Pseudo:
- return expandUncondBranchMMPseudo(Inst, IDLoc, Instructions);
+ case Mips::B_MMR6_Pseudo:
+ return expandUncondBranchMMPseudo(Inst, IDLoc, Instructions) ? MER_Fail
+ : MER_Success;
case Mips::SWM_MM:
case Mips::LWM_MM:
- return expandLoadStoreMultiple(Inst, IDLoc, Instructions);
+ return expandLoadStoreMultiple(Inst, IDLoc, Instructions) ? MER_Fail
+ : MER_Success;
case Mips::JalOneReg:
case Mips::JalTwoReg:
- return expandJalWithRegs(Inst, IDLoc, Instructions);
+ return expandJalWithRegs(Inst, IDLoc, Instructions) ? MER_Fail
+ : MER_Success;
case Mips::BneImm:
case Mips::BeqImm:
- return expandBranchImm(Inst, IDLoc, Instructions);
+ return expandBranchImm(Inst, IDLoc, Instructions) ? MER_Fail : MER_Success;
case Mips::BLT:
case Mips::BLE:
case Mips::BGE:
@@ -1718,78 +1999,97 @@ bool MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc,
case Mips::BLEU:
case Mips::BGEU:
case Mips::BGTU:
- return expandCondBranches(Inst, IDLoc, Instructions);
+ case Mips::BLTL:
+ case Mips::BLEL:
+ case Mips::BGEL:
+ case Mips::BGTL:
+ case Mips::BLTUL:
+ case Mips::BLEUL:
+ case Mips::BGEUL:
+ case Mips::BGTUL:
+ case Mips::BLTImmMacro:
+ case Mips::BLEImmMacro:
+ case Mips::BGEImmMacro:
+ case Mips::BGTImmMacro:
+ case Mips::BLTUImmMacro:
+ case Mips::BLEUImmMacro:
+ case Mips::BGEUImmMacro:
+ case Mips::BGTUImmMacro:
+ case Mips::BLTLImmMacro:
+ case Mips::BLELImmMacro:
+ case Mips::BGELImmMacro:
+ case Mips::BGTLImmMacro:
+ case Mips::BLTULImmMacro:
+ case Mips::BLEULImmMacro:
+ case Mips::BGEULImmMacro:
+ case Mips::BGTULImmMacro:
+ return expandCondBranches(Inst, IDLoc, Instructions) ? MER_Fail
+ : MER_Success;
+ case Mips::SDivMacro:
+ return expandDiv(Inst, IDLoc, Instructions, false, true) ? MER_Fail
+ : MER_Success;
+ case Mips::DSDivMacro:
+ return expandDiv(Inst, IDLoc, Instructions, true, true) ? MER_Fail
+ : MER_Success;
+ case Mips::UDivMacro:
+ return expandDiv(Inst, IDLoc, Instructions, false, false) ? MER_Fail
+ : MER_Success;
+ case Mips::DUDivMacro:
+ return expandDiv(Inst, IDLoc, Instructions, true, false) ? MER_Fail
+ : MER_Success;
+ case Mips::Ulh:
+ return expandUlh(Inst, true, IDLoc, Instructions) ? MER_Fail : MER_Success;
case Mips::Ulhu:
- return expandUlhu(Inst, IDLoc, Instructions);
+ return expandUlh(Inst, false, IDLoc, Instructions) ? MER_Fail : MER_Success;
case Mips::Ulw:
- return expandUlw(Inst, IDLoc, Instructions);
+ return expandUlw(Inst, IDLoc, Instructions) ? MER_Fail : MER_Success;
+ case Mips::NORImm:
+ return expandAliasImmediate(Inst, IDLoc, Instructions) ? MER_Fail
+ : MER_Success;
+ case Mips::ADDi:
+ case Mips::ADDiu:
+ case Mips::SLTi:
+ case Mips::SLTiu:
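+    // These are only macros when the immediate does not fit the signed
+    // 16-bit field of the real instruction; in-range immediates are encoded
+    // directly.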
+ if ((Inst.getNumOperands() == 3) && Inst.getOperand(0).isReg() &&
+ Inst.getOperand(1).isReg() && Inst.getOperand(2).isImm()) {
+ int64_t ImmValue = Inst.getOperand(2).getImm();
+ if (isInt<16>(ImmValue))
+ return MER_NotAMacro;
+ return expandAliasImmediate(Inst, IDLoc, Instructions) ? MER_Fail
+ : MER_Success;
+ }
+ return MER_NotAMacro;
+ case Mips::ANDi:
+ case Mips::ORi:
+ case Mips::XORi:
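+    // Same as above, but the logical operations zero-extend their immediate,
+    // so the limit is an unsigned 16-bit value.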
+ if ((Inst.getNumOperands() == 3) && Inst.getOperand(0).isReg() &&
+ Inst.getOperand(1).isReg() && Inst.getOperand(2).isImm()) {
+ int64_t ImmValue = Inst.getOperand(2).getImm();
+ if (isUInt<16>(ImmValue))
+ return MER_NotAMacro;
+ return expandAliasImmediate(Inst, IDLoc, Instructions) ? MER_Fail
+ : MER_Success;
+ }
+ return MER_NotAMacro;
+ case Mips::ROL:
+ case Mips::ROR:
+ return expandRotation(Inst, IDLoc, Instructions) ? MER_Fail
+ : MER_Success;
+ case Mips::ROLImm:
+ case Mips::RORImm:
+ return expandRotationImm(Inst, IDLoc, Instructions) ? MER_Fail
+ : MER_Success;
+ case Mips::DROL:
+ case Mips::DROR:
+ return expandDRotation(Inst, IDLoc, Instructions) ? MER_Fail
+ : MER_Success;
+ case Mips::DROLImm:
+ case Mips::DRORImm:
+ return expandDRotationImm(Inst, IDLoc, Instructions) ? MER_Fail
+ : MER_Success;
}
}
-namespace {
-void emitRX(unsigned Opcode, unsigned DstReg, MCOperand Imm, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
- MCInst tmpInst;
- tmpInst.setOpcode(Opcode);
- tmpInst.addOperand(MCOperand::createReg(DstReg));
- tmpInst.addOperand(Imm);
- tmpInst.setLoc(IDLoc);
- Instructions.push_back(tmpInst);
-}
-
-void emitRI(unsigned Opcode, unsigned DstReg, int16_t Imm, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
- emitRX(Opcode, DstReg, MCOperand::createImm(Imm), IDLoc, Instructions);
-}
-
-
-void emitRRX(unsigned Opcode, unsigned DstReg, unsigned SrcReg, MCOperand Imm,
- SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) {
- MCInst tmpInst;
- tmpInst.setOpcode(Opcode);
- tmpInst.addOperand(MCOperand::createReg(DstReg));
- tmpInst.addOperand(MCOperand::createReg(SrcReg));
- tmpInst.addOperand(Imm);
- tmpInst.setLoc(IDLoc);
- Instructions.push_back(tmpInst);
-}
-
-void emitRRR(unsigned Opcode, unsigned DstReg, unsigned SrcReg,
- unsigned SrcReg2, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
- emitRRX(Opcode, DstReg, SrcReg, MCOperand::createReg(SrcReg2), IDLoc,
- Instructions);
-}
-
-void emitRRI(unsigned Opcode, unsigned DstReg, unsigned SrcReg, int16_t Imm,
- SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) {
- emitRRX(Opcode, DstReg, SrcReg, MCOperand::createImm(Imm), IDLoc,
- Instructions);
-}
-
-template <int16_t ShiftAmount>
-void createLShiftOri(MCOperand Operand, unsigned RegNo, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
- if (ShiftAmount >= 32)
- emitRRI(Mips::DSLL32, RegNo, RegNo, ShiftAmount - 32, IDLoc, Instructions);
- else if (ShiftAmount > 0)
- emitRRI(Mips::DSLL, RegNo, RegNo, ShiftAmount, IDLoc, Instructions);
-
- // There's no need for an ORi if the immediate is 0.
- if (Operand.isImm() && Operand.getImm() == 0)
- return;
-
- emitRRX(Mips::ORi, RegNo, RegNo, Operand, IDLoc, Instructions);
-}
-
-template <unsigned ShiftAmount>
-void createLShiftOri(int64_t Value, unsigned RegNo, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
- createLShiftOri<ShiftAmount>(MCOperand::createImm(Value), RegNo, IDLoc,
- Instructions);
-}
-}
-
bool MipsAsmParser::expandJalWithRegs(MCInst &Inst, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions) {
// Create a JALR instruction which is going to replace the pseudo-JAL.
@@ -1800,8 +2100,11 @@ bool MipsAsmParser::expandJalWithRegs(MCInst &Inst, SMLoc IDLoc,
if (Opcode == Mips::JalOneReg) {
// jal $rs => jalr $rs
- if (inMicroMipsMode()) {
- JalrInst.setOpcode(Mips::JALR16_MM);
+ if (IsCpRestoreSet && inMicroMipsMode()) {
+ JalrInst.setOpcode(Mips::JALRS16_MM);
+ JalrInst.addOperand(FirstRegOp);
+ } else if (inMicroMipsMode()) {
+ JalrInst.setOpcode(hasMips32r6() ? Mips::JALRC16_MMR6 : Mips::JALR16_MM);
JalrInst.addOperand(FirstRegOp);
} else {
JalrInst.setOpcode(Mips::JALR);
@@ -1810,30 +2113,47 @@ bool MipsAsmParser::expandJalWithRegs(MCInst &Inst, SMLoc IDLoc,
}
} else if (Opcode == Mips::JalTwoReg) {
// jal $rd, $rs => jalr $rd, $rs
- JalrInst.setOpcode(inMicroMipsMode() ? Mips::JALR_MM : Mips::JALR);
+ if (IsCpRestoreSet && inMicroMipsMode())
+ JalrInst.setOpcode(Mips::JALRS_MM);
+ else
+ JalrInst.setOpcode(inMicroMipsMode() ? Mips::JALR_MM : Mips::JALR);
JalrInst.addOperand(FirstRegOp);
const MCOperand SecondRegOp = Inst.getOperand(1);
JalrInst.addOperand(SecondRegOp);
}
Instructions.push_back(JalrInst);
- // If .set reorder is active, emit a NOP after it.
- if (AssemblerOptions.back()->isReorder()) {
- // This is a 32-bit NOP because these 2 pseudo-instructions
- // do not have a short delay slot.
- MCInst NopInst;
- NopInst.setOpcode(Mips::SLL);
- NopInst.addOperand(MCOperand::createReg(Mips::ZERO));
- NopInst.addOperand(MCOperand::createReg(Mips::ZERO));
- NopInst.addOperand(MCOperand::createImm(0));
- Instructions.push_back(NopInst);
+ // If .set reorder is active and branch instruction has a delay slot,
+ // emit a NOP after it.
+ const MCInstrDesc &MCID = getInstDesc(JalrInst.getOpcode());
+ if (MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder()) {
+ createNop(hasShortDelaySlot(JalrInst.getOpcode()), IDLoc, Instructions);
}
return false;
}
+/// Can the value be represented by an unsigned N-bit value and a shift left?
+template <unsigned N> static bool isShiftedUIntAtAnyPosition(uint64_t x) {
+ unsigned BitNum = findFirstSet(x);
+
+ return (x == x >> BitNum << BitNum) && isUInt<N>(x >> BitNum);
+}
+
+/// Load (or add) an immediate into a register.
+///
+/// @param ImmValue The immediate to load.
+/// @param DstReg The register that will hold the immediate.
+/// @param SrcReg A register to add to the immediate or Mips::NoRegister
+/// for a simple initialization.
+/// @param Is32BitImm Is ImmValue 32-bit or 64-bit?
+/// @param IsAddress True if the immediate represents an address. False if it
+/// is an integer.
+/// @param IDLoc Location of the immediate in the source file.
+/// @param Instructions The instructions emitted by this expansion.
bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
- unsigned SrcReg, bool Is32BitImm, SMLoc IDLoc,
+ unsigned SrcReg, bool Is32BitImm,
+ bool IsAddress, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions) {
if (!Is32BitImm && !isGP64bit()) {
Error(IDLoc, "instruction requires a 64-bit architecture");
@@ -1852,6 +2172,9 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
}
}
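+  // Addresses use the pointer-sized zero register so the register classes
+  // stay consistent; 64-bit immediates accumulate with daddu rather than
+  // addu.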
+ unsigned ZeroReg = IsAddress ? ABI.GetNullPtr() : ABI.GetZeroReg();
+ unsigned AdduOp = !Is32BitImm ? Mips::DADDu : Mips::ADDu;
+
bool UseSrcReg = false;
if (SrcReg != Mips::NoRegister)
UseSrcReg = true;
@@ -1866,111 +2189,129 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
TmpReg = ATReg;
}
- // FIXME: gas has a special case for values that are 000...1111, which
- // becomes a li -1 and then a dsrl
if (isInt<16>(ImmValue)) {
- // li d,j => addiu d,$zero,j
if (!UseSrcReg)
- SrcReg = Mips::ZERO;
+ SrcReg = ZeroReg;
+
+ // This doesn't quite follow the usual ABI expectations for N32 but matches
+ // traditional assembler behaviour. N32 would normally use addiu for both
+ // integers and addresses.
+ if (IsAddress && !Is32BitImm) {
+ emitRRI(Mips::DADDiu, DstReg, SrcReg, ImmValue, IDLoc, Instructions);
+ return false;
+ }
+
emitRRI(Mips::ADDiu, DstReg, SrcReg, ImmValue, IDLoc, Instructions);
- } else if (isUInt<16>(ImmValue)) {
- // li d,j => ori d,$zero,j
+ return false;
+ }
+
+ if (isUInt<16>(ImmValue)) {
unsigned TmpReg = DstReg;
if (SrcReg == DstReg) {
- unsigned ATReg = getATReg(IDLoc);
- if (!ATReg)
+ TmpReg = getATReg(IDLoc);
+ if (!TmpReg)
return true;
- TmpReg = ATReg;
}
- emitRRI(Mips::ORi, TmpReg, Mips::ZERO, ImmValue, IDLoc, Instructions);
+ emitRRI(Mips::ORi, TmpReg, ZeroReg, ImmValue, IDLoc, Instructions);
if (UseSrcReg)
- emitRRR(Mips::ADDu, DstReg, TmpReg, SrcReg, IDLoc, Instructions);
- } else if (isInt<32>(ImmValue) || isUInt<32>(ImmValue)) {
+ emitRRR(ABI.GetPtrAdduOp(), DstReg, TmpReg, SrcReg, IDLoc, Instructions);
+ return false;
+ }
+
+ if (isInt<32>(ImmValue) || isUInt<32>(ImmValue)) {
warnIfNoMacro(IDLoc);
- // For all other values which are representable as a 32-bit integer:
- // li d,j => lui d,hi16(j)
- // ori d,d,lo16(j)
uint16_t Bits31To16 = (ImmValue >> 16) & 0xffff;
uint16_t Bits15To0 = ImmValue & 0xffff;
if (!Is32BitImm && !isInt<32>(ImmValue)) {
- // For DLI, expand to an ORi instead of a LUi to avoid sign-extending the
+      // Traditional behaviour seems to special-case this particular value.
+      // It's not clear why other masks are handled differently.
+ if (ImmValue == 0xffffffff) {
+ emitRI(Mips::LUi, TmpReg, 0xffff, IDLoc, Instructions);
+ emitRRI(Mips::DSRL32, TmpReg, TmpReg, 0, IDLoc, Instructions);
+ if (UseSrcReg)
+ emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, Instructions);
+ return false;
+ }
+
+ // Expand to an ORi instead of a LUi to avoid sign-extending into the
// upper 32 bits.
- emitRRI(Mips::ORi, TmpReg, Mips::ZERO, Bits31To16, IDLoc, Instructions);
+ emitRRI(Mips::ORi, TmpReg, ZeroReg, Bits31To16, IDLoc, Instructions);
emitRRI(Mips::DSLL, TmpReg, TmpReg, 16, IDLoc, Instructions);
- } else
- emitRI(Mips::LUi, TmpReg, Bits31To16, IDLoc, Instructions);
- createLShiftOri<0>(Bits15To0, TmpReg, IDLoc, Instructions);
+ if (Bits15To0)
+ emitRRI(Mips::ORi, TmpReg, TmpReg, Bits15To0, IDLoc, Instructions);
+ if (UseSrcReg)
+ emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, Instructions);
+ return false;
+ }
+ emitRI(Mips::LUi, TmpReg, Bits31To16, IDLoc, Instructions);
+ if (Bits15To0)
+ emitRRI(Mips::ORi, TmpReg, TmpReg, Bits15To0, IDLoc, Instructions);
if (UseSrcReg)
- createAddu(DstReg, TmpReg, SrcReg, !Is32BitImm, Instructions);
-
- } else if ((ImmValue & (0xffffLL << 48)) == 0) {
- warnIfNoMacro(IDLoc);
+ emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, Instructions);
+ return false;
+ }
- // <------- lo32 ------>
- // <------- hi32 ------>
- // <- hi16 -> <- lo16 ->
- // _________________________________
- // | | | |
- // | 16-bits | 16-bits | 16-bits |
- // |__________|__________|__________|
- //
- // For any 64-bit value that is representable as a 48-bit integer:
- // li d,j => lui d,hi16(j)
- // ori d,d,hi16(lo32(j))
- // dsll d,d,16
- // ori d,d,lo16(lo32(j))
- uint16_t Bits47To32 = (ImmValue >> 32) & 0xffff;
- uint16_t Bits31To16 = (ImmValue >> 16) & 0xffff;
- uint16_t Bits15To0 = ImmValue & 0xffff;
+ if (isShiftedUIntAtAnyPosition<16>(ImmValue)) {
+ if (Is32BitImm) {
+ Error(IDLoc, "instruction requires a 32-bit immediate");
+ return true;
+ }
- emitRI(Mips::LUi, TmpReg, Bits47To32, IDLoc, Instructions);
- createLShiftOri<0>(Bits31To16, TmpReg, IDLoc, Instructions);
- createLShiftOri<16>(Bits15To0, TmpReg, IDLoc, Instructions);
+ // Traditionally, these immediates are shifted as little as possible and as
+ // such we align the most significant bit to bit 15 of our temporary.
+ unsigned FirstSet = findFirstSet((uint64_t)ImmValue);
+ unsigned LastSet = findLastSet((uint64_t)ImmValue);
+ unsigned ShiftAmount = FirstSet - (15 - (LastSet - FirstSet));
+ uint16_t Bits = (ImmValue >> ShiftAmount) & 0xffff;
+ emitRRI(Mips::ORi, TmpReg, ZeroReg, Bits, IDLoc, Instructions);
+ emitRRI(Mips::DSLL, TmpReg, TmpReg, ShiftAmount, IDLoc, Instructions);
if (UseSrcReg)
- createAddu(DstReg, TmpReg, SrcReg, !Is32BitImm, Instructions);
+ emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, Instructions);
- } else {
- warnIfNoMacro(IDLoc);
+ return false;
+ }
- // <------- hi32 ------> <------- lo32 ------>
- // <- hi16 -> <- lo16 ->
- // ___________________________________________
- // | | | | |
- // | 16-bits | 16-bits | 16-bits | 16-bits |
- // |__________|__________|__________|__________|
- //
- // For all other values which are representable as a 64-bit integer:
- // li d,j => lui d,hi16(j)
- // ori d,d,lo16(hi32(j))
- // dsll d,d,16
- // ori d,d,hi16(lo32(j))
- // dsll d,d,16
- // ori d,d,lo16(lo32(j))
- uint16_t Bits63To48 = (ImmValue >> 48) & 0xffff;
- uint16_t Bits47To32 = (ImmValue >> 32) & 0xffff;
- uint16_t Bits31To16 = (ImmValue >> 16) & 0xffff;
- uint16_t Bits15To0 = ImmValue & 0xffff;
+ warnIfNoMacro(IDLoc);
- emitRI(Mips::LUi, TmpReg, Bits63To48, IDLoc, Instructions);
- createLShiftOri<0>(Bits47To32, TmpReg, IDLoc, Instructions);
+  // The remaining case is handled with a sequence of dsll and ori, omitting
+  // zero chunks and coalescing any neighbouring dslls.
+  // The highest 32 bits are equivalent to a 32-bit immediate load.
- // When Bits31To16 is 0, do a left shift of 32 bits instead of doing
- // two left shifts of 16 bits.
- if (Bits31To16 == 0) {
- createLShiftOri<32>(Bits15To0, TmpReg, IDLoc, Instructions);
- } else {
- createLShiftOri<16>(Bits31To16, TmpReg, IDLoc, Instructions);
- createLShiftOri<16>(Bits15To0, TmpReg, IDLoc, Instructions);
+ // Load bits 32-63 of ImmValue into bits 0-31 of the temporary register.
+ if (loadImmediate(ImmValue >> 32, TmpReg, Mips::NoRegister, true, false,
+ IDLoc, Instructions))
+ return false;
+
+ // Shift and accumulate into the register. If a 16-bit chunk is zero, then
+ // skip it and defer the shift to the next chunk.
+ unsigned ShiftCarriedForwards = 16;
+ for (int BitNum = 16; BitNum >= 0; BitNum -= 16) {
+ uint16_t ImmChunk = (ImmValue >> BitNum) & 0xffff;
+
+ if (ImmChunk != 0) {
+ emitAppropriateDSLL(TmpReg, TmpReg, ShiftCarriedForwards, IDLoc,
+ Instructions);
+ emitRRI(Mips::ORi, TmpReg, TmpReg, ImmChunk, IDLoc, Instructions);
+ ShiftCarriedForwards = 0;
}
- if (UseSrcReg)
- createAddu(DstReg, TmpReg, SrcReg, !Is32BitImm, Instructions);
+ ShiftCarriedForwards += 16;
}
+ ShiftCarriedForwards -= 16;
+
+ // Finish any remaining shifts left by trailing zeros.
+ if (ShiftCarriedForwards)
+ emitAppropriateDSLL(TmpReg, TmpReg, ShiftCarriedForwards, IDLoc,
+ Instructions);
+
+ if (UseSrcReg)
+ emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, Instructions);
+
return false;
}
@@ -1982,63 +2323,38 @@ bool MipsAsmParser::expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc,
assert(DstRegOp.isReg() && "expected register operand kind");
if (loadImmediate(ImmOp.getImm(), DstRegOp.getReg(), Mips::NoRegister,
- Is32BitImm, IDLoc, Instructions))
+ Is32BitImm, false, IDLoc, Instructions))
return true;
return false;
}
-bool
-MipsAsmParser::expandLoadAddressReg(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
- const MCOperand &DstRegOp = Inst.getOperand(0);
- assert(DstRegOp.isReg() && "expected register operand kind");
-
- const MCOperand &SrcRegOp = Inst.getOperand(1);
- assert(SrcRegOp.isReg() && "expected register operand kind");
-
- const MCOperand &ImmOp = Inst.getOperand(2);
- assert((ImmOp.isImm() || ImmOp.isExpr()) &&
- "expected immediate operand kind");
- if (!ImmOp.isImm()) {
- if (loadAndAddSymbolAddress(ImmOp.getExpr(), DstRegOp.getReg(),
- SrcRegOp.getReg(), Is32BitImm, IDLoc,
- Instructions))
- return true;
-
- return false;
- }
-
- if (loadImmediate(ImmOp.getImm(), DstRegOp.getReg(), SrcRegOp.getReg(),
- Is32BitImm, IDLoc, Instructions))
+bool MipsAsmParser::expandLoadAddress(unsigned DstReg, unsigned BaseReg,
+ const MCOperand &Offset,
+ bool Is32BitAddress, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions) {
+ // la can't produce a usable address when addresses are 64-bit.
+ if (Is32BitAddress && ABI.ArePtrs64bit()) {
+ // FIXME: Demote this to a warning and continue as if we had 'dla' instead.
+ // We currently can't do this because we depend on the equality
+ // operator and N64 can end up with a GPR32/GPR64 mismatch.
+ Error(IDLoc, "la used to load 64-bit address");
+ // Continue as if we had 'dla' instead.
+ Is32BitAddress = false;
+ }
+
+ // dla requires 64-bit addresses.
+ if (!Is32BitAddress && !ABI.ArePtrs64bit()) {
+ Error(IDLoc, "instruction requires a 64-bit architecture");
return true;
-
- return false;
-}
-
-bool
-MipsAsmParser::expandLoadAddressImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
- const MCOperand &DstRegOp = Inst.getOperand(0);
- assert(DstRegOp.isReg() && "expected register operand kind");
-
- const MCOperand &ImmOp = Inst.getOperand(1);
- assert((ImmOp.isImm() || ImmOp.isExpr()) &&
- "expected immediate operand kind");
- if (!ImmOp.isImm()) {
- if (loadAndAddSymbolAddress(ImmOp.getExpr(), DstRegOp.getReg(),
- Mips::NoRegister, Is32BitImm, IDLoc,
- Instructions))
- return true;
-
- return false;
}
- if (loadImmediate(ImmOp.getImm(), DstRegOp.getReg(), Mips::NoRegister,
- Is32BitImm, IDLoc, Instructions))
- return true;
+ if (!Offset.isImm())
+ return loadAndAddSymbolAddress(Offset.getExpr(), DstReg, BaseReg,
+ Is32BitAddress, IDLoc, Instructions);
- return false;
+ return loadImmediate(Offset.getImm(), DstReg, BaseReg, Is32BitAddress, true,
+ IDLoc, Instructions);
}
bool MipsAsmParser::loadAndAddSymbolAddress(
@@ -2046,67 +2362,102 @@ bool MipsAsmParser::loadAndAddSymbolAddress(
SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) {
warnIfNoMacro(IDLoc);
- if (Is32BitSym && isABI_N64())
- Warning(IDLoc, "instruction loads the 32-bit address of a 64-bit symbol");
-
- MCInst tmpInst;
- const MCSymbolRefExpr *Symbol = cast<MCSymbolRefExpr>(SymExpr);
- const MCSymbolRefExpr *HiExpr = MCSymbolRefExpr::create(
- &Symbol->getSymbol(), MCSymbolRefExpr::VK_Mips_ABS_HI, getContext());
- const MCSymbolRefExpr *LoExpr = MCSymbolRefExpr::create(
- &Symbol->getSymbol(), MCSymbolRefExpr::VK_Mips_ABS_LO, getContext());
+ const MCExpr *Symbol = cast<MCExpr>(SymExpr);
+ const MipsMCExpr *HiExpr = MipsMCExpr::create(
+ MCSymbolRefExpr::VK_Mips_ABS_HI, Symbol, getContext());
+ const MipsMCExpr *LoExpr = MipsMCExpr::create(
+ MCSymbolRefExpr::VK_Mips_ABS_LO, Symbol, getContext());
bool UseSrcReg = SrcReg != Mips::NoRegister;
+ // This is the 64-bit symbol address expansion.
+ if (ABI.ArePtrs64bit() && isGP64bit()) {
+ // We always need AT for the 64-bit expansion.
+ // If it is not available we exit.
+ unsigned ATReg = getATReg(IDLoc);
+ if (!ATReg)
+ return true;
+
+ const MipsMCExpr *HighestExpr = MipsMCExpr::create(
+ MCSymbolRefExpr::VK_Mips_HIGHEST, Symbol, getContext());
+ const MipsMCExpr *HigherExpr = MipsMCExpr::create(
+ MCSymbolRefExpr::VK_Mips_HIGHER, Symbol, getContext());
+
+ if (UseSrcReg && (DstReg == SrcReg)) {
+ // If $rs is the same as $rd:
+ // (d)la $rd, sym($rd) => lui $at, %highest(sym)
+ // daddiu $at, $at, %higher(sym)
+ // dsll $at, $at, 16
+ // daddiu $at, $at, %hi(sym)
+ // dsll $at, $at, 16
+ // daddiu $at, $at, %lo(sym)
+ // daddu $rd, $at, $rd
+ emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HighestExpr), IDLoc,
+ Instructions);
+ emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(HigherExpr),
+ IDLoc, Instructions);
+ emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, Instructions);
+ emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(HiExpr), IDLoc,
+ Instructions);
+ emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, Instructions);
+ emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr), IDLoc,
+ Instructions);
+ emitRRR(Mips::DADDu, DstReg, ATReg, SrcReg, IDLoc, Instructions);
+
+ return false;
+ }
+
+ // Otherwise, if the $rs is different from $rd or if $rs isn't specified:
+ // (d)la $rd, sym/sym($rs) => lui $rd, %highest(sym)
+ // lui $at, %hi(sym)
+ // daddiu $rd, $rd, %higher(sym)
+ // daddiu $at, $at, %lo(sym)
+ // dsll32 $rd, $rd, 0
+ // daddu $rd, $rd, $at
+ // (daddu $rd, $rd, $rs)
+ emitRX(Mips::LUi, DstReg, MCOperand::createExpr(HighestExpr), IDLoc,
+ Instructions);
+ emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HiExpr), IDLoc,
+ Instructions);
+ emitRRX(Mips::DADDiu, DstReg, DstReg, MCOperand::createExpr(HigherExpr),
+ IDLoc, Instructions);
+ emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr), IDLoc,
+ Instructions);
+ emitRRI(Mips::DSLL32, DstReg, DstReg, 0, IDLoc, Instructions);
+ emitRRR(Mips::DADDu, DstReg, DstReg, ATReg, IDLoc, Instructions);
+ if (UseSrcReg)
+ emitRRR(Mips::DADDu, DstReg, DstReg, SrcReg, IDLoc, Instructions);
+
+ return false;
+ }
+
+ // And now, the 32-bit symbol address expansion:
+ // If $rs is the same as $rd:
+ // (d)la $rd, sym($rd) => lui $at, %hi(sym)
+ // ori $at, $at, %lo(sym)
+ // addu $rd, $at, $rd
+ // Otherwise, if the $rs is different from $rd or if $rs isn't specified:
+ // (d)la $rd, sym/sym($rs) => lui $rd, %hi(sym)
+ // ori $rd, $rd, %lo(sym)
+ // (addu $rd, $rd, $rs)
unsigned TmpReg = DstReg;
if (UseSrcReg && (DstReg == SrcReg)) {
- // At this point we need AT to perform the expansions and we exit if it is
- // not available.
+ // If $rs is the same as $rd, we need to use AT.
+ // If it is not available we exit.
unsigned ATReg = getATReg(IDLoc);
if (!ATReg)
return true;
TmpReg = ATReg;
}
- if (!Is32BitSym) {
- // If it's a 64-bit architecture, expand to:
- // la d,sym => lui d,highest(sym)
- // ori d,d,higher(sym)
- // dsll d,d,16
- // ori d,d,hi16(sym)
- // dsll d,d,16
- // ori d,d,lo16(sym)
- const MCSymbolRefExpr *HighestExpr = MCSymbolRefExpr::create(
- &Symbol->getSymbol(), MCSymbolRefExpr::VK_Mips_HIGHEST, getContext());
- const MCSymbolRefExpr *HigherExpr = MCSymbolRefExpr::create(
- &Symbol->getSymbol(), MCSymbolRefExpr::VK_Mips_HIGHER, getContext());
-
- tmpInst.setOpcode(Mips::LUi);
- tmpInst.addOperand(MCOperand::createReg(TmpReg));
- tmpInst.addOperand(MCOperand::createExpr(HighestExpr));
- Instructions.push_back(tmpInst);
-
- createLShiftOri<0>(MCOperand::createExpr(HigherExpr), TmpReg, SMLoc(),
- Instructions);
- createLShiftOri<16>(MCOperand::createExpr(HiExpr), TmpReg, SMLoc(),
- Instructions);
- createLShiftOri<16>(MCOperand::createExpr(LoExpr), TmpReg, SMLoc(),
- Instructions);
- } else {
- // Otherwise, expand to:
- // la d,sym => lui d,hi16(sym)
- // ori d,d,lo16(sym)
- tmpInst.setOpcode(Mips::LUi);
- tmpInst.addOperand(MCOperand::createReg(TmpReg));
- tmpInst.addOperand(MCOperand::createExpr(HiExpr));
- Instructions.push_back(tmpInst);
-
- emitRRX(Mips::ADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr), SMLoc(),
- Instructions);
- }
+ emitRX(Mips::LUi, TmpReg, MCOperand::createExpr(HiExpr), IDLoc, Instructions);
+ emitRRX(Mips::ADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr), IDLoc,
+ Instructions);
if (UseSrcReg)
- createAddu(DstReg, TmpReg, SrcReg, !Is32BitSym, Instructions);
+ emitRRR(Mips::ADDu, DstReg, TmpReg, SrcReg, IDLoc, Instructions);
+ else
+ assert(DstReg == TmpReg);
return false;
}
@@ -2125,12 +2476,13 @@ bool MipsAsmParser::expandUncondBranchMMPseudo(
Inst.addOperand(MCOperand::createExpr(Offset.getExpr()));
} else {
assert(Offset.isImm() && "expected immediate operand kind");
- if (isIntN(11, Offset.getImm())) {
+ if (isInt<11>(Offset.getImm())) {
// If offset fits into 11 bits then this instruction becomes microMIPS
// 16-bit unconditional branch instruction.
- Inst.setOpcode(Mips::B16_MM);
+ if (inMicroMipsMode())
+ Inst.setOpcode(hasMips32r6() ? Mips::BC16_MMR6 : Mips::B16_MM);
} else {
- if (!isIntN(17, Offset.getImm()))
+ if (!isInt<17>(Offset.getImm()))
Error(IDLoc, "branch target out of range");
if (OffsetToAlignment(Offset.getImm(), 1LL << 1))
Error(IDLoc, "branch to misaligned address");
@@ -2143,8 +2495,10 @@ bool MipsAsmParser::expandUncondBranchMMPseudo(
}
Instructions.push_back(Inst);
- // If .set reorder is active, emit a NOP after the branch instruction.
- if (AssemblerOptions.back()->isReorder())
+ // If .set reorder is active and branch instruction has a delay slot,
+ // emit a NOP after it.
+ const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode());
+ if (MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder())
createNop(true, IDLoc, Instructions);
return false;
@@ -2175,30 +2529,21 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc,
}
int64_t ImmValue = ImmOp.getImm();
- if (ImmValue == 0) {
- MCInst BranchInst;
- BranchInst.setOpcode(OpCode);
- BranchInst.addOperand(DstRegOp);
- BranchInst.addOperand(MCOperand::createReg(Mips::ZERO));
- BranchInst.addOperand(MemOffsetOp);
- Instructions.push_back(BranchInst);
- } else {
+ if (ImmValue == 0)
+ emitRRX(OpCode, DstRegOp.getReg(), Mips::ZERO, MemOffsetOp, IDLoc,
+ Instructions);
+ else {
warnIfNoMacro(IDLoc);
unsigned ATReg = getATReg(IDLoc);
if (!ATReg)
return true;
- if (loadImmediate(ImmValue, ATReg, Mips::NoRegister, !isGP64bit(), IDLoc,
- Instructions))
+ if (loadImmediate(ImmValue, ATReg, Mips::NoRegister, !isGP64bit(), true,
+ IDLoc, Instructions))
return true;
- MCInst BranchInst;
- BranchInst.setOpcode(OpCode);
- BranchInst.addOperand(DstRegOp);
- BranchInst.addOperand(MCOperand::createReg(ATReg));
- BranchInst.addOperand(MemOffsetOp);
- Instructions.push_back(BranchInst);
+ emitRRX(OpCode, DstRegOp.getReg(), ATReg, MemOffsetOp, IDLoc, Instructions);
}
return false;
}
@@ -2206,7 +2551,6 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc,
void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions,
bool isLoad, bool isImmOpnd) {
- MCInst TempInst;
unsigned ImmOffset, HiOffset, LoOffset;
const MCExpr *ExprOffset;
unsigned TmpRegNum;
@@ -2227,8 +2571,6 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
HiOffset++;
} else
ExprOffset = Inst.getOperand(2).getExpr();
- // All instructions will have the same location.
- TempInst.setLoc(IDLoc);
// These are some of the types of expansions we perform here:
// 1) lw $8, sym => lui $8, %hi(sym)
// lw $8, %lo(sym)($8)
@@ -2267,40 +2609,20 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
return;
}
- TempInst.setOpcode(Mips::LUi);
- TempInst.addOperand(MCOperand::createReg(TmpRegNum));
- if (isImmOpnd)
- TempInst.addOperand(MCOperand::createImm(HiOffset));
- else {
- const MCExpr *HiExpr = evaluateRelocExpr(ExprOffset, "hi");
- TempInst.addOperand(MCOperand::createExpr(HiExpr));
- }
- // Add the instruction to the list.
- Instructions.push_back(TempInst);
- // Prepare TempInst for next instruction.
- TempInst.clear();
+ emitRX(Mips::LUi, TmpRegNum,
+ isImmOpnd ? MCOperand::createImm(HiOffset)
+ : MCOperand::createExpr(evaluateRelocExpr(ExprOffset, "hi")),
+ IDLoc, Instructions);
// Add temp register to base.
- if (BaseRegNum != Mips::ZERO) {
- TempInst.setOpcode(Mips::ADDu);
- TempInst.addOperand(MCOperand::createReg(TmpRegNum));
- TempInst.addOperand(MCOperand::createReg(TmpRegNum));
- TempInst.addOperand(MCOperand::createReg(BaseRegNum));
- Instructions.push_back(TempInst);
- TempInst.clear();
- }
+ if (BaseRegNum != Mips::ZERO)
+ emitRRR(Mips::ADDu, TmpRegNum, TmpRegNum, BaseRegNum, IDLoc, Instructions);
// And finally, create original instruction with low part
// of offset and new base.
- TempInst.setOpcode(Inst.getOpcode());
- TempInst.addOperand(MCOperand::createReg(RegOpNum));
- TempInst.addOperand(MCOperand::createReg(TmpRegNum));
- if (isImmOpnd)
- TempInst.addOperand(MCOperand::createImm(LoOffset));
- else {
- const MCExpr *LoExpr = evaluateRelocExpr(ExprOffset, "lo");
- TempInst.addOperand(MCOperand::createExpr(LoExpr));
- }
- Instructions.push_back(TempInst);
- TempInst.clear();
+ emitRRX(Inst.getOpcode(), RegOpNum, TmpRegNum,
+ isImmOpnd
+ ? MCOperand::createImm(LoOffset)
+ : MCOperand::createExpr(evaluateRelocExpr(ExprOffset, "lo")),
+ IDLoc, Instructions);
}
bool
@@ -2316,10 +2638,16 @@ MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc,
if (OpNum < 8 && Inst.getOperand(OpNum - 1).getImm() <= 60 &&
Inst.getOperand(OpNum - 1).getImm() >= 0 &&
- Inst.getOperand(OpNum - 2).getReg() == Mips::SP &&
- Inst.getOperand(OpNum - 3).getReg() == Mips::RA)
+ (Inst.getOperand(OpNum - 2).getReg() == Mips::SP ||
+ Inst.getOperand(OpNum - 2).getReg() == Mips::SP_64) &&
+ (Inst.getOperand(OpNum - 3).getReg() == Mips::RA ||
+ Inst.getOperand(OpNum - 3).getReg() == Mips::RA_64)) {
// It can be implemented as SWM16 or LWM16 instruction.
- NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM16_MM : Mips::LWM16_MM;
+ if (inMicroMipsMode() && hasMips32r6())
+ NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM16_MMR6 : Mips::LWM16_MMR6;
+ else
+ NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM16_MM : Mips::LWM16_MM;
+ }
Inst.setOpcode(NewOpcode);
Instructions.push_back(Inst);
@@ -2328,44 +2656,126 @@ MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc,
bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions) {
+ bool EmittedNoMacroWarning = false;
unsigned PseudoOpcode = Inst.getOpcode();
unsigned SrcReg = Inst.getOperand(0).getReg();
- unsigned TrgReg = Inst.getOperand(1).getReg();
+ const MCOperand &TrgOp = Inst.getOperand(1);
const MCExpr *OffsetExpr = Inst.getOperand(2).getExpr();
unsigned ZeroSrcOpcode, ZeroTrgOpcode;
- bool ReverseOrderSLT, IsUnsigned, AcceptsEquality;
+ bool ReverseOrderSLT, IsUnsigned, IsLikely, AcceptsEquality;
+
+ unsigned TrgReg;
+ if (TrgOp.isReg())
+ TrgReg = TrgOp.getReg();
+ else if (TrgOp.isImm()) {
+ warnIfNoMacro(IDLoc);
+ EmittedNoMacroWarning = true;
+
+ TrgReg = getATReg(IDLoc);
+ if (!TrgReg)
+ return true;
+
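+    // An immediate operand means this is the immediate form of the branch
+    // macro: materialise the immediate into AT and fall through to the
+    // register form of the same pseudo.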
+ switch(PseudoOpcode) {
+ default:
+ llvm_unreachable("unknown opcode for branch pseudo-instruction");
+ case Mips::BLTImmMacro:
+ PseudoOpcode = Mips::BLT;
+ break;
+ case Mips::BLEImmMacro:
+ PseudoOpcode = Mips::BLE;
+ break;
+ case Mips::BGEImmMacro:
+ PseudoOpcode = Mips::BGE;
+ break;
+ case Mips::BGTImmMacro:
+ PseudoOpcode = Mips::BGT;
+ break;
+ case Mips::BLTUImmMacro:
+ PseudoOpcode = Mips::BLTU;
+ break;
+ case Mips::BLEUImmMacro:
+ PseudoOpcode = Mips::BLEU;
+ break;
+ case Mips::BGEUImmMacro:
+ PseudoOpcode = Mips::BGEU;
+ break;
+ case Mips::BGTUImmMacro:
+ PseudoOpcode = Mips::BGTU;
+ break;
+ case Mips::BLTLImmMacro:
+ PseudoOpcode = Mips::BLTL;
+ break;
+ case Mips::BLELImmMacro:
+ PseudoOpcode = Mips::BLEL;
+ break;
+ case Mips::BGELImmMacro:
+ PseudoOpcode = Mips::BGEL;
+ break;
+ case Mips::BGTLImmMacro:
+ PseudoOpcode = Mips::BGTL;
+ break;
+ case Mips::BLTULImmMacro:
+ PseudoOpcode = Mips::BLTUL;
+ break;
+ case Mips::BLEULImmMacro:
+ PseudoOpcode = Mips::BLEUL;
+ break;
+ case Mips::BGEULImmMacro:
+ PseudoOpcode = Mips::BGEUL;
+ break;
+ case Mips::BGTULImmMacro:
+ PseudoOpcode = Mips::BGTUL;
+ break;
+ }
+
+ if (loadImmediate(TrgOp.getImm(), TrgReg, Mips::NoRegister, !isGP64bit(),
+ false, IDLoc, Instructions))
+ return true;
+ }
switch (PseudoOpcode) {
case Mips::BLT:
case Mips::BLTU:
+ case Mips::BLTL:
+ case Mips::BLTUL:
AcceptsEquality = false;
ReverseOrderSLT = false;
- IsUnsigned = (PseudoOpcode == Mips::BLTU);
+ IsUnsigned = ((PseudoOpcode == Mips::BLTU) || (PseudoOpcode == Mips::BLTUL));
+ IsLikely = ((PseudoOpcode == Mips::BLTL) || (PseudoOpcode == Mips::BLTUL));
ZeroSrcOpcode = Mips::BGTZ;
ZeroTrgOpcode = Mips::BLTZ;
break;
case Mips::BLE:
case Mips::BLEU:
+ case Mips::BLEL:
+ case Mips::BLEUL:
AcceptsEquality = true;
ReverseOrderSLT = true;
- IsUnsigned = (PseudoOpcode == Mips::BLEU);
+ IsUnsigned = ((PseudoOpcode == Mips::BLEU) || (PseudoOpcode == Mips::BLEUL));
+ IsLikely = ((PseudoOpcode == Mips::BLEL) || (PseudoOpcode == Mips::BLEUL));
ZeroSrcOpcode = Mips::BGEZ;
ZeroTrgOpcode = Mips::BLEZ;
break;
case Mips::BGE:
case Mips::BGEU:
+ case Mips::BGEL:
+ case Mips::BGEUL:
AcceptsEquality = true;
ReverseOrderSLT = false;
- IsUnsigned = (PseudoOpcode == Mips::BGEU);
+ IsUnsigned = ((PseudoOpcode == Mips::BGEU) || (PseudoOpcode == Mips::BGEUL));
+ IsLikely = ((PseudoOpcode == Mips::BGEL) || (PseudoOpcode == Mips::BGEUL));
ZeroSrcOpcode = Mips::BLEZ;
ZeroTrgOpcode = Mips::BGEZ;
break;
case Mips::BGT:
case Mips::BGTU:
+ case Mips::BGTL:
+ case Mips::BGTUL:
AcceptsEquality = false;
ReverseOrderSLT = true;
- IsUnsigned = (PseudoOpcode == Mips::BGTU);
+ IsUnsigned = ((PseudoOpcode == Mips::BGTU) || (PseudoOpcode == Mips::BGTUL));
+ IsLikely = ((PseudoOpcode == Mips::BGTL) || (PseudoOpcode == Mips::BGTUL));
ZeroSrcOpcode = Mips::BLTZ;
ZeroTrgOpcode = Mips::BGTZ;
break;
@@ -2373,7 +2783,6 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
llvm_unreachable("unknown opcode for branch pseudo-instruction");
}
- MCInst BranchInst;
bool IsTrgRegZero = (TrgReg == Mips::ZERO);
bool IsSrcRegZero = (SrcReg == Mips::ZERO);
if (IsSrcRegZero && IsTrgRegZero) {
@@ -2381,51 +2790,37 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
// with GAS' behaviour. However, they may not generate the most efficient
// code in some circumstances.
if (PseudoOpcode == Mips::BLT) {
- BranchInst.setOpcode(Mips::BLTZ);
- BranchInst.addOperand(MCOperand::createReg(Mips::ZERO));
- BranchInst.addOperand(MCOperand::createExpr(OffsetExpr));
- Instructions.push_back(BranchInst);
+ emitRX(Mips::BLTZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc,
+ Instructions);
return false;
}
if (PseudoOpcode == Mips::BLE) {
- BranchInst.setOpcode(Mips::BLEZ);
- BranchInst.addOperand(MCOperand::createReg(Mips::ZERO));
- BranchInst.addOperand(MCOperand::createExpr(OffsetExpr));
- Instructions.push_back(BranchInst);
+ emitRX(Mips::BLEZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc,
+ Instructions);
Warning(IDLoc, "branch is always taken");
return false;
}
if (PseudoOpcode == Mips::BGE) {
- BranchInst.setOpcode(Mips::BGEZ);
- BranchInst.addOperand(MCOperand::createReg(Mips::ZERO));
- BranchInst.addOperand(MCOperand::createExpr(OffsetExpr));
- Instructions.push_back(BranchInst);
+ emitRX(Mips::BGEZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc,
+ Instructions);
Warning(IDLoc, "branch is always taken");
return false;
}
if (PseudoOpcode == Mips::BGT) {
- BranchInst.setOpcode(Mips::BGTZ);
- BranchInst.addOperand(MCOperand::createReg(Mips::ZERO));
- BranchInst.addOperand(MCOperand::createExpr(OffsetExpr));
- Instructions.push_back(BranchInst);
+ emitRX(Mips::BGTZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc,
+ Instructions);
return false;
}
if (PseudoOpcode == Mips::BGTU) {
- BranchInst.setOpcode(Mips::BNE);
- BranchInst.addOperand(MCOperand::createReg(Mips::ZERO));
- BranchInst.addOperand(MCOperand::createReg(Mips::ZERO));
- BranchInst.addOperand(MCOperand::createExpr(OffsetExpr));
- Instructions.push_back(BranchInst);
+ emitRRX(Mips::BNE, Mips::ZERO, Mips::ZERO,
+ MCOperand::createExpr(OffsetExpr), IDLoc, Instructions);
return false;
}
if (AcceptsEquality) {
// If both registers are $0 and the pseudo-branch accepts equality, it
// will always be taken, so we emit an unconditional branch.
- BranchInst.setOpcode(Mips::BEQ);
- BranchInst.addOperand(MCOperand::createReg(Mips::ZERO));
- BranchInst.addOperand(MCOperand::createReg(Mips::ZERO));
- BranchInst.addOperand(MCOperand::createExpr(OffsetExpr));
- Instructions.push_back(BranchInst);
+ emitRRX(Mips::BEQ, Mips::ZERO, Mips::ZERO,
+ MCOperand::createExpr(OffsetExpr), IDLoc, Instructions);
Warning(IDLoc, "branch is always taken");
return false;
}
@@ -2449,11 +2844,8 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
// the pseudo-branch will always be taken, so we emit an unconditional
// branch.
// This only applies to unsigned pseudo-branches.
- BranchInst.setOpcode(Mips::BEQ);
- BranchInst.addOperand(MCOperand::createReg(Mips::ZERO));
- BranchInst.addOperand(MCOperand::createReg(Mips::ZERO));
- BranchInst.addOperand(MCOperand::createExpr(OffsetExpr));
- Instructions.push_back(BranchInst);
+ emitRRX(Mips::BEQ, Mips::ZERO, Mips::ZERO,
+ MCOperand::createExpr(OffsetExpr), IDLoc, Instructions);
Warning(IDLoc, "branch is always taken");
return false;
}
@@ -2470,21 +2862,17 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
//
// Because only BLEU and BGEU branch on equality, we can use the
// AcceptsEquality variable to decide when to emit the BEQZ.
- BranchInst.setOpcode(AcceptsEquality ? Mips::BEQ : Mips::BNE);
- BranchInst.addOperand(
- MCOperand::createReg(IsSrcRegZero ? TrgReg : SrcReg));
- BranchInst.addOperand(MCOperand::createReg(Mips::ZERO));
- BranchInst.addOperand(MCOperand::createExpr(OffsetExpr));
- Instructions.push_back(BranchInst);
+ emitRRX(AcceptsEquality ? Mips::BEQ : Mips::BNE,
+ IsSrcRegZero ? TrgReg : SrcReg, Mips::ZERO,
+ MCOperand::createExpr(OffsetExpr), IDLoc, Instructions);
return false;
}
// If we have a signed pseudo-branch and one of the registers is $0,
// we can use an appropriate compare-to-zero branch. We select which one
// to use in the switch statement above.
- BranchInst.setOpcode(IsSrcRegZero ? ZeroSrcOpcode : ZeroTrgOpcode);
- BranchInst.addOperand(MCOperand::createReg(IsSrcRegZero ? TrgReg : SrcReg));
- BranchInst.addOperand(MCOperand::createExpr(OffsetExpr));
- Instructions.push_back(BranchInst);
+ emitRX(IsSrcRegZero ? ZeroSrcOpcode : ZeroTrgOpcode,
+ IsSrcRegZero ? TrgReg : SrcReg, MCOperand::createExpr(OffsetExpr),
+ IDLoc, Instructions);
return false;
}
@@ -2494,7 +2882,8 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
if (!ATRegNum)
return true;
- warnIfNoMacro(IDLoc);
+ if (!EmittedNoMacroWarning)
+ warnIfNoMacro(IDLoc);
// SLT fits well with 2 of our 4 pseudo-branches:
// BLT, where $rs < $rt, translates into "slt $at, $rs, $rt" and
@@ -2511,23 +2900,135 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
//
// The same applies to the unsigned variants, except that SLTu is used
// instead of SLT.
- MCInst SetInst;
- SetInst.setOpcode(IsUnsigned ? Mips::SLTu : Mips::SLT);
- SetInst.addOperand(MCOperand::createReg(ATRegNum));
- SetInst.addOperand(MCOperand::createReg(ReverseOrderSLT ? TrgReg : SrcReg));
- SetInst.addOperand(MCOperand::createReg(ReverseOrderSLT ? SrcReg : TrgReg));
- Instructions.push_back(SetInst);
-
- BranchInst.setOpcode(AcceptsEquality ? Mips::BEQ : Mips::BNE);
- BranchInst.addOperand(MCOperand::createReg(ATRegNum));
- BranchInst.addOperand(MCOperand::createReg(Mips::ZERO));
- BranchInst.addOperand(MCOperand::createExpr(OffsetExpr));
- Instructions.push_back(BranchInst);
+ emitRRR(IsUnsigned ? Mips::SLTu : Mips::SLT, ATRegNum,
+ ReverseOrderSLT ? TrgReg : SrcReg, ReverseOrderSLT ? SrcReg : TrgReg,
+ IDLoc, Instructions);
+
+ emitRRX(IsLikely ? (AcceptsEquality ? Mips::BEQL : Mips::BNEL)
+ : (AcceptsEquality ? Mips::BEQ : Mips::BNE),
+ ATRegNum, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc,
+ Instructions);
return false;
}
-bool MipsAsmParser::expandUlhu(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
+bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions,
+ const bool IsMips64, const bool Signed) {
+ if (hasMips32r6()) {
+ Error(IDLoc, "instruction not supported on mips32r6 or mips64r6");
+ return false;
+ }
+
+ warnIfNoMacro(IDLoc);
+
+ const MCOperand &RsRegOp = Inst.getOperand(0);
+ assert(RsRegOp.isReg() && "expected register operand kind");
+ unsigned RsReg = RsRegOp.getReg();
+
+ const MCOperand &RtRegOp = Inst.getOperand(1);
+ assert(RtRegOp.isReg() && "expected register operand kind");
+ unsigned RtReg = RtRegOp.getReg();
+ unsigned DivOp;
+ unsigned ZeroReg;
+
+ if (IsMips64) {
+ DivOp = Signed ? Mips::DSDIV : Mips::DUDIV;
+ ZeroReg = Mips::ZERO_64;
+ } else {
+ DivOp = Signed ? Mips::SDIV : Mips::UDIV;
+ ZeroReg = Mips::ZERO;
+ }
+
+ bool UseTraps = useTraps();
+
+ if (RsReg == Mips::ZERO || RsReg == Mips::ZERO_64) {
+ if (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64)
+ Warning(IDLoc, "dividing zero by zero");
+ if (IsMips64) {
+ if (Signed && (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64)) {
+ if (UseTraps) {
+ emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, Instructions);
+ return false;
+ }
+
+ emitII(Mips::BREAK, 0x7, 0, IDLoc, Instructions);
+ return false;
+ }
+ } else {
+ emitRR(DivOp, RsReg, RtReg, IDLoc, Instructions);
+ return false;
+ }
+ }
+
+ if (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64) {
+ Warning(IDLoc, "division by zero");
+ if (Signed) {
+ if (UseTraps) {
+ emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, Instructions);
+ return false;
+ }
+
+ emitII(Mips::BREAK, 0x7, 0, IDLoc, Instructions);
+ return false;
+ }
+ }
+
+ // FIXME: The values for these two BranchTarget variables may be different in
+ // micromips. These magic numbers need to be removed.
+ unsigned BranchTargetNoTraps;
+ unsigned BranchTarget;
+
+ if (UseTraps) {
+ BranchTarget = IsMips64 ? 12 : 8;
+ emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, Instructions);
+ } else {
+ BranchTarget = IsMips64 ? 20 : 16;
+ BranchTargetNoTraps = 8;
+ // Branch to the li instruction.
+ emitRRI(Mips::BNE, RtReg, ZeroReg, BranchTargetNoTraps, IDLoc,
+ Instructions);
+ }
+
+ emitRR(DivOp, RsReg, RtReg, IDLoc, Instructions);
+
+ if (!UseTraps)
+ emitII(Mips::BREAK, 0x7, 0, IDLoc, Instructions);
+
+ if (!Signed) {
+ emitR(Mips::MFLO, RsReg, IDLoc, Instructions);
+ return false;
+ }
+
+ unsigned ATReg = getATReg(IDLoc);
+ if (!ATReg)
+ return true;
+
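+  // Check the remaining overflow case for signed division: dividing the
+  // minimum value by -1. Compare the divisor against -1 and, if it matches,
+  // compare the dividend against the minimum value, trapping or breaking
+  // with code 6 on overflow.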
+ emitRRI(Mips::ADDiu, ATReg, ZeroReg, -1, IDLoc, Instructions);
+ if (IsMips64) {
+ // Branch to the mflo instruction.
+ emitRRI(Mips::BNE, RtReg, ATReg, BranchTarget, IDLoc, Instructions);
+ emitRRI(Mips::ADDiu, ATReg, ZeroReg, 1, IDLoc, Instructions);
+ emitRRI(Mips::DSLL32, ATReg, ATReg, 0x1f, IDLoc, Instructions);
+ } else {
+ // Branch to the mflo instruction.
+ emitRRI(Mips::BNE, RtReg, ATReg, BranchTarget, IDLoc, Instructions);
+ emitRI(Mips::LUi, ATReg, (uint16_t)0x8000, IDLoc, Instructions);
+ }
+
+ if (UseTraps)
+ emitRRI(Mips::TEQ, RsReg, ATReg, 0x6, IDLoc, Instructions);
+ else {
+ // Branch to the mflo instruction.
+ emitRRI(Mips::BNE, RsReg, ATReg, BranchTargetNoTraps, IDLoc, Instructions);
+ emitRRI(Mips::SLL, ZeroReg, ZeroReg, 0, IDLoc, Instructions);
+ emitII(Mips::BREAK, 0x6, 0, IDLoc, Instructions);
+ }
+ emitR(Mips::MFLO, RsReg, IDLoc, Instructions);
+ return false;
+}
+
+bool MipsAsmParser::expandUlh(MCInst &Inst, bool Signed, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions) {
if (hasMips32r6() || hasMips64r6()) {
Error(IDLoc, "instruction not supported on mips32r6 or mips64r6");
return false;
@@ -2562,7 +3063,7 @@ bool MipsAsmParser::expandUlhu(MCInst &Inst, SMLoc IDLoc,
LoadedOffsetInAT = true;
if (loadImmediate(OffsetValue, ATReg, Mips::NoRegister, !ABI.ArePtrs64bit(),
- IDLoc, Instructions))
+ true, IDLoc, Instructions))
return true;
// NOTE: We do this (D)ADDu here instead of doing it in loadImmediate()
@@ -2590,33 +3091,15 @@ bool MipsAsmParser::expandUlhu(MCInst &Inst, SMLoc IDLoc,
unsigned SllReg = LoadedOffsetInAT ? DstReg : ATReg;
- MCInst TmpInst;
- TmpInst.setOpcode(Mips::LBu);
- TmpInst.addOperand(MCOperand::createReg(FirstLbuDstReg));
- TmpInst.addOperand(MCOperand::createReg(LbuSrcReg));
- TmpInst.addOperand(MCOperand::createImm(FirstLbuOffset));
- Instructions.push_back(TmpInst);
-
- TmpInst.clear();
- TmpInst.setOpcode(Mips::LBu);
- TmpInst.addOperand(MCOperand::createReg(SecondLbuDstReg));
- TmpInst.addOperand(MCOperand::createReg(LbuSrcReg));
- TmpInst.addOperand(MCOperand::createImm(SecondLbuOffset));
- Instructions.push_back(TmpInst);
-
- TmpInst.clear();
- TmpInst.setOpcode(Mips::SLL);
- TmpInst.addOperand(MCOperand::createReg(SllReg));
- TmpInst.addOperand(MCOperand::createReg(SllReg));
- TmpInst.addOperand(MCOperand::createImm(8));
- Instructions.push_back(TmpInst);
-
- TmpInst.clear();
- TmpInst.setOpcode(Mips::OR);
- TmpInst.addOperand(MCOperand::createReg(DstReg));
- TmpInst.addOperand(MCOperand::createReg(DstReg));
- TmpInst.addOperand(MCOperand::createReg(ATReg));
- Instructions.push_back(TmpInst);
+ emitRRI(Signed ? Mips::LB : Mips::LBu, FirstLbuDstReg, LbuSrcReg,
+ FirstLbuOffset, IDLoc, Instructions);
+
+ emitRRI(Mips::LBu, SecondLbuDstReg, LbuSrcReg, SecondLbuOffset, IDLoc,
+ Instructions);
+
+ emitRRI(Mips::SLL, SllReg, SllReg, 8, IDLoc, Instructions);
+
+ emitRRR(Mips::OR, DstReg, DstReg, ATReg, IDLoc, Instructions);
return false;
}
@@ -2654,7 +3137,7 @@ bool MipsAsmParser::expandUlw(MCInst &Inst, SMLoc IDLoc,
warnIfNoMacro(IDLoc);
if (loadImmediate(OffsetValue, ATReg, Mips::NoRegister, !ABI.ArePtrs64bit(),
- IDLoc, Instructions))
+ true, IDLoc, Instructions))
return true;
// NOTE: We do this (D)ADDu here instead of doing it in loadImmediate()
@@ -2677,37 +3160,373 @@ bool MipsAsmParser::expandUlw(MCInst &Inst, SMLoc IDLoc,
RightLoadOffset = LoadedOffsetInAT ? 3 : (OffsetValue + 3);
}
- MCInst LeftLoadInst;
- LeftLoadInst.setOpcode(Mips::LWL);
- LeftLoadInst.addOperand(DstRegOp);
- LeftLoadInst.addOperand(MCOperand::createReg(FinalSrcReg));
- LeftLoadInst.addOperand(MCOperand::createImm(LeftLoadOffset));
- Instructions.push_back(LeftLoadInst);
+ emitRRI(Mips::LWL, DstRegOp.getReg(), FinalSrcReg, LeftLoadOffset, IDLoc,
+ Instructions);
- MCInst RightLoadInst;
- RightLoadInst.setOpcode(Mips::LWR);
- RightLoadInst.addOperand(DstRegOp);
- RightLoadInst.addOperand(MCOperand::createReg(FinalSrcReg));
- RightLoadInst.addOperand(MCOperand::createImm(RightLoadOffset ));
- Instructions.push_back(RightLoadInst);
+ emitRRI(Mips::LWR, DstRegOp.getReg(), FinalSrcReg, RightLoadOffset, IDLoc,
+ Instructions);
return false;
}
+bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions) {
+
+  assert(Inst.getNumOperands() == 3 && "Invalid operand count");
+  assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg() &&
+         Inst.getOperand(2).isImm() && "Invalid instruction operand.");
+
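+  // The immediate does not fit the real instruction's encoding, so load it
+  // into a register (using AT when the destination is also the source) and
+  // use the three-register form of the operation instead.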
+ unsigned ATReg = Mips::NoRegister;
+ unsigned FinalDstReg = Mips::NoRegister;
+ unsigned DstReg = Inst.getOperand(0).getReg();
+ unsigned SrcReg = Inst.getOperand(1).getReg();
+ int64_t ImmValue = Inst.getOperand(2).getImm();
+
+ bool Is32Bit = isInt<32>(ImmValue) || isUInt<32>(ImmValue);
+
+ unsigned FinalOpcode = Inst.getOpcode();
+
+ if (DstReg == SrcReg) {
+ ATReg = getATReg(Inst.getLoc());
+ if (!ATReg)
+ return true;
+ FinalDstReg = DstReg;
+ DstReg = ATReg;
+ }
+
+  if (!loadImmediate(ImmValue, DstReg, Mips::NoRegister, Is32Bit, false,
+                     Inst.getLoc(), Instructions)) {
+ switch (FinalOpcode) {
+ default:
+ llvm_unreachable("unimplemented expansion");
+ case (Mips::ADDi):
+ FinalOpcode = Mips::ADD;
+ break;
+ case (Mips::ADDiu):
+ FinalOpcode = Mips::ADDu;
+ break;
+ case (Mips::ANDi):
+ FinalOpcode = Mips::AND;
+ break;
+ case (Mips::NORImm):
+ FinalOpcode = Mips::NOR;
+ break;
+ case (Mips::ORi):
+ FinalOpcode = Mips::OR;
+ break;
+ case (Mips::SLTi):
+ FinalOpcode = Mips::SLT;
+ break;
+ case (Mips::SLTiu):
+ FinalOpcode = Mips::SLTu;
+ break;
+ case (Mips::XORi):
+ FinalOpcode = Mips::XOR;
+ break;
+ }
+
+ if (FinalDstReg == Mips::NoRegister)
+ emitRRR(FinalOpcode, DstReg, DstReg, SrcReg, IDLoc, Instructions);
+ else
+ emitRRR(FinalOpcode, FinalDstReg, FinalDstReg, DstReg, IDLoc,
+ Instructions);
+ return false;
+ }
+ return true;
+}
+
+bool MipsAsmParser::expandRotation(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions) {
+ unsigned ATReg = Mips::NoRegister;
+ unsigned DReg = Inst.getOperand(0).getReg();
+ unsigned SReg = Inst.getOperand(1).getReg();
+ unsigned TReg = Inst.getOperand(2).getReg();
+ unsigned TmpReg = DReg;
+
+ unsigned FirstShift = Mips::NOP;
+ unsigned SecondShift = Mips::NOP;
+
+ if (hasMips32r2()) {
+
+ if (DReg == SReg) {
+ TmpReg = getATReg(Inst.getLoc());
+ if (!TmpReg)
+ return true;
+ }
+
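+    // With MIPS32r2 a left rotate is a right rotate by the negated amount
+    // (rotrv only rotates right), and a right rotate maps directly to rotrv.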
+ if (Inst.getOpcode() == Mips::ROL) {
+ emitRRR(Mips::SUBu, TmpReg, Mips::ZERO, TReg, Inst.getLoc(), Instructions);
+ emitRRR(Mips::ROTRV, DReg, SReg, TmpReg, Inst.getLoc(), Instructions);
+ return false;
+ }
+
+ if (Inst.getOpcode() == Mips::ROR) {
+ emitRRR(Mips::ROTRV, DReg, SReg, TReg, Inst.getLoc(), Instructions);
+ return false;
+ }
+
+ return true;
+ }
+
+ if (hasMips32()) {
+
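+    // Without a rotate instruction, synthesise the rotate from two variable
+    // shifts: shift one copy by the rotate amount, the other by its negation
+    // (variable shifts only use the low 5 bits), and OR the results.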
+ switch (Inst.getOpcode()) {
+ default:
+ llvm_unreachable("unexpected instruction opcode");
+ case Mips::ROL:
+ FirstShift = Mips::SRLV;
+ SecondShift = Mips::SLLV;
+ break;
+ case Mips::ROR:
+ FirstShift = Mips::SLLV;
+ SecondShift = Mips::SRLV;
+ break;
+ }
+
+ ATReg = getATReg(Inst.getLoc());
+ if (!ATReg)
+ return true;
+
+ emitRRR(Mips::SUBu, ATReg, Mips::ZERO, TReg, Inst.getLoc(), Instructions);
+ emitRRR(FirstShift, ATReg, SReg, ATReg, Inst.getLoc(), Instructions);
+ emitRRR(SecondShift, DReg, SReg, TReg, Inst.getLoc(), Instructions);
+ emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), Instructions);
+
+ return false;
+ }
+
+ return true;
+}
+
+bool MipsAsmParser::expandRotationImm(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions) {
+
+ unsigned ATReg = Mips::NoRegister;
+ unsigned DReg = Inst.getOperand(0).getReg();
+ unsigned SReg = Inst.getOperand(1).getReg();
+ int64_t ImmValue = Inst.getOperand(2).getImm();
+
+ unsigned FirstShift = Mips::NOP;
+ unsigned SecondShift = Mips::NOP;
+
+ if (hasMips32r2()) {
+
+ if (Inst.getOpcode() == Mips::ROLImm) {
+ uint64_t MaxShift = 32;
+ uint64_t ShiftValue = ImmValue;
+ if (ImmValue != 0)
+ ShiftValue = MaxShift - ImmValue;
+ emitRRI(Mips::ROTR, DReg, SReg, ShiftValue, Inst.getLoc(), Instructions);
+ return false;
+ }
+
+ if (Inst.getOpcode() == Mips::RORImm) {
+ emitRRI(Mips::ROTR, DReg, SReg, ImmValue, Inst.getLoc(), Instructions);
+ return false;
+ }
+
+ return true;
+ }
+
+ if (hasMips32()) {
+
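+    // Constant rotate without a rotate instruction: one shift by the amount,
+    // a shift the other way by (32 - amount), and an OR of the two.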
+ if (ImmValue == 0) {
+ emitRRI(Mips::SRL, DReg, SReg, 0, Inst.getLoc(), Instructions);
+ return false;
+ }
+
+ switch (Inst.getOpcode()) {
+ default:
+ llvm_unreachable("unexpected instruction opcode");
+ case Mips::ROLImm:
+ FirstShift = Mips::SLL;
+ SecondShift = Mips::SRL;
+ break;
+ case Mips::RORImm:
+ FirstShift = Mips::SRL;
+ SecondShift = Mips::SLL;
+ break;
+ }
+
+ ATReg = getATReg(Inst.getLoc());
+ if (!ATReg)
+ return true;
+
+ emitRRI(FirstShift, ATReg, SReg, ImmValue, Inst.getLoc(), Instructions);
+ emitRRI(SecondShift, DReg, SReg, 32 - ImmValue, Inst.getLoc(), Instructions);
+ emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), Instructions);
+
+ return false;
+ }
+
+ return true;
+}
+
+bool MipsAsmParser::expandDRotation(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions) {
+
+ unsigned ATReg = Mips::NoRegister;
+ unsigned DReg = Inst.getOperand(0).getReg();
+ unsigned SReg = Inst.getOperand(1).getReg();
+ unsigned TReg = Inst.getOperand(2).getReg();
+ unsigned TmpReg = DReg;
+
+ unsigned FirstShift = Mips::NOP;
+ unsigned SecondShift = Mips::NOP;
+
+ if (hasMips64r2()) {
+
+ if (TmpReg == SReg) {
+ TmpReg = getATReg(Inst.getLoc());
+ if (!TmpReg)
+ return true;
+ }
+
+ if (Inst.getOpcode() == Mips::DROL) {
+ emitRRR(Mips::DSUBu, TmpReg, Mips::ZERO, TReg, Inst.getLoc(), Instructions);
+ emitRRR(Mips::DROTRV, DReg, SReg, TmpReg, Inst.getLoc(), Instructions);
+ return false;
+ }
+
+ if (Inst.getOpcode() == Mips::DROR) {
+ emitRRR(Mips::DROTRV, DReg, SReg, TReg, Inst.getLoc(), Instructions);
+ return false;
+ }
+
+ return true;
+ }
+
+ if (hasMips64()) {
+
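+    // Same approach as the 32-bit case: synthesise the rotate from two
+    // variable shifts and an OR.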
+ switch (Inst.getOpcode()) {
+ default:
+ llvm_unreachable("unexpected instruction opcode");
+ case Mips::DROL:
+ FirstShift = Mips::DSRLV;
+ SecondShift = Mips::DSLLV;
+ break;
+ case Mips::DROR:
+ FirstShift = Mips::DSLLV;
+ SecondShift = Mips::DSRLV;
+ break;
+ }
+
+ ATReg = getATReg(Inst.getLoc());
+ if (!ATReg)
+ return true;
+
+ emitRRR(Mips::DSUBu, ATReg, Mips::ZERO, TReg, Inst.getLoc(), Instructions);
+ emitRRR(FirstShift, ATReg, SReg, ATReg, Inst.getLoc(), Instructions);
+ emitRRR(SecondShift, DReg, SReg, TReg, Inst.getLoc(), Instructions);
+ emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), Instructions);
+
+ return false;
+ }
+
+ return true;
+}
+
+bool MipsAsmParser::expandDRotationImm(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions) {
+
+ unsigned ATReg = Mips::NoRegister;
+ unsigned DReg = Inst.getOperand(0).getReg();
+ unsigned SReg = Inst.getOperand(1).getReg();
+ int64_t ImmValue = Inst.getOperand(2).getImm() % 64;
+
+ unsigned FirstShift = Mips::NOP;
+ unsigned SecondShift = Mips::NOP;
+
+ MCInst TmpInst;
+
+ if (hasMips64r2()) {
+
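+    // With MIPS64r2 a constant rotate maps onto a single drotr or drotr32:
+    // left rotates are converted to the equivalent right rotate, and drotr32
+    // is used when the final rotate amount is 32 or more.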
+ unsigned FinalOpcode = Mips::NOP;
+ if (ImmValue == 0)
+ FinalOpcode = Mips::DROTR;
+ else if (ImmValue % 32 == 0)
+ FinalOpcode = Mips::DROTR32;
+ else if ((ImmValue >= 1) && (ImmValue <= 32)) {
+ if (Inst.getOpcode() == Mips::DROLImm)
+ FinalOpcode = Mips::DROTR32;
+ else
+ FinalOpcode = Mips::DROTR;
+ } else if (ImmValue >= 33) {
+ if (Inst.getOpcode() == Mips::DROLImm)
+ FinalOpcode = Mips::DROTR;
+ else
+ FinalOpcode = Mips::DROTR32;
+ }
+
+ uint64_t ShiftValue = ImmValue % 32;
+ if (Inst.getOpcode() == Mips::DROLImm)
+ ShiftValue = (32 - ImmValue % 32) % 32;
+
+ emitRRI(FinalOpcode, DReg, SReg, ShiftValue, Inst.getLoc(), Instructions);
+
+ return false;
+ }
+
+ if (hasMips64()) {
+
+ if (ImmValue == 0) {
+ emitRRI(Mips::DSRL, DReg, SReg, 0, Inst.getLoc(), Instructions);
+ return false;
+ }
+
+ switch (Inst.getOpcode()) {
+ default:
+ llvm_unreachable("unexpected instruction opcode");
+ case Mips::DROLImm:
+ if ((ImmValue >= 1) && (ImmValue <= 31)) {
+ FirstShift = Mips::DSLL;
+ SecondShift = Mips::DSRL32;
+ }
+ if (ImmValue == 32) {
+ FirstShift = Mips::DSLL32;
+ SecondShift = Mips::DSRL32;
+ }
+ if ((ImmValue >= 33) && (ImmValue <= 63)) {
+ FirstShift = Mips::DSLL32;
+ SecondShift = Mips::DSRL;
+ }
+ break;
+ case Mips::DRORImm:
+ if ((ImmValue >= 1) && (ImmValue <= 31)) {
+ FirstShift = Mips::DSRL;
+ SecondShift = Mips::DSLL32;
+ }
+ if (ImmValue == 32) {
+ FirstShift = Mips::DSRL32;
+ SecondShift = Mips::DSLL32;
+ }
+ if ((ImmValue >= 33) && (ImmValue <= 63)) {
+ FirstShift = Mips::DSRL32;
+ SecondShift = Mips::DSLL;
+ }
+ break;
+ }
+
+ ATReg = getATReg(Inst.getLoc());
+ if (!ATReg)
+ return true;
+
+ emitRRI(FirstShift, ATReg, SReg, ImmValue % 32, Inst.getLoc(), Instructions);
+ emitRRI(SecondShift, DReg, SReg, (32 - ImmValue % 32) % 32, Inst.getLoc(), Instructions);
+ emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), Instructions);
+
+ return false;
+ }
+
+ return true;
+}
+
void MipsAsmParser::createNop(bool hasShortDelaySlot, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions) {
- MCInst NopInst;
- if (hasShortDelaySlot) {
- NopInst.setOpcode(Mips::MOVE16_MM);
- NopInst.addOperand(MCOperand::createReg(Mips::ZERO));
- NopInst.addOperand(MCOperand::createReg(Mips::ZERO));
- } else {
- NopInst.setOpcode(Mips::SLL);
- NopInst.addOperand(MCOperand::createReg(Mips::ZERO));
- NopInst.addOperand(MCOperand::createReg(Mips::ZERO));
- NopInst.addOperand(MCOperand::createImm(0));
- }
- Instructions.push_back(NopInst);
+ if (hasShortDelaySlot)
+ emitRR(Mips::MOVE16_MM, Mips::ZERO, Mips::ZERO, IDLoc, Instructions);
+ else
+ emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, Instructions);
}
void MipsAsmParser::createAddu(unsigned DstReg, unsigned SrcReg,
@@ -2717,6 +3536,24 @@ void MipsAsmParser::createAddu(unsigned DstReg, unsigned SrcReg,
Instructions);
}
+void MipsAsmParser::createCpRestoreMemOp(
+ bool IsLoad, int StackOffset, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions) {
+ // If the offset cannot fit into 16 bits, we need to expand.
+ if (!isInt<16>(StackOffset)) {
+ MCInst MemInst;
+ MemInst.setOpcode(IsLoad ? Mips::LW : Mips::SW);
+ MemInst.addOperand(MCOperand::createReg(Mips::GP));
+ MemInst.addOperand(MCOperand::createReg(Mips::SP));
+ MemInst.addOperand(MCOperand::createImm(StackOffset));
+ expandMemInst(MemInst, IDLoc, Instructions, IsLoad, true /*HasImmOpnd*/);
+ return;
+ }
+
+ emitRRI(IsLoad ? Mips::LW : Mips::SW, Mips::GP, Mips::SP, StackOffset, IDLoc,
+ Instructions);
+}
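LW and SW encode a signed 16-bit offset, which is why larger .cprestore offsets are routed through the macro-expansion path above rather than emitted directly. A trivial standalone sketch of the same bounds check that isInt<16> performs:

#include <cstdint>

// The offset must fit in a signed halfword to be encoded in LW/SW directly.
static inline bool fitsInSigned16(int64_t Offset) {
  return Offset >= -32768 && Offset <= 32767;
}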
+
unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
// As described by the Mips32r2 spec, the registers Rd and Rs for
// jalr.hb must be different.
@@ -2729,6 +3566,17 @@ unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
return Match_Success;
}
+static SMLoc RefineErrorLoc(const SMLoc Loc, const OperandVector &Operands,
+ uint64_t ErrorInfo) {
+ if (ErrorInfo != ~0ULL && ErrorInfo < Operands.size()) {
+ SMLoc ErrorLoc = Operands[ErrorInfo]->getStartLoc();
+ if (ErrorLoc == SMLoc())
+ return Loc;
+ return ErrorLoc;
+ }
+ return Loc;
+}
+
bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out,
@@ -2745,7 +3593,7 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
if (processInstruction(Inst, IDLoc, Instructions))
return true;
for (unsigned i = 0; i < Instructions.size(); i++)
- Out.EmitInstruction(Instructions[i], STI);
+ Out.EmitInstruction(Instructions[i], getSTI());
return false;
}
case Match_MissingFeature:
@@ -2757,7 +3605,7 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
if (ErrorInfo >= Operands.size())
return Error(IDLoc, "too few operands for instruction");
- ErrorLoc = ((MipsOperand &)*Operands[ErrorInfo]).getStartLoc();
+ ErrorLoc = Operands[ErrorInfo]->getStartLoc();
if (ErrorLoc == SMLoc())
ErrorLoc = IDLoc;
}
@@ -2768,6 +3616,58 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return Error(IDLoc, "invalid instruction");
case Match_RequiresDifferentSrcAndDst:
return Error(IDLoc, "source and destination must be different");
+ case Match_Immz:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected '0'");
+ case Match_UImm1_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 1-bit unsigned immediate");
+ case Match_UImm2_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 2-bit unsigned immediate");
+ case Match_UImm2_1:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected immediate in range 1 .. 4");
+ case Match_UImm3_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 3-bit unsigned immediate");
+ case Match_UImm4_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 4-bit unsigned immediate");
+ case Match_UImm5_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 5-bit unsigned immediate");
+ case Match_UImm5_1:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected immediate in range 1 .. 32");
+ case Match_UImm5_32:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected immediate in range 32 .. 63");
+ case Match_UImm5_33:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected immediate in range 33 .. 64");
+ case Match_UImm5_0_Report_UImm6:
+ // This is used on UImm5 operands that have a corresponding UImm5_32
+ // operand to avoid confusing the user.
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 6-bit unsigned immediate");
+ case Match_UImm5_Lsl2:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected both 7-bit unsigned immediate and multiple of 4");
+ case Match_UImm6_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 6-bit unsigned immediate");
+ case Match_SImm6:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 6-bit signed immediate");
+ case Match_UImm7_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 7-bit unsigned immediate");
+ case Match_UImm8_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 8-bit unsigned immediate");
+ case Match_UImm10_0:
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+ "expected 10-bit unsigned immediate");
}
llvm_unreachable("Implement any new match types added!");
@@ -3264,7 +4164,7 @@ MipsAsmParser::parseMemOperand(OperandVector &Operands) {
const AsmToken &Tok = Parser.getTok(); // Get the next token.
if (Tok.isNot(AsmToken::LParen)) {
MipsOperand &Mnemonic = static_cast<MipsOperand &>(*Operands[0]);
- if (Mnemonic.getToken() == "la") {
+ if (Mnemonic.getToken() == "la" || Mnemonic.getToken() == "dla") {
SMLoc E =
SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
Operands.push_back(MipsOperand::CreateImm(IdVal, S, E, *this));
@@ -3598,12 +4498,15 @@ MipsAsmParser::parseRegisterList(OperandVector &Operands) {
if (RegRange) {
// Remove last register operand because registers from register range
// should be inserted first.
- if (RegNo == Mips::RA) {
+ if ((isGP64bit() && RegNo == Mips::RA_64) ||
+ (!isGP64bit() && RegNo == Mips::RA)) {
Regs.push_back(RegNo);
} else {
unsigned TmpReg = PrevReg + 1;
while (TmpReg <= RegNo) {
- if ((TmpReg < Mips::S0) || (TmpReg > Mips::S7)) {
+ if ((((TmpReg < Mips::S0) || (TmpReg > Mips::S7)) && !isGP64bit()) ||
+ (((TmpReg < Mips::S0_64) || (TmpReg > Mips::S7_64)) &&
+ isGP64bit())) {
Error(E, "invalid register operand");
return MatchOperand_ParseFail;
}
@@ -3615,16 +4518,23 @@ MipsAsmParser::parseRegisterList(OperandVector &Operands) {
RegRange = false;
} else {
- if ((PrevReg == Mips::NoRegister) && (RegNo != Mips::S0) &&
- (RegNo != Mips::RA)) {
+ if ((PrevReg == Mips::NoRegister) &&
+ ((isGP64bit() && (RegNo != Mips::S0_64) && (RegNo != Mips::RA_64)) ||
+ (!isGP64bit() && (RegNo != Mips::S0) && (RegNo != Mips::RA)))) {
Error(E, "$16 or $31 expected");
return MatchOperand_ParseFail;
- } else if (((RegNo < Mips::S0) || (RegNo > Mips::S7)) &&
- (RegNo != Mips::FP) && (RegNo != Mips::RA)) {
+ } else if (!(((RegNo == Mips::FP || RegNo == Mips::RA ||
+ (RegNo >= Mips::S0 && RegNo <= Mips::S7)) &&
+ !isGP64bit()) ||
+ ((RegNo == Mips::FP_64 || RegNo == Mips::RA_64 ||
+ (RegNo >= Mips::S0_64 && RegNo <= Mips::S7_64)) &&
+ isGP64bit()))) {
Error(E, "invalid register operand");
return MatchOperand_ParseFail;
} else if ((PrevReg != Mips::NoRegister) && (RegNo != PrevReg + 1) &&
- (RegNo != Mips::FP) && (RegNo != Mips::RA)) {
+ ((RegNo != Mips::FP && RegNo != Mips::RA && !isGP64bit()) ||
+ (RegNo != Mips::FP_64 && RegNo != Mips::RA_64 &&
+ isGP64bit()))) {
Error(E, "consecutive register numbers expected");
return MatchOperand_ParseFail;
}
@@ -4152,6 +5062,7 @@ bool MipsAsmParser::parseSetPopDirective() {
if (AssemblerOptions.size() == 2)
return reportParseError(Loc, ".set pop with no .set push");
+ MCSubtargetInfo &STI = copySTI();
AssemblerOptions.pop_back();
setAvailableFeatures(
ComputeAvailableFeatures(AssemblerOptions.back()->getFeatures()));
@@ -4225,6 +5136,7 @@ bool MipsAsmParser::parseSetMips0Directive() {
return reportParseError("unexpected token, expected end of statement");
// Reset assembler options to their initial values.
+ MCSubtargetInfo &STI = copySTI();
setAvailableFeatures(
ComputeAvailableFeatures(AssemblerOptions.front()->getFeatures()));
STI.setFeatureBits(AssemblerOptions.front()->getFeatures());
@@ -4366,6 +5278,14 @@ bool MipsAsmParser::eatComma(StringRef ErrorStr) {
return true;
}
+// Used to determine if .cpload, .cprestore, and .cpsetup have any effect.
+// In this class, it is only used for .cprestore.
+// FIXME: Only keep track of IsPicEnabled in one place, instead of in both
+// MipsTargetELFStreamer and MipsAsmParser.
+bool MipsAsmParser::isPicAndNotNxxAbi() {
+ return inPicMode() && !(isABI_N32() || isABI_N64());
+}
+
bool MipsAsmParser::parseDirectiveCpLoad(SMLoc Loc) {
if (AssemblerOptions.back()->isReorder())
Warning(Loc, ".cpload should be inside a noreorder section");
@@ -4398,6 +5318,54 @@ bool MipsAsmParser::parseDirectiveCpLoad(SMLoc Loc) {
return false;
}
+bool MipsAsmParser::parseDirectiveCpRestore(SMLoc Loc) {
+ MCAsmParser &Parser = getParser();
+
+ // Note that .cprestore is ignored if used with the N32 and N64 ABIs or if it
+ // is used in non-PIC mode.
+
+ if (inMips16Mode()) {
+ reportParseError(".cprestore is not supported in Mips16 mode");
+ return false;
+ }
+
+ // Get the stack offset value.
+ const MCExpr *StackOffset;
+ int64_t StackOffsetVal;
+ if (Parser.parseExpression(StackOffset)) {
+ reportParseError("expected stack offset value");
+ return false;
+ }
+
+ if (!StackOffset->evaluateAsAbsolute(StackOffsetVal)) {
+ reportParseError("stack offset is not an absolute expression");
+ return false;
+ }
+
+ if (StackOffsetVal < 0) {
+ Warning(Loc, ".cprestore with negative stack offset has no effect");
+ IsCpRestoreSet = false;
+ } else {
+ IsCpRestoreSet = true;
+ CpRestoreOffset = StackOffsetVal;
+ }
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ // Store the $gp on the stack.
+ SmallVector<MCInst, 3> StoreInsts;
+ createCpRestoreMemOp(false /*IsLoad*/, CpRestoreOffset /*StackOffset*/, Loc,
+ StoreInsts);
+
+ getTargetStreamer().emitDirectiveCpRestore(StoreInsts, CpRestoreOffset);
+ Parser.Lex(); // Consume the EndOfStatement.
+ return false;
+}
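As a usage sketch (O32 PIC only; the expansion is the one described by the MipsTargetELFStreamer change later in this patch, and the offset is illustrative):

// Assembly-level effect of the directive handled above:
//     .cprestore 16
// expands to
//     sw   $gp, 16($sp)
// and the assembler additionally reloads $gp with "lw $gp, 16($sp)" after
// each jal, so the caller's $gp survives PIC calls.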
+
bool MipsAsmParser::parseDirectiveCPSetup() {
MCAsmParser &Parser = getParser();
unsigned FuncReg;
@@ -4427,16 +5395,19 @@ bool MipsAsmParser::parseDirectiveCPSetup() {
ResTy = parseAnyRegister(TmpReg);
if (ResTy == MatchOperand_NoMatch) {
- const AsmToken &Tok = Parser.getTok();
- if (Tok.is(AsmToken::Integer)) {
- Save = Tok.getIntVal();
- SaveIsReg = false;
- Parser.Lex();
- } else {
- reportParseError("expected save register or stack offset");
+ const MCExpr *OffsetExpr;
+ int64_t OffsetVal;
+ SMLoc ExprLoc = getLexer().getLoc();
+
+ if (Parser.parseExpression(OffsetExpr) ||
+ !OffsetExpr->evaluateAsAbsolute(OffsetVal)) {
+ reportParseError(ExprLoc, "expected save register or stack offset");
Parser.eatToEndOfStatement();
return false;
}
+
+ Save = OffsetVal;
+ SaveIsReg = false;
} else {
MipsOperand &SaveOpnd = static_cast<MipsOperand &>(*TmpReg[0]);
if (!SaveOpnd.isGPRAsmReg()) {
@@ -4462,11 +5433,20 @@ bool MipsAsmParser::parseDirectiveCPSetup() {
}
const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr);
+ CpSaveLocation = Save;
+ CpSaveLocationIsRegister = SaveIsReg;
+
getTargetStreamer().emitDirectiveCpsetup(FuncReg, Save, Ref->getSymbol(),
SaveIsReg);
return false;
}
+bool MipsAsmParser::parseDirectiveCPReturn() {
+ getTargetStreamer().emitDirectiveCpreturn(CpSaveLocation,
+ CpSaveLocationIsRegister);
+ return false;
+}
+
bool MipsAsmParser::parseDirectiveNaN() {
MCAsmParser &Parser = getParser();
if (getLexer().isNot(AsmToken::EndOfStatement)) {
@@ -4655,6 +5635,9 @@ bool MipsAsmParser::parseDirectiveOption() {
StringRef Option = Tok.getIdentifier();
if (Option == "pic0") {
+ // MipsAsmParser needs to know if the current PIC mode changes.
+ IsPicEnabled = false;
+
getTargetStreamer().emitDirectiveOptionPic0();
Parser.Lex();
if (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
@@ -4666,6 +5649,9 @@ bool MipsAsmParser::parseDirectiveOption() {
}
if (Option == "pic2") {
+ // MipsAsmParser needs to know if the current PIC mode changes.
+ IsPicEnabled = true;
+
getTargetStreamer().emitDirectiveOptionPic2();
Parser.Lex();
if (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
@@ -4924,6 +5910,8 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
if (IDVal == ".cpload")
return parseDirectiveCpLoad(DirectiveID.getLoc());
+ if (IDVal == ".cprestore")
+ return parseDirectiveCpRestore(DirectiveID.getLoc());
if (IDVal == ".dword") {
parseDataDirective(8, DirectiveID.getLoc());
return false;
@@ -4974,6 +5962,7 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
getTargetStreamer().emitDirectiveEnt(*Sym);
CurrentFn = Sym;
+ IsCpRestoreSet = false;
return false;
}
@@ -5002,6 +5991,7 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
getTargetStreamer().emitDirectiveEnd(SymbolName);
CurrentFn = nullptr;
+ IsCpRestoreSet = false;
return false;
}
@@ -5073,6 +6063,7 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
getTargetStreamer().emitFrame(StackReg, FrameSizeVal,
ReturnRegOpnd.getGPR32Reg());
+ IsCpRestoreSet = false;
return false;
}
@@ -5173,6 +6164,9 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
if (IDVal == ".cpsetup")
return parseDirectiveCPSetup();
+ if (IDVal == ".cpreturn")
+ return parseDirectiveCPReturn();
+
if (IDVal == ".module")
return parseDirectiveModule();
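For the N32/N64 counterpart, .cpsetup records where the old $gp was saved (CpSaveLocation) so that .cpreturn can restore it later. A usage sketch, with the operand order as parsed by parseDirectiveCPSetup and illustrative register numbers:

//   .cpsetup $25, $2, my_func   # save old $gp in $2, set up $gp for my_func
//   ...                         # (the save location may also be a stack offset)
//   .cpreturn                   # restore $gp from the recorded location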
diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index a34ba3bd0ee8..3c1a771f97e9 100644
--- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -229,6 +229,13 @@ static DecodeStatus DecodeBranchTargetMM(MCInst &Inst,
uint64_t Address,
const void *Decoder);
+// DecodeBranchTarget26MM - Decode microMIPS branch offset, which is
+// shifted left by 1 bit.
+static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst,
+ unsigned Offset,
+ uint64_t Address,
+ const void *Decoder);
+
// DecodeJumpTargetMM - Decode microMIPS jump target, which is
// shifted left by 1 bit.
static DecodeStatus DecodeJumpTargetMM(MCInst &Inst,
@@ -241,17 +248,42 @@ static DecodeStatus DecodeMem(MCInst &Inst,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeMemEVA(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeLoadByte9(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeLoadByte15(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
static DecodeStatus DecodeCacheOp(MCInst &Inst,
unsigned Insn,
uint64_t Address,
const void *Decoder);
-static DecodeStatus DecodeCacheOpR6(MCInst &Inst,
+static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeCacheOpMM(MCInst &Inst,
unsigned Insn,
uint64_t Address,
const void *Decoder);
-static DecodeStatus DecodeCacheOpMM(MCInst &Inst,
+static DecodeStatus DecodeStoreEvaOpMM(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodePrefeOpMM(MCInst &Inst,
unsigned Insn,
uint64_t Address,
const void *Decoder);
@@ -261,6 +293,11 @@ static DecodeStatus DecodeSyncI(MCInst &Inst,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeSynciR6(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
@@ -284,6 +321,11 @@ static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeMemMMImm9(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
static DecodeStatus DecodeMemMMImm12(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@@ -330,6 +372,11 @@ static DecodeStatus DecodeLiSimm7(MCInst &Inst,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodePOOL16BEncodedField(MCInst &Inst,
+ unsigned Value,
+ uint64_t Address,
+ const void *Decoder);
+
static DecodeStatus DecodeSimm4(MCInst &Inst,
unsigned Value,
uint64_t Address,
@@ -340,23 +387,15 @@ static DecodeStatus DecodeSimm16(MCInst &Inst,
uint64_t Address,
const void *Decoder);
-// Decode the immediate field of an LSA instruction which
-// is off by one.
-static DecodeStatus DecodeLSAImm(MCInst &Inst,
- unsigned Insn,
- uint64_t Address,
- const void *Decoder);
+template <unsigned Bits, int Offset>
+static DecodeStatus DecodeUImmWithOffset(MCInst &Inst, unsigned Value,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeInsSize(MCInst &Inst,
unsigned Insn,
uint64_t Address,
const void *Decoder);
-static DecodeStatus DecodeExtSize(MCInst &Inst,
- unsigned Insn,
- uint64_t Address,
- const void *Decoder);
-
static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
@@ -830,9 +869,24 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
if (IsMicroMips) {
Result = readInstruction16(Bytes, Address, Size, Insn, IsBigEndian);
+ if (Result == MCDisassembler::Fail)
+ return MCDisassembler::Fail;
+
+ if (hasMips32r6()) {
+ DEBUG(dbgs() << "Trying MicroMipsR616 table (16-bit instructions):\n");
+ // Calling the auto-generated decoder function for microMIPS32R6
+ // (and microMIPS64R6) 16-bit instructions.
+ Result = decodeInstruction(DecoderTableMicroMipsR616, Instr, Insn,
+ Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 2;
+ return Result;
+ }
+ }
DEBUG(dbgs() << "Trying MicroMips16 table (16-bit instructions):\n");
- // Calling the auto-generated decoder function.
+ // Calling the auto-generated decoder function for microMIPS 16-bit
+ // instructions.
Result = decodeInstruction(DecoderTableMicroMips16, Instr, Insn, Address,
this, STI);
if (Result != MCDisassembler::Fail) {
@@ -847,24 +901,33 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
if (hasMips32r6()) {
DEBUG(dbgs() << "Trying MicroMips32r632 table (32-bit instructions):\n");
// Calling the auto-generated decoder function.
- Result = decodeInstruction(DecoderTableMicroMips32r632, Instr, Insn, Address,
- this, STI);
- } else {
- DEBUG(dbgs() << "Trying MicroMips32 table (32-bit instructions):\n");
- // Calling the auto-generated decoder function.
- Result = decodeInstruction(DecoderTableMicroMips32, Instr, Insn, Address,
+ Result = decodeInstruction(DecoderTableMicroMipsR632, Instr, Insn, Address,
this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ return Result;
+ }
}
+
+ DEBUG(dbgs() << "Trying MicroMips32 table (32-bit instructions):\n");
+ // Calling the auto-generated decoder function.
+ Result = decodeInstruction(DecoderTableMicroMips32, Instr, Insn, Address,
+ this, STI);
if (Result != MCDisassembler::Fail) {
Size = 4;
return Result;
}
+ // This is an invalid instruction. Let the disassembler move forward by the
+ // minimum instruction size.
+ Size = 2;
return MCDisassembler::Fail;
}
Result = readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, false);
- if (Result == MCDisassembler::Fail)
+ if (Result == MCDisassembler::Fail) {
+ Size = 4;
return MCDisassembler::Fail;
+ }
if (hasCOP3()) {
DEBUG(dbgs() << "Trying COP3_ table (32-bit opcodes):\n");
@@ -925,6 +988,7 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
return Result;
}
+ Size = 4;
return MCDisassembler::Fail;
}
@@ -1079,10 +1143,66 @@ static DecodeStatus DecodeMem(MCInst &Inst,
Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
- if(Inst.getOpcode() == Mips::SC ||
- Inst.getOpcode() == Mips::SCD){
+ if (Inst.getOpcode() == Mips::SC ||
+ Inst.getOpcode() == Mips::SCD)
Inst.addOperand(MCOperand::createReg(Reg));
- }
+
+ Inst.addOperand(MCOperand::createReg(Reg));
+ Inst.addOperand(MCOperand::createReg(Base));
+ Inst.addOperand(MCOperand::createImm(Offset));
+
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemEVA(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder) {
+ int Offset = SignExtend32<9>(Insn >> 7);
+ unsigned Reg = fieldFromInstruction(Insn, 16, 5);
+ unsigned Base = fieldFromInstruction(Insn, 21, 5);
+
+ Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+ Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+ if (Inst.getOpcode() == Mips::SCE)
+ Inst.addOperand(MCOperand::createReg(Reg));
+
+ Inst.addOperand(MCOperand::createReg(Reg));
+ Inst.addOperand(MCOperand::createReg(Base));
+ Inst.addOperand(MCOperand::createImm(Offset));
+
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeLoadByte9(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder) {
+ int Offset = SignExtend32<9>(Insn & 0x1ff);
+ unsigned Base = fieldFromInstruction(Insn, 16, 5);
+ unsigned Reg = fieldFromInstruction(Insn, 21, 5);
+
+ Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+ Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+
+ Inst.addOperand(MCOperand::createReg(Reg));
+ Inst.addOperand(MCOperand::createReg(Base));
+ Inst.addOperand(MCOperand::createImm(Offset));
+
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeLoadByte15(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder) {
+ int Offset = SignExtend32<16>(Insn & 0xffff);
+ unsigned Base = fieldFromInstruction(Insn, 16, 5);
+ unsigned Reg = fieldFromInstruction(Insn, 21, 5);
+
+ Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+ Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
Inst.addOperand(MCOperand::createReg(Reg));
Inst.addOperand(MCOperand::createReg(Base));
@@ -1125,11 +1245,28 @@ static DecodeStatus DecodeCacheOpMM(MCInst &Inst,
return MCDisassembler::Success;
}
-static DecodeStatus DecodeCacheOpR6(MCInst &Inst,
+static DecodeStatus DecodePrefeOpMM(MCInst &Inst,
unsigned Insn,
uint64_t Address,
const void *Decoder) {
- int Offset = fieldFromInstruction(Insn, 7, 9);
+ int Offset = SignExtend32<9>(Insn & 0x1ff);
+ unsigned Base = fieldFromInstruction(Insn, 16, 5);
+ unsigned Hint = fieldFromInstruction(Insn, 21, 5);
+
+ Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+ Inst.addOperand(MCOperand::createReg(Base));
+ Inst.addOperand(MCOperand::createImm(Offset));
+ Inst.addOperand(MCOperand::createImm(Hint));
+
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder) {
+ int Offset = SignExtend32<9>(Insn >> 7);
unsigned Hint = fieldFromInstruction(Insn, 16, 5);
unsigned Base = fieldFromInstruction(Insn, 21, 5);
@@ -1142,6 +1279,24 @@ static DecodeStatus DecodeCacheOpR6(MCInst &Inst,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeStoreEvaOpMM(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder) {
+ int Offset = SignExtend32<9>(Insn & 0x1ff);
+ unsigned Reg = fieldFromInstruction(Insn, 21, 5);
+ unsigned Base = fieldFromInstruction(Insn, 16, 5);
+
+ Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+ Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+ Inst.addOperand(MCOperand::createReg(Reg));
+ Inst.addOperand(MCOperand::createReg(Base));
+ Inst.addOperand(MCOperand::createImm(Offset));
+
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeSyncI(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@@ -1157,6 +1312,21 @@ static DecodeStatus DecodeSyncI(MCInst &Inst,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeSynciR6(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder) {
+ int Immediate = SignExtend32<16>(Insn & 0xffff);
+ unsigned Base = fieldFromInstruction(Insn, 16, 5);
+
+ Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+ Inst.addOperand(MCOperand::createReg(Base));
+ Inst.addOperand(MCOperand::createImm(Immediate));
+
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
int Offset = SignExtend32<10>(fieldFromInstruction(Insn, 16, 10));
@@ -1220,8 +1390,11 @@ static DecodeStatus DecodeMemMMImm4(MCInst &Inst,
return MCDisassembler::Fail;
break;
case Mips::SB16_MM:
+ case Mips::SB16_MMR6:
case Mips::SH16_MM:
+ case Mips::SH16_MMR6:
case Mips::SW16_MM:
+ case Mips::SW16_MMR6:
if (DecodeGPRMM16ZeroRegisterClass(Inst, Reg, Address, Decoder)
== MCDisassembler::Fail)
return MCDisassembler::Fail;
@@ -1240,14 +1413,17 @@ static DecodeStatus DecodeMemMMImm4(MCInst &Inst,
Inst.addOperand(MCOperand::createImm(Offset));
break;
case Mips::SB16_MM:
+ case Mips::SB16_MMR6:
Inst.addOperand(MCOperand::createImm(Offset));
break;
case Mips::LHU16_MM:
case Mips::SH16_MM:
+ case Mips::SH16_MMR6:
Inst.addOperand(MCOperand::createImm(Offset << 1));
break;
case Mips::LW16_MM:
case Mips::SW16_MM:
+ case Mips::SW16_MMR6:
Inst.addOperand(MCOperand::createImm(Offset << 2));
break;
}
@@ -1291,7 +1467,16 @@ static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst,
unsigned Insn,
uint64_t Address,
const void *Decoder) {
- int Offset = SignExtend32<4>(Insn & 0xf);
+ int Offset;
+ switch (Inst.getOpcode()) {
+ case Mips::LWM16_MMR6:
+ case Mips::SWM16_MMR6:
+ Offset = fieldFromInstruction(Insn, 4, 4);
+ break;
+ default:
+ Offset = SignExtend32<4>(Insn & 0xf);
+ break;
+ }
if (DecodeRegListOperand16(Inst, Insn, Address, Decoder)
== MCDisassembler::Fail)
@@ -1303,6 +1488,27 @@ static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeMemMMImm9(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder) {
+ int Offset = SignExtend32<9>(Insn & 0x1ff);
+ unsigned Reg = fieldFromInstruction(Insn, 21, 5);
+ unsigned Base = fieldFromInstruction(Insn, 16, 5);
+
+ Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+ Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+ if (Inst.getOpcode() == Mips::SCE_MM)
+ Inst.addOperand(MCOperand::createReg(Reg));
+
+ Inst.addOperand(MCOperand::createReg(Reg));
+ Inst.addOperand(MCOperand::createReg(Base));
+ Inst.addOperand(MCOperand::createImm(Offset));
+
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeMemMMImm12(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@@ -1659,6 +1865,16 @@ static DecodeStatus DecodeBranchTargetMM(MCInst &Inst,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst,
+ unsigned Offset,
+ uint64_t Address,
+ const void *Decoder) {
+ int32_t BranchOffset = SignExtend32<26>(Offset) << 1;
+
+ Inst.addOperand(MCOperand::createImm(BranchOffset));
+ return MCDisassembler::Success;
+}
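The decoder sign-extends the 26-bit field and restores the implicit low bit of the halfword-scaled offset. A standalone sketch of the same computation without the LLVM SignExtend32 helper:

#include <cstdint>

static inline int32_t decodeBranch26MM(uint32_t Field) {
  uint32_t Imm = Field & 0x03FFFFFFu;                                // 26-bit field
  int32_t Offset = static_cast<int32_t>(Imm ^ 0x02000000u) - 0x02000000; // sign-extend
  return Offset * 2;                       // the encoding drops the low offset bit
}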
+
static DecodeStatus DecodeJumpTargetMM(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@@ -1700,6 +1916,14 @@ static DecodeStatus DecodeLiSimm7(MCInst &Inst,
return MCDisassembler::Success;
}
+static DecodeStatus DecodePOOL16BEncodedField(MCInst &Inst,
+ unsigned Value,
+ uint64_t Address,
+ const void *Decoder) {
+ Inst.addOperand(MCOperand::createImm(Value == 0x0 ? 8 : Value));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeSimm4(MCInst &Inst,
unsigned Value,
uint64_t Address,
@@ -1716,12 +1940,12 @@ static DecodeStatus DecodeSimm16(MCInst &Inst,
return MCDisassembler::Success;
}
-static DecodeStatus DecodeLSAImm(MCInst &Inst,
- unsigned Insn,
- uint64_t Address,
- const void *Decoder) {
- // We add one to the immediate field as it was encoded as 'imm - 1'.
- Inst.addOperand(MCOperand::createImm(Insn + 1));
+template <unsigned Bits, int Offset>
+static DecodeStatus DecodeUImmWithOffset(MCInst &Inst, unsigned Value,
+ uint64_t Address,
+ const void *Decoder) {
+ Value &= ((1 << Bits) - 1);
+ Inst.addOperand(MCOperand::createImm(Value + Offset));
return MCDisassembler::Success;
}
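This template generalizes the removed DecodeLSAImm: the field is masked to Bits and biased by Offset before being added as an operand. A standalone sketch of the arithmetic, with LSA's "encoded as value - 1" field as a plausible (hypothetical here) instantiation:

template <unsigned Bits, int Offset>
static unsigned decodeBiasedUImm(unsigned Value) {
  Value &= (1u << Bits) - 1;   // keep only the encoded field
  return Value + Offset;       // undo the encoding bias
}

// e.g. decodeBiasedUImm<2, 1>(Field) would recover LSA's shift amount, which
// the old DecodeLSAImm handled by adding one to the raw field.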
@@ -1736,15 +1960,6 @@ static DecodeStatus DecodeInsSize(MCInst &Inst,
return MCDisassembler::Success;
}
-static DecodeStatus DecodeExtSize(MCInst &Inst,
- unsigned Insn,
- uint64_t Address,
- const void *Decoder) {
- int Size = (int) Insn + 1;
- Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Size)));
- return MCDisassembler::Success;
-}
-
static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
Inst.addOperand(MCOperand::createImm(SignExtend32<19>(Insn) * 4));
@@ -1792,15 +2007,21 @@ static DecodeStatus DecodeRegListOperand(MCInst &Inst,
uint64_t Address,
const void *Decoder) {
unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3, Mips::S4, Mips::S5,
- Mips::S6, Mips::FP};
+ Mips::S6, Mips::S7, Mips::FP};
unsigned RegNum;
unsigned RegLst = fieldFromInstruction(Insn, 21, 5);
+
// Empty register lists are not allowed.
if (RegLst == 0)
return MCDisassembler::Fail;
RegNum = RegLst & 0xf;
+
+ // RegLst values 10-15 and 26-31 are reserved.
+ if (RegNum > 9)
+ return MCDisassembler::Fail;
+
for (unsigned i = 0; i < RegNum; i++)
Inst.addOperand(MCOperand::createReg(Regs[i]));
@@ -1814,7 +2035,16 @@ static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn,
uint64_t Address,
const void *Decoder) {
unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3};
- unsigned RegLst = fieldFromInstruction(Insn, 4, 2);
+ unsigned RegLst;
+ switch(Inst.getOpcode()) {
+ default:
+ RegLst = fieldFromInstruction(Insn, 4, 2);
+ break;
+ case Mips::LWM16_MMR6:
+ case Mips::SWM16_MMR6:
+ RegLst = fieldFromInstruction(Insn, 8, 2);
+ break;
+ }
unsigned RegNum = RegLst & 0x3;
for (unsigned i = 0; i <= RegNum; i++)
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
index a5637b16b636..a7b7d2e080ee 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
@@ -235,7 +235,9 @@ printMemOperand(const MCInst *MI, int opNum, raw_ostream &O) {
case Mips::SWM32_MM:
case Mips::LWM32_MM:
case Mips::SWM16_MM:
+ case Mips::SWM16_MMR6:
case Mips::LWM16_MM:
+ case Mips::LWM16_MMR6:
opNum = MI->getNumOperands() - 2;
break;
}
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
index 713f35c70830..0e61ea61899a 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
@@ -73,8 +73,6 @@ enum CondCode {
const char *MipsFCCToString(Mips::CondCode CC);
} // end namespace Mips
-class TargetMachine;
-
class MipsInstPrinter : public MCInstPrinter {
public:
MipsInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
index 8e6c9e69b223..cdcc3923b81e 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
@@ -23,7 +23,7 @@ static const MCPhysReg Mips64IntRegs[8] = {
Mips::T0_64, Mips::T1_64, Mips::T2_64, Mips::T3_64};
}
-const ArrayRef<MCPhysReg> MipsABIInfo::GetByValArgRegs() const {
+ArrayRef<MCPhysReg> MipsABIInfo::GetByValArgRegs() const {
if (IsO32())
return makeArrayRef(O32IntRegs);
if (IsN32() || IsN64())
@@ -31,7 +31,7 @@ const ArrayRef<MCPhysReg> MipsABIInfo::GetByValArgRegs() const {
llvm_unreachable("Unhandled ABI");
}
-const ArrayRef<MCPhysReg> MipsABIInfo::GetVarArgRegs() const {
+ArrayRef<MCPhysReg> MipsABIInfo::GetVarArgRegs() const {
if (IsO32())
return makeArrayRef(O32IntRegs);
if (IsN32() || IsN64())
@@ -78,7 +78,6 @@ MipsABIInfo MipsABIInfo::computeTargetABI(const Triple &TT, StringRef CPU,
.Case("mips32r3", MipsABIInfo::O32())
.Case("mips32r5", MipsABIInfo::O32())
.Case("mips32r6", MipsABIInfo::O32())
- .Case("mips16", MipsABIInfo::O32())
.Case("mips3", MipsABIInfo::N64())
.Case("mips4", MipsABIInfo::N64())
.Case("mips5", MipsABIInfo::N64())
@@ -107,6 +106,10 @@ unsigned MipsABIInfo::GetNullPtr() const {
return ArePtrs64bit() ? Mips::ZERO_64 : Mips::ZERO;
}
+unsigned MipsABIInfo::GetZeroReg() const {
+ return AreGprs64bit() ? Mips::ZERO_64 : Mips::ZERO;
+}
+
unsigned MipsABIInfo::GetPtrAdduOp() const {
return ArePtrs64bit() ? Mips::DADDu : Mips::ADDu;
}
@@ -115,6 +118,10 @@ unsigned MipsABIInfo::GetPtrAddiuOp() const {
return ArePtrs64bit() ? Mips::DADDiu : Mips::ADDiu;
}
+unsigned MipsABIInfo::GetGPRMoveOp() const {
+ return ArePtrs64bit() ? Mips::OR64 : Mips::OR;
+}
+
unsigned MipsABIInfo::GetEhDataReg(unsigned I) const {
static const unsigned EhDataReg[] = {
Mips::A0, Mips::A1, Mips::A2, Mips::A3
diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h
index 40c5681acc17..ffa2c765e79b 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h
@@ -47,10 +47,10 @@ public:
ABI GetEnumValue() const { return ThisABI; }
/// The registers to use for byval arguments.
- const ArrayRef<MCPhysReg> GetByValArgRegs() const;
+ ArrayRef<MCPhysReg> GetByValArgRegs() const;
/// The registers to use for the variable argument list.
- const ArrayRef<MCPhysReg> GetVarArgRegs() const;
+ ArrayRef<MCPhysReg> GetVarArgRegs() const;
/// Obtain the size of the area allocated by the callee for arguments.
/// CallingConv::FastCall affects the value for O32.
@@ -67,9 +67,12 @@ public:
unsigned GetFramePtr() const;
unsigned GetBasePtr() const;
unsigned GetNullPtr() const;
+ unsigned GetZeroReg() const;
unsigned GetPtrAdduOp() const;
unsigned GetPtrAddiuOp() const;
+ unsigned GetGPRMoveOp() const;
inline bool ArePtrs64bit() const { return IsN64(); }
+ inline bool AreGprs64bit() const { return IsN32() || IsN64(); }
unsigned GetEhDataReg(unsigned I) const;
};
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index 328e71720cac..e4865e2455ee 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -63,15 +63,19 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// address range. Forcing a signed division because Value can be negative.
Value = (int64_t)Value / 4;
// We now check if Value can be encoded as a 16-bit signed immediate.
- if (!isIntN(16, Value) && Ctx)
- Ctx->reportFatalError(Fixup.getLoc(), "out of range PC16 fixup");
+ if (!isInt<16>(Value) && Ctx) {
+ Ctx->reportError(Fixup.getLoc(), "out of range PC16 fixup");
+ return 0;
+ }
break;
case Mips::fixup_MIPS_PC19_S2:
// Forcing a signed division because Value can be negative.
Value = (int64_t)Value / 4;
// We now check if Value can be encoded as a 19-bit signed immediate.
- if (!isIntN(19, Value) && Ctx)
- Ctx->reportFatalError(Fixup.getLoc(), "out of range PC19 fixup");
+ if (!isInt<19>(Value) && Ctx) {
+ Ctx->reportError(Fixup.getLoc(), "out of range PC19 fixup");
+ return 0;
+ }
break;
case Mips::fixup_Mips_26:
// So far we are only using this type for jumps.
@@ -104,45 +108,57 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// Forcing a signed division because Value can be negative.
Value = (int64_t) Value / 2;
// We now check if Value can be encoded as a 7-bit signed immediate.
- if (!isIntN(7, Value) && Ctx)
- Ctx->reportFatalError(Fixup.getLoc(), "out of range PC7 fixup");
+ if (!isInt<7>(Value) && Ctx) {
+ Ctx->reportError(Fixup.getLoc(), "out of range PC7 fixup");
+ return 0;
+ }
break;
case Mips::fixup_MICROMIPS_PC10_S1:
Value -= 2;
// Forcing a signed division because Value can be negative.
Value = (int64_t) Value / 2;
// We now check if Value can be encoded as a 10-bit signed immediate.
- if (!isIntN(10, Value) && Ctx)
- Ctx->reportFatalError(Fixup.getLoc(), "out of range PC10 fixup");
+ if (!isInt<10>(Value) && Ctx) {
+ Ctx->reportError(Fixup.getLoc(), "out of range PC10 fixup");
+ return 0;
+ }
break;
case Mips::fixup_MICROMIPS_PC16_S1:
Value -= 4;
// Forcing a signed division because Value can be negative.
Value = (int64_t)Value / 2;
// We now check if Value can be encoded as a 16-bit signed immediate.
- if (!isIntN(16, Value) && Ctx)
- Ctx->reportFatalError(Fixup.getLoc(), "out of range PC16 fixup");
+ if (!isInt<16>(Value) && Ctx) {
+ Ctx->reportError(Fixup.getLoc(), "out of range PC16 fixup");
+ return 0;
+ }
break;
case Mips::fixup_MIPS_PC18_S3:
// Forcing a signed division because Value can be negative.
Value = (int64_t)Value / 8;
// We now check if Value can be encoded as a 18-bit signed immediate.
- if (!isIntN(18, Value) && Ctx)
- Ctx->reportFatalError(Fixup.getLoc(), "out of range PC18 fixup");
+ if (!isInt<18>(Value) && Ctx) {
+ Ctx->reportError(Fixup.getLoc(), "out of range PC18 fixup");
+ return 0;
+ }
break;
case Mips::fixup_MIPS_PC21_S2:
// Forcing a signed division because Value can be negative.
Value = (int64_t) Value / 4;
// We now check if Value can be encoded as a 21-bit signed immediate.
- if (!isIntN(21, Value) && Ctx)
- Ctx->reportFatalError(Fixup.getLoc(), "out of range PC21 fixup");
+ if (!isInt<21>(Value) && Ctx) {
+ Ctx->reportError(Fixup.getLoc(), "out of range PC21 fixup");
+ return 0;
+ }
break;
case Mips::fixup_MIPS_PC26_S2:
// Forcing a signed division because Value can be negative.
Value = (int64_t) Value / 4;
// We now check if Value can be encoded as a 26-bit signed immediate.
- if (!isIntN(26, Value) && Ctx)
- Ctx->reportFatalError(Fixup.getLoc(), "out of range PC26 fixup");
+ if (!isInt<26>(Value) && Ctx) {
+ Ctx->reportError(Fixup.getLoc(), "out of range PC26 fixup");
+ return 0;
+ }
break;
}
@@ -232,6 +248,18 @@ void MipsAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
}
}
+bool MipsAsmBackend::getFixupKind(StringRef Name, MCFixupKind &MappedKind) const {
+ if (Name == "R_MIPS_NONE") {
+ MappedKind = (MCFixupKind)Mips::fixup_Mips_NONE;
+ return true;
+ }
+ if (Name == "R_MIPS_32") {
+ MappedKind = FK_Data_4;
+ return true;
+ }
+ return MCAsmBackend::getFixupKind(Name, MappedKind);
+}
+
const MCFixupKindInfo &MipsAsmBackend::
getFixupKindInfo(MCFixupKind Kind) const {
const static MCFixupKindInfo LittleEndianInfos[Mips::NumTargetFixupKinds] = {
@@ -239,6 +267,7 @@ getFixupKindInfo(MCFixupKind Kind) const {
// MipsFixupKinds.h.
//
// name offset bits flags
+ { "fixup_Mips_NONE", 0, 0, 0 },
{ "fixup_Mips_16", 0, 16, 0 },
{ "fixup_Mips_32", 0, 32, 0 },
{ "fixup_Mips_REL32", 0, 32, 0 },
@@ -304,6 +333,7 @@ getFixupKindInfo(MCFixupKind Kind) const {
// MipsFixupKinds.h.
//
// name offset bits flags
+ { "fixup_Mips_NONE", 0, 0, 0 },
{ "fixup_Mips_16", 16, 16, 0 },
{ "fixup_Mips_32", 0, 32, 0 },
{ "fixup_Mips_REL32", 0, 32, 0 },
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
index b3d5a4964f86..1c9af9227ffe 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -41,6 +41,7 @@ public:
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
uint64_t Value, bool IsPCRel) const override;
+ bool getFixupKind(StringRef Name, MCFixupKind &MappedKind) const override;
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
unsigned getNumFixupKinds() const override {
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index 9b2952720edd..5b9f02b89be5 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -68,6 +68,8 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target,
unsigned Kind = (unsigned)Fixup.getKind();
switch (Kind) {
+ case Mips::fixup_Mips_NONE:
+ return ELF::R_MIPS_NONE;
case Mips::fixup_Mips_16:
case FK_Data_2:
return IsPCRel ? ELF::R_MIPS_PC16 : ELF::R_MIPS_16;
@@ -325,13 +327,24 @@ static void setMatch(MipsRelocationEntry &Hi, MipsRelocationEntry &Lo) {
// matching LO;
// - prefer LOs without a pair;
// - prefer LOs with higher offset;
+
+static int cmpRel(const ELFRelocationEntry *AP, const ELFRelocationEntry *BP) {
+ const ELFRelocationEntry &A = *AP;
+ const ELFRelocationEntry &B = *BP;
+ if (A.Offset != B.Offset)
+ return B.Offset - A.Offset;
+ if (B.Type != A.Type)
+ return A.Type - B.Type;
+ return 0;
+}
+
void MipsELFObjectWriter::sortRelocs(const MCAssembler &Asm,
std::vector<ELFRelocationEntry> &Relocs) {
if (Relocs.size() < 2)
return;
- // The default function sorts entries by Offset in descending order.
- MCELFObjectTargetWriter::sortRelocs(Asm, Relocs);
+ // Sorts entries by Offset in descending order.
+ array_pod_sort(Relocs.begin(), Relocs.end(), cmpRel);
// Init MipsRelocs from Relocs.
std::vector<MipsRelocationEntry> MipsRelocs;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
index b45d9cf621d7..e7d687e89a8a 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
@@ -63,7 +63,7 @@ void MipsELFStreamer::SwitchSection(MCSection *Section,
}
void MipsELFStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
- const SMLoc &Loc) {
+ SMLoc Loc) {
MCELFStreamer::EmitValueImpl(Value, Size, Loc);
Labels.clear();
}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
index af9311fa4288..a241cdebdcc8 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
@@ -60,8 +60,7 @@ public:
/// Overriding this function allows us to dismiss all labels that are
/// candidates for marking as microMIPS when .word directive is emitted.
- void EmitValueImpl(const MCExpr *Value, unsigned Size,
- const SMLoc &Loc) override;
+ void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override;
/// Emits all the option records stored up until the point it's called.
void EmitMipsOptionRecords();
diff --git a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
index e601963264de..3652f4bab0d4 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
@@ -23,8 +23,11 @@ namespace Mips {
// in MipsAsmBackend.cpp.
//
enum Fixups {
+ // Fixup resulting in R_MIPS_NONE.
+ fixup_Mips_NONE = FirstTargetFixupKind,
+
// Branch fixups resulting in R_MIPS_16.
- fixup_Mips_16 = FirstTargetFixupKind,
+ fixup_Mips_16,
// Pure 32 bit data fixup resulting in - R_MIPS_32.
fixup_Mips_32,
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
index 5d23fcbd7a44..d4ccf0349c16 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
@@ -17,13 +17,14 @@
#include "llvm/MC/MCAsmInfoELF.h"
namespace llvm {
- class Triple;
+class Triple;
- class MipsMCAsmInfo : public MCAsmInfoELF {
- void anchor() override;
- public:
- explicit MipsMCAsmInfo(const Triple &TheTriple);
- };
+class MipsMCAsmInfo : public MCAsmInfoELF {
+ void anchor() override;
+
+public:
+ explicit MipsMCAsmInfo(const Triple &TheTriple);
+};
} // namespace llvm
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index e36263d54ca4..4b030ebfce8c 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -190,6 +190,10 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
else
NewOpcode = Mips::Std2MicroMips(Opcode, Mips::Arch_micromips);
+ // Check whether it is Dsp instruction.
+ if (NewOpcode == -1)
+ NewOpcode = Mips::Dsp2MicroMips(Opcode, Mips::Arch_mmdsp);
+
if (NewOpcode != -1) {
if (Fixups.size() > N)
Fixups.pop_back();
@@ -346,6 +350,23 @@ getBranchTarget26OpValue(const MCInst &MI, unsigned OpNo,
return 0;
}
+/// getBranchTarget26OpValueMM - Return binary encoding of the branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::getBranchTarget26OpValueMM(
+ const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ // If the destination is an immediate, divide by 2.
+ if (MO.isImm())
+ return MO.getImm() >> 1;
+
+ // TODO: Push 26 PC fixup.
+ return 0;
+}
+
/// getJumpOffset16OpValue - Return binary encoding of the jump
/// target operand. If the machine operand requires relocation,
/// record the relocation and return zero.
@@ -745,7 +766,8 @@ getMemEncodingMMSPImm5Lsl2(const MCInst &MI, unsigned OpNo,
const MCSubtargetInfo &STI) const {
// Register is encoded in bits 9-5, offset is encoded in bits 4-0.
assert(MI.getOperand(OpNo).isReg() &&
- MI.getOperand(OpNo).getReg() == Mips::SP &&
+ (MI.getOperand(OpNo).getReg() == Mips::SP ||
+ MI.getOperand(OpNo).getReg() == Mips::SP_64) &&
"Unexpected base register!");
unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1),
Fixups, STI) >> 2;
@@ -769,6 +791,19 @@ getMemEncodingMMGPImm7Lsl2(const MCInst &MI, unsigned OpNo,
}
unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm9(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // Base register is encoded in bits 20-16, offset is encoded in bits 8-0.
+ assert(MI.getOperand(OpNo).isReg());
+ unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups,
+ STI) << 16;
+ unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo + 1), Fixups, STI);
+
+ return (OffBits & 0x1FF) | RegBits;
+}
+
+unsigned MipsMCCodeEmitter::
getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
@@ -792,6 +827,19 @@ getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo,
}
unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm16(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // Base register is encoded in bits 20-16, offset is encoded in bits 15-0.
+ assert(MI.getOperand(OpNo).isReg());
+ unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups,
+ STI) << 16;
+ unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI);
+
+ return (OffBits & 0xFFFF) | RegBits;
+}
+
+unsigned MipsMCCodeEmitter::
getMemEncodingMMImm4sp(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
@@ -801,7 +849,9 @@ getMemEncodingMMImm4sp(const MCInst &MI, unsigned OpNo,
default:
break;
case Mips::SWM16_MM:
+ case Mips::SWM16_MMR6:
case Mips::LWM16_MM:
+ case Mips::LWM16_MMR6:
OpNo = MI.getNumOperands() - 2;
break;
}
@@ -815,15 +865,6 @@ getMemEncodingMMImm4sp(const MCInst &MI, unsigned OpNo,
return ((OffBits >> 2) & 0x0F);
}
-unsigned
-MipsMCCodeEmitter::getSizeExtEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- assert(MI.getOperand(OpNo).isImm());
- unsigned SizeEncoding = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI);
- return SizeEncoding - 1;
-}
-
// FIXME: should be called getMSBEncoding
//
unsigned
@@ -838,13 +879,15 @@ MipsMCCodeEmitter::getSizeInsEncoding(const MCInst &MI, unsigned OpNo,
return Position + Size - 1;
}
+template <unsigned Bits, int Offset>
unsigned
-MipsMCCodeEmitter::getLSAImmEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
+MipsMCCodeEmitter::getUImmWithOffsetEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
assert(MI.getOperand(OpNo).isImm());
- // The immediate is encoded as 'immediate - 1'.
- return getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI) - 1;
+ unsigned Value = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI);
+ Value -= Offset;
+ return Value;
}
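This encoder is the inverse of the disassembler's new DecodeUImmWithOffset, so the two should round-trip whenever the operand minus Offset fits in Bits. A minimal sketch of that property:

template <unsigned Bits, int Offset>
static bool roundTripsUImmWithOffset(unsigned Imm) {
  unsigned Encoded = Imm - Offset;                               // emitter side
  unsigned Decoded = (Encoded & ((1u << Bits) - 1)) + Offset;    // decoder side
  return Decoded == Imm;
}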
unsigned
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
index 911cc2f77a45..fdacd172e3a2 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
@@ -137,6 +137,13 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ // getBranchTarget26OpValueMM - Return binary encoding of the branch
+ // offset operand. If the machine operand requires relocation,
+ // record the relocation and return zero.
+ unsigned getBranchTarget26OpValueMM(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
// getJumpOffset16OpValue - Return binary encoding of the jump
// offset operand. If the machine operand requires relocation,
// record the relocation and return zero.
@@ -172,23 +179,27 @@ public:
unsigned getMemEncodingMMGPImm7Lsl2(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ unsigned getMemEncodingMMImm9(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ unsigned getMemEncodingMMImm16(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getMemEncodingMMImm4sp(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
- unsigned getSizeExtEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
unsigned getSizeInsEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
- // getLSAImmEncoding - Return binary encoding of LSA immediate.
- unsigned getLSAImmEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
+ /// Subtract Offset, then encode as an N-bit unsigned integer.
+ template <unsigned Bits, int Offset>
+ unsigned getUImmWithOffsetEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getSimm19Lsl2Encoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
index fd2ed17ee785..e889972c5c0e 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
@@ -51,8 +51,8 @@ public:
const MCAsmLayout *Layout,
const MCFixup *Fixup) const override;
void visitUsedExpr(MCStreamer &Streamer) const override;
- MCSection *findAssociatedSection() const override {
- return getSubExpr()->findAssociatedSection();
+ MCFragment *findAssociatedFragment() const override {
+ return getSubExpr()->findAssociatedFragment();
}
// There are no TLS MipsMCExprs at the moment.
diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index e4da2df75d47..e5fa7556053f 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -89,9 +89,15 @@ void MipsTargetStreamer::emitDirectiveSetHardFloat() {
void MipsTargetStreamer::emitDirectiveSetDsp() { forbidModuleDirective(); }
void MipsTargetStreamer::emitDirectiveSetNoDsp() { forbidModuleDirective(); }
void MipsTargetStreamer::emitDirectiveCpLoad(unsigned RegNo) {}
+void MipsTargetStreamer::emitDirectiveCpRestore(
+ SmallVector<MCInst, 3> &StoreInsts, int Offset) {
+ forbidModuleDirective();
+}
void MipsTargetStreamer::emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
const MCSymbol &Sym, bool IsReg) {
}
+void MipsTargetStreamer::emitDirectiveCpreturn(unsigned SaveLocation,
+ bool SaveLocationIsRegister) {}
void MipsTargetStreamer::emitDirectiveModuleFP() {}
@@ -358,6 +364,12 @@ void MipsTargetAsmStreamer::emitDirectiveCpLoad(unsigned RegNo) {
forbidModuleDirective();
}
+void MipsTargetAsmStreamer::emitDirectiveCpRestore(
+ SmallVector<MCInst, 3> &StoreInsts, int Offset) {
+ MipsTargetStreamer::emitDirectiveCpRestore(StoreInsts, Offset);
+ OS << "\t.cprestore\t" << Offset << "\n";
+}
+
void MipsTargetAsmStreamer::emitDirectiveCpsetup(unsigned RegNo,
int RegOrOffset,
const MCSymbol &Sym,
@@ -373,7 +385,13 @@ void MipsTargetAsmStreamer::emitDirectiveCpsetup(unsigned RegNo,
OS << ", ";
- OS << Sym.getName() << "\n";
+ OS << Sym.getName();
+ forbidModuleDirective();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveCpreturn(unsigned SaveLocation,
+ bool SaveLocationIsRegister) {
+ OS << "\t.cpreturn";
forbidModuleDirective();
}
@@ -595,8 +613,9 @@ void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) {
MCSectionELF *Sec = Context.getELFSection(".pdr", ELF::SHT_PROGBITS,
ELF::SHF_ALLOC | ELF::SHT_REL);
+ MCSymbol *Sym = Context.getOrCreateSymbol(Name);
const MCSymbolRefExpr *ExprRef =
- MCSymbolRefExpr::create(Name, MCSymbolRefExpr::VK_None, Context);
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Context);
MCA.registerSection(*Sec);
Sec->setAlignment(4);
@@ -622,10 +641,25 @@ void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) {
GPRInfoSet = FPRInfoSet = FrameInfoSet = false;
OS.PopSection();
+
+ // .end also implicitly sets the size.
+ MCSymbol *CurPCSym = Context.createTempSymbol();
+ OS.EmitLabel(CurPCSym);
+ const MCExpr *Size = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(CurPCSym, MCSymbolRefExpr::VK_None, Context),
+ ExprRef, Context);
+ int64_t AbsSize;
+ if (!Size->evaluateAsAbsolute(AbsSize, MCA))
+ llvm_unreachable("Function size must be evaluatable as absolute");
+ Size = MCConstantExpr::create(AbsSize, Context);
+ static_cast<MCSymbolELF *>(Sym)->setSize(Size);
}
void MipsTargetELFStreamer::emitDirectiveEnt(const MCSymbol &Symbol) {
GPRInfoSet = FPRInfoSet = FrameInfoSet = false;
+
+ // .ent also acts like an implicit '.type symbol, STT_FUNC'
+ static_cast<const MCSymbolELF &>(Symbol).setType(ELF::STT_FUNC);
}
void MipsTargetELFStreamer::emitDirectiveAbiCalls() {
@@ -752,6 +786,24 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) {
forbidModuleDirective();
}
+void MipsTargetELFStreamer::emitDirectiveCpRestore(
+ SmallVector<MCInst, 3> &StoreInsts, int Offset) {
+ MipsTargetStreamer::emitDirectiveCpRestore(StoreInsts, Offset);
+ // .cprestore offset
+ // When PIC mode is enabled and the O32 ABI is used, this directive expands
+ // to:
+ // sw $gp, offset($sp)
+ // and adds a corresponding LW after every JAL.
+
+ // Note that .cprestore is ignored if used with the N32 and N64 ABIs or if it
+ // is used in non-PIC mode.
+ if (!Pic || (getABI().IsN32() || getABI().IsN64()))
+ return;
+
+ for (const MCInst &Inst : StoreInsts)
+ getStreamer().EmitInstruction(Inst, STI);
+}
+
void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
int RegOrOffset,
const MCSymbol &Sym,
@@ -766,7 +818,7 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
// Either store the old $gp in a register or on the stack
if (IsReg) {
// move $save, $gpreg
- Inst.setOpcode(Mips::DADDu);
+ Inst.setOpcode(Mips::OR64);
Inst.addOperand(MCOperand::createReg(RegOrOffset));
Inst.addOperand(MCOperand::createReg(Mips::GP));
Inst.addOperand(MCOperand::createReg(Mips::ZERO));
@@ -810,6 +862,30 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
forbidModuleDirective();
}
+void MipsTargetELFStreamer::emitDirectiveCpreturn(unsigned SaveLocation,
+ bool SaveLocationIsRegister) {
+ // Only N32 and N64 emit anything for .cpreturn, and only when PIC is enabled.
+ if (!Pic || !(getABI().IsN32() || getABI().IsN64()))
+ return;
+
+ MCInst Inst;
+ // Either restore the old $gp from a register or on the stack
+ if (SaveLocationIsRegister) {
+ Inst.setOpcode(Mips::OR);
+ Inst.addOperand(MCOperand::createReg(Mips::GP));
+ Inst.addOperand(MCOperand::createReg(SaveLocation));
+ Inst.addOperand(MCOperand::createReg(Mips::ZERO));
+ } else {
+ Inst.setOpcode(Mips::LD);
+ Inst.addOperand(MCOperand::createReg(Mips::GP));
+ Inst.addOperand(MCOperand::createReg(Mips::SP));
+ Inst.addOperand(MCOperand::createImm(SaveLocation));
+ }
+ getStreamer().EmitInstruction(Inst, STI);
+
+ forbidModuleDirective();
+}
+
void MipsTargetELFStreamer::emitMipsAbiFlags() {
MCAssembler &MCA = getStreamer().getAssembler();
MCContext &Context = MCA.getContext();
diff --git a/lib/Target/Mips/MicroMips32r6InstrFormats.td b/lib/Target/Mips/MicroMips32r6InstrFormats.td
index 187a022b2563..400f6eef3fb0 100644
--- a/lib/Target/Mips/MicroMips32r6InstrFormats.td
+++ b/lib/Target/Mips/MicroMips32r6InstrFormats.td
@@ -16,6 +16,64 @@ class MMR6Arch<string opstr> {
string BaseOpcode = opstr;
}
+// Class used for microMIPS32r6 and microMIPS64r6 instructions.
+class MicroMipsR6Inst16 : PredicateControl {
+ string DecoderNamespace = "MicroMipsR6";
+ let InsnPredicates = [HasMicroMips32r6];
+}
+
+class BC16_FM_MM16R6 {
+ bits<10> offset;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x33;
+ let Inst{9-0} = offset;
+}
+
+class BEQZC_BNEZC_FM_MM16R6<bits<6> op> : MicroMipsR6Inst16 {
+ bits<3> rs;
+ bits<7> offset;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = op;
+ let Inst{9-7} = rs;
+ let Inst{6-0} = offset;
+}
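
The `let Inst{...}` assignments in formats such as BEQZC_BNEZC_FM_MM16R6 above are fixed-position bit packing of the 16-bit halfword. A standalone sketch of the equivalent packing is shown here; the function name is illustrative and not part of the patch.

  // Sketch only: pack the BEQZC16/BNEZC16 fields exactly as the TableGen
  // format above lays them out: op in bits 15-10, rs in 9-7, offset in 6-0.
  #include <cstdint>
  static uint16_t encodeBeqzcBnezc16(unsigned Op, unsigned Rs,
                                     unsigned Offset) {
    uint16_t Inst = 0;
    Inst |= (Op & 0x3F) << 10;      // Inst{15-10} = op
    Inst |= (Rs & 0x7) << 7;        // Inst{9-7}   = rs
    Inst |= (Offset & 0x7F);        // Inst{6-0}   = offset
    return Inst;
  }
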
+
+class POOL16C_JALRC_FM_MM16R6<bits<5> op> {
+ bits<5> rs;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x11;
+ let Inst{9-5} = rs;
+ let Inst{4-0} = op;
+}
+
+class POOL16C_JRCADDIUSP_FM_MM16R6<bits<5> op> {
+ bits<5> imm;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x11;
+ let Inst{9-5} = imm;
+ let Inst{4-0} = op;
+}
+
+class POOL16C_LWM_SWM_FM_MM16R6<bits<4> funct> {
+ bits<2> rt;
+ bits<4> addr;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x11;
+ let Inst{9-8} = rt;
+ let Inst{7-4} = addr;
+ let Inst{3-0} = funct;
+}
+
class POOL32A_BITSWAP_FM_MMR6<bits<6> funct> : MipsR6Inst {
bits<5> rd;
bits<5> rt;
@@ -71,6 +129,64 @@ class ADDI_FM_MMR6<string instr_asm, bits<6> op> : MMR6Arch<instr_asm> {
let Inst{15-0} = imm16;
}
+class POOL32C_ST_EVA_FM_MMR6<bits<6> op, bits<3> funct> : MipsR6Inst {
+ bits<21> addr;
+ bits<5> hint;
+ bits<5> base = addr{20-16};
+ bits<9> offset = addr{8-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = hint;
+ let Inst{20-16} = base;
+ let Inst{15-12} = 0b1010;
+ let Inst{11-9} = funct;
+ let Inst{8-0} = offset;
+}
+
+class LB32_FM_MMR6 : MipsR6Inst {
+ bits<21> addr;
+ bits<5> rt;
+ bits<5> base = addr{20-16};
+ bits<16> offset = addr{15-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b000111;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-0} = offset;
+}
+
+class LBU32_FM_MMR6 : MipsR6Inst {
+ bits<21> addr;
+ bits<5> rt;
+ bits<5> base = addr{20-16};
+ bits<16> offset = addr{15-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b000101;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-0} = offset;
+}
+
+class POOL32C_LB_LBU_FM_MMR6<bits<3> funct> : MipsR6Inst {
+ bits<21> addr;
+ bits<5> rt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b011000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = addr{20-16};
+ let Inst{15-12} = 0b0110;
+ let Inst{11-9} = funct;
+ let Inst{8-0} = addr{8-0};
+}
+
class SIGN_EXTEND_FM_MMR6<string instr_asm, bits<10> funct>
: MMR6Arch<instr_asm> {
bits<5> rd;
@@ -124,6 +240,69 @@ class POOL32A_FM_MMR6<bits<10> funct> : MipsR6Inst {
let Inst{9-0} = funct;
}
+class POOL32A_PAUSE_FM_MMR6<string instr_asm, bits<5> op> : MMR6Arch<instr_asm> {
+ bits<32> Inst;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = 0;
+ let Inst{20-16} = 0;
+ let Inst{15-11} = op;
+ let Inst{10-6} = 0;
+ let Inst{5-0} = 0;
+}
+
+class POOL32A_RDPGPR_FM_MMR6<bits<10> funct> {
+ bits<5> rt;
+ bits<5> rd;
+ bits<32> Inst;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rd;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0b111100;
+}
+
+class POOL32A_RDHWR_FM_MMR6 {
+ bits<5> rt;
+ bits<5> rs;
+ bits<3> sel;
+ bits<32> Inst;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-14} = 0;
+ let Inst{13-11} = sel;
+ let Inst{10} = 0;
+ let Inst{9-0} = 0b0111000000;
+}
+
+class POOL32A_SYNC_FM_MMR6 {
+ bits<5> stype;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = 0;
+ let Inst{20-16} = stype;
+ let Inst{15-6} = 0b0110101101;
+ let Inst{5-0} = 0b111100;
+}
+
+class POOL32I_SYNCI_FM_MMR6 {
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<16> immediate = addr{15-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010000;
+ let Inst{25-21} = 0b01100;
+ let Inst{20-16} = base;
+ let Inst{15-0} = immediate;
+}
+
class POOL32A_2R_FM_MMR6<bits<10> funct> : MipsR6Inst {
bits<5> rs;
bits<5> rt;
@@ -198,6 +377,78 @@ class POOL32A_LSA_FM<bits<6> funct> : MipsR6Inst {
let Inst{5-0} = funct;
}
+class SB32_SH32_STORE_FM_MMR6<bits<6> op> {
+ bits<5> rt;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<16> offset = addr{15-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-0} = offset;
+}
+
+class POOL32C_STORE_EVA_FM_MMR6<bits<3> funct> {
+ bits<5> rt;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<9> offset = addr{8-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b011000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-12} = 0b1010;
+ let Inst{11-9} = funct;
+ let Inst{8-0} = offset;
+}
+
+class LOAD_WORD_EVA_FM_MMR6<bits<3> funct> {
+ bits<5> rt;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<9> offset = addr{8-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b011000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-12} = 0b0110;
+ let Inst{11-9} = funct;
+ let Inst{8-0} = offset;
+}
+
+class LOAD_WORD_FM_MMR6 {
+ bits<5> rt;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<16> offset = addr{15-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b111111;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-0} = offset;
+}
+
+class LOAD_UPPER_IMM_FM_MMR6 {
+ bits<5> rt;
+ bits<16> imm16;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b000100;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = 0;
+ let Inst{15-0} = imm16;
+}
+
class CMP_BRANCH_1R_RT_OFF16_FM_MMR6<bits<6> funct> : MipsR6Inst {
bits<5> rt;
bits<16> offset;
@@ -222,12 +473,13 @@ class CMP_BRANCH_1R_BOTH_OFF16_FM_MMR6<bits<6> funct> : MipsR6Inst {
let Inst{15-0} = offset;
}
-class ERET_FM_MMR6<string instr_asm> : MMR6Arch<instr_asm> {
+class POOL32A_ERET_FM_MMR6<string instr_asm, bits<10> funct>
+ : MMR6Arch<instr_asm> {
bits<32> Inst;
let Inst{31-26} = 0x00;
let Inst{25-16} = 0x00;
- let Inst{15-6} = 0x3cd;
+ let Inst{15-6} = funct;
let Inst{5-0} = 0x3c;
}
@@ -262,7 +514,8 @@ class BARRIER_MMR6_ENC<string instr_asm, bits<5> op> : MMR6Arch<instr_asm> {
let Inst{5-0} = 0x0;
}
-class EIDI_MMR6_ENC<string instr_asm, bits<10> funct> : MMR6Arch<instr_asm> {
+class POOL32A_EIDI_MMR6_ENC<string instr_asm, bits<10> funct>
+ : MMR6Arch<instr_asm> {
bits<32> Inst;
bits<5> rt; // Actually rs but we're sharing code with the standard encodings which call it rt
@@ -287,3 +540,323 @@ class SHIFT_MMR6_ENC<string instr_asm, bits<10> funct, bit rotate> : MMR6Arch<in
let Inst{10} = rotate;
let Inst{9-0} = funct;
}
+
+class SW32_FM_MMR6<string instr_asm, bits<6> op> : MMR6Arch<instr_asm> {
+ bits<5> rt;
+ bits<21> addr;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = addr{20-16};
+ let Inst{15-0} = addr{15-0};
+}
+
+class POOL32C_SWE_FM_MMR6<string instr_asm, bits<6> op, bits<4> fmt,
+ bits<3> funct> : MMR6Arch<instr_asm> {
+ bits<5> rt;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<9> offset = addr{8-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-12} = fmt;
+ let Inst{11-9} = funct;
+ let Inst{8-0} = offset;
+}
+
+class POOL32F_ARITH_FM_MMR6<string instr_asm, bits<2> fmt, bits<8> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> ft;
+ bits<5> fs;
+ bits<5> fd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15-11} = fd;
+ let Inst{10} = 0;
+ let Inst{9-8} = fmt;
+ let Inst{7-0} = funct;
+}
+
+class POOL32F_ARITHF_FM_MMR6<string instr_asm, bits<2> fmt, bits<9> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> ft;
+ bits<5> fs;
+ bits<5> fd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15-11} = fd;
+ let Inst{10-9} = fmt;
+ let Inst{8-0} = funct;
+}
+
+class POOL32F_MOV_NEG_FM_MMR6<string instr_asm, bits<2> fmt, bits<7> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> ft;
+ bits<5> fs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15} = 0;
+ let Inst{14-13} = fmt;
+ let Inst{12-6} = funct;
+ let Inst{5-0} = 0b111011;
+}
+
+class POOL32F_MINMAX_FM<string instr_asm, bits<2> fmt, bits<9> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> ft;
+ bits<5> fs;
+ bits<5> fd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15-11} = fd;
+ let Inst{10-9} = fmt;
+ let Inst{8-0} = funct;
+}
+
+class POOL32F_CMP_FM<string instr_asm, bits<6> format, FIELD_CMP_COND Cond>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> ft;
+ bits<5> fs;
+ bits<5> fd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15-11} = fd;
+ let Inst{10-6} = Cond.Value;
+ let Inst{5-0} = format;
+}
+
+class POOL32F_CVT_LW_FM<string instr_asm, bit fmt, bits<8> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> ft;
+ bits<5> fs;
+
+ bits<32> Inst;
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15} = 0;
+ let Inst{14} = fmt;
+ let Inst{13-6} = funct;
+ let Inst{5-0} = 0b111011;
+}
+
+class POOL32F_CVT_DS_FM<string instr_asm, bits<2> fmt, bits<7> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> ft;
+ bits<5> fs;
+
+ bits<32> Inst;
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15} = 0;
+ let Inst{14-13} = fmt;
+ let Inst{12-6} = funct;
+ let Inst{5-0} = 0b111011;
+}
+
+class POOL32F_ABS_FM_MMR6<string instr_asm, bits<2> fmt, bits<7> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> ft;
+ bits<5> fs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15} = 0;
+ let Inst{14-13} = fmt;
+ let Inst{12-6} = funct;
+ let Inst{5-0} = 0b111011;
+}
+
+class POOL32F_MATH_FM_MMR6<string instr_asm, bits<1> fmt, bits<8> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> ft;
+ bits<5> fs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15} = 0;
+ let Inst{14} = fmt;
+ let Inst{13-6} = funct;
+ let Inst{5-0} = 0b111011;
+}
+
+class POOL16A_ADDU16_FM_MMR6 : MicroMipsR6Inst16 {
+ bits<3> rs;
+ bits<3> rt;
+ bits<3> rd;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0b000001;
+ let Inst{9-7} = rs;
+ let Inst{6-4} = rt;
+ let Inst{3-1} = rd;
+ let Inst{0} = 0;
+}
+
+class POOL16C_AND16_FM_MMR6 : MicroMipsR6Inst16 {
+ bits<3> rt;
+ bits<3> rs;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0b010001;
+ let Inst{9-7} = rt;
+ let Inst{6-4} = rs;
+ let Inst{3-0} = 0b0001;
+}
+
+class POOL16C_NOT16_FM_MMR6 : MicroMipsR6Inst16 {
+ bits<3> rt;
+ bits<3> rs;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x11;
+ let Inst{9-7} = rt;
+ let Inst{6-4} = rs;
+ let Inst{3-0} = 0b0000;
+}
+
+class POOL16C_OR16_XOR16_FM_MMR6<bits<4> op> {
+ bits<3> rt;
+ bits<3> rs;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0b010001;
+ let Inst{9-7} = rt;
+ let Inst{6-4} = rs;
+ let Inst{3-0} = op;
+}
+
+class POOL16C_BREAKPOINT_FM_MMR6<bits<6> op> {
+ bits<4> code_;
+ bits<16> Inst;
+
+ let Inst{15-10} = 0b010001;
+ let Inst{9-6} = code_;
+ let Inst{5-0} = op;
+}
+
+class POOL16A_SUBU16_FM_MMR6 {
+ bits<3> rs;
+ bits<3> rt;
+ bits<3> rd;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0b000001;
+ let Inst{9-7} = rs;
+ let Inst{6-4} = rt;
+ let Inst{3-1} = rd;
+ let Inst{0} = 0b1;
+}
+
+class POOL32A_WRPGPR_WSBH_FM_MMR6<bits<10> funct> : MipsR6Inst {
+ bits<5> rt;
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0x3c;
+}
+
+class POOL32F_RECIP_ROUND_FM_MMR6<string instr_asm, bits<1> fmt, bits<8> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> ft;
+ bits<5> fs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15} = 0;
+ let Inst{14} = fmt;
+ let Inst{13-6} = funct;
+ let Inst{5-0} = 0b111011;
+}
+
+class POOL32F_RINT_FM_MMR6<string instr_asm, bits<2> fmt>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> fs;
+ bits<5> fd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = fs;
+ let Inst{20-16} = fd;
+ let Inst{15-11} = 0;
+ let Inst{10-9} = fmt;
+ let Inst{8-0} = 0b000100000;
+}
+
+class POOL32F_SEL_FM_MMR6<string instr_asm, bits<2> fmt, bits<9> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> ft;
+ bits<5> fs;
+ bits<5> fd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15-11} = fd;
+ let Inst{10-9} = fmt;
+ let Inst{8-0} = funct;
+}
+
+class POOL32F_CLASS_FM_MMR6<string instr_asm, bits<2> fmt, bits<9> funct>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ bits<5> fs;
+ bits<5> fd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010101;
+ let Inst{25-21} = fs;
+ let Inst{20-16} = fd;
+ let Inst{15-11} = 0b00000;
+ let Inst{10-9} = fmt;
+ let Inst{8-0} = funct;
+}
diff --git a/lib/Target/Mips/MicroMips32r6InstrInfo.td b/lib/Target/Mips/MicroMips32r6InstrInfo.td
index 53bde1379e29..31b5db036daa 100644
--- a/lib/Target/Mips/MicroMips32r6InstrInfo.td
+++ b/lib/Target/Mips/MicroMips32r6InstrInfo.td
@@ -11,6 +11,13 @@
//
//===----------------------------------------------------------------------===//
+def brtarget26_mm : Operand<OtherVT> {
+ let EncoderMethod = "getBranchTarget26OpValueMM";
+ let OperandType = "OPERAND_PCREL";
+ let DecoderMethod = "DecodeBranchTarget26MM";
+ let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
//===----------------------------------------------------------------------===//
//
// Instruction Encodings
@@ -28,6 +35,9 @@ class ALIGN_MMR6_ENC : POOL32A_ALIGN_FM_MMR6<0b011111>;
class AUI_MMR6_ENC : AUI_FM_MMR6;
class BALC_MMR6_ENC : BRANCH_OFF26_FM<0b101101>;
class BC_MMR6_ENC : BRANCH_OFF26_FM<0b100101>;
+class BC16_MMR6_ENC : BC16_FM_MM16R6;
+class BEQZC16_MMR6_ENC : BEQZC_BNEZC_FM_MM16R6<0x23>;
+class BNEZC16_MMR6_ENC : BEQZC_BNEZC_FM_MM16R6<0x2b>;
class BITSWAP_MMR6_ENC : POOL32A_BITSWAP_FM_MMR6<0b101100>;
class BRK_MMR6_ENC : BREAK_MMR6_ENC<"break">;
class BEQZALC_MMR6_ENC : CMP_BRANCH_1R_RT_OFF16_FM_MMR6<0b011101>;
@@ -42,13 +52,19 @@ class CLZ_MMR6_ENC : SPECIAL_2R_FM_MMR6<0b010000>;
class DIV_MMR6_ENC : ARITH_FM_MMR6<"div", 0x118>;
class DIVU_MMR6_ENC : ARITH_FM_MMR6<"divu", 0x198>;
class EHB_MMR6_ENC : BARRIER_MMR6_ENC<"ehb", 0x3>;
-class EI_MMR6_ENC : EIDI_MMR6_ENC<"ei", 0x15d>;
-class ERET_MMR6_ENC : ERET_FM_MMR6<"eret">;
+class EI_MMR6_ENC : POOL32A_EIDI_MMR6_ENC<"ei", 0x15d>;
+class DI_MMR6_ENC : POOL32A_EIDI_MMR6_ENC<"di", 0b0100011101>;
+class ERET_MMR6_ENC : POOL32A_ERET_FM_MMR6<"eret", 0x3cd>;
+class DERET_MMR6_ENC : POOL32A_ERET_FM_MMR6<"deret", 0b1110001101>;
class ERETNC_MMR6_ENC : ERETNC_FM_MMR6<"eretnc">;
+class JALRC16_MMR6_ENC : POOL16C_JALRC_FM_MM16R6<0xb>;
class JIALC_MMR6_ENC : JMP_IDX_COMPACT_FM<0b100000>;
class JIC_MMR6_ENC : JMP_IDX_COMPACT_FM<0b101000>;
+class JRC16_MMR6_ENC: POOL16C_JALRC_FM_MM16R6<0x3>;
+class JRCADDIUSP_MMR6_ENC : POOL16C_JRCADDIUSP_FM_MM16R6<0x13>;
class LSA_MMR6_ENC : POOL32A_LSA_FM<0b001111>;
class LWPC_MMR6_ENC : PCREL19_FM_MMR6<0b01>;
+class LWM16_MMR6_ENC : POOL16C_LWM_SWM_FM_MM16R6<0x2>;
class MOD_MMR6_ENC : ARITH_FM_MMR6<"mod", 0x158>;
class MODU_MMR6_ENC : ARITH_FM_MMR6<"modu", 0x1d8>;
class MUL_MMR6_ENC : ARITH_FM_MMR6<"mul", 0x18>;
@@ -59,15 +75,99 @@ class NOR_MMR6_ENC : ARITH_FM_MMR6<"nor", 0x2d0>;
class OR_MMR6_ENC : ARITH_FM_MMR6<"or", 0x290>;
class ORI_MMR6_ENC : ADDI_FM_MMR6<"ori", 0x14>;
class PREF_MMR6_ENC : CACHE_PREF_FM_MMR6<0b011000, 0b0010>;
+class SB16_MMR6_ENC : LOAD_STORE_FM_MM16<0x22>;
class SEB_MMR6_ENC : SIGN_EXTEND_FM_MMR6<"seb", 0b0010101100>;
class SEH_MMR6_ENC : SIGN_EXTEND_FM_MMR6<"seh", 0b0011101100>;
class SELEQZ_MMR6_ENC : POOL32A_FM_MMR6<0b0101000000>;
class SELNEZ_MMR6_ENC : POOL32A_FM_MMR6<0b0110000000>;
+class SH16_MMR6_ENC : LOAD_STORE_FM_MM16<0x2a>;
class SLL_MMR6_ENC : SHIFT_MMR6_ENC<"sll", 0x00, 0b0>;
class SUB_MMR6_ENC : ARITH_FM_MMR6<"sub", 0x190>;
class SUBU_MMR6_ENC : ARITH_FM_MMR6<"subu", 0x1d0>;
+class SW_MMR6_ENC : SW32_FM_MMR6<"sw", 0x3e>;
+class SWE_MMR6_ENC : POOL32C_SWE_FM_MMR6<"swe", 0x18, 0xa, 0x7>;
+class SW16_MMR6_ENC : LOAD_STORE_FM_MM16<0x3a>;
+class SWM16_MMR6_ENC : POOL16C_LWM_SWM_FM_MM16R6<0xa>;
+class SWSP_MMR6_ENC : LOAD_STORE_SP_FM_MM16<0x32>;
+class PREFE_MMR6_ENC : POOL32C_ST_EVA_FM_MMR6<0b011000, 0b010>;
+class CACHEE_MMR6_ENC : POOL32C_ST_EVA_FM_MMR6<0b011000, 0b011>;
+class WRPGPR_MMR6_ENC : POOL32A_WRPGPR_WSBH_FM_MMR6<0x3c5>;
+class WSBH_MMR6_ENC : POOL32A_WRPGPR_WSBH_FM_MMR6<0x1ec>;
+class LB_MMR6_ENC : LB32_FM_MMR6;
+class LBU_MMR6_ENC : LBU32_FM_MMR6;
+class LBE_MMR6_ENC : POOL32C_LB_LBU_FM_MMR6<0b100>;
+class LBUE_MMR6_ENC : POOL32C_LB_LBU_FM_MMR6<0b000>;
+class PAUSE_MMR6_ENC : POOL32A_PAUSE_FM_MMR6<"pause", 0b00101>;
+class RDHWR_MMR6_ENC : POOL32A_RDHWR_FM_MMR6;
+class WAIT_MMR6_ENC : WAIT_FM_MM, MMR6Arch<"wait">;
+class SSNOP_MMR6_ENC : BARRIER_FM_MM<0x1>, MMR6Arch<"ssnop">;
+class SYNC_MMR6_ENC : POOL32A_SYNC_FM_MMR6;
+class SYNCI_MMR6_ENC : POOL32I_SYNCI_FM_MMR6, MMR6Arch<"synci">;
+class RDPGPR_MMR6_ENC : POOL32A_RDPGPR_FM_MMR6<0b1110000101>;
+class SDBBP_MMR6_ENC : SDBBP_FM_MM, MMR6Arch<"sdbbp">;
class XOR_MMR6_ENC : ARITH_FM_MMR6<"xor", 0x310>;
class XORI_MMR6_ENC : ADDI_FM_MMR6<"xori", 0x1c>;
+class ABS_S_MMR6_ENC : POOL32F_ABS_FM_MMR6<"abs.s", 0, 0b0001101>;
+class ABS_D_MMR6_ENC : POOL32F_ABS_FM_MMR6<"abs.d", 1, 0b0001101>;
+class FLOOR_L_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"floor.l.s", 0, 0b00001100>;
+class FLOOR_L_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"floor.l.d", 1, 0b00001100>;
+class FLOOR_W_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"floor.w.s", 0, 0b00101100>;
+class FLOOR_W_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"floor.w.d", 1, 0b00101100>;
+class CEIL_L_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"ceil.l.s", 0, 0b01001100>;
+class CEIL_L_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"ceil.l.d", 1, 0b01001100>;
+class CEIL_W_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"ceil.w.s", 0, 0b01101100>;
+class CEIL_W_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"ceil.w.d", 1, 0b01101100>;
+class TRUNC_L_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.l.s", 0, 0b10001100>;
+class TRUNC_L_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.l.d", 1, 0b10001100>;
+class TRUNC_W_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.w.s", 0, 0b10101100>;
+class TRUNC_W_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.w.d", 1, 0b10101100>;
+class SQRT_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"sqrt.s", 0, 0b00101000>;
+class SQRT_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"sqrt.d", 1, 0b00101000>;
+class RSQRT_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"rsqrt.s", 0, 0b00001000>;
+class RSQRT_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"rsqrt.d", 1, 0b00001000>;
+class SB_MMR6_ENC : SB32_SH32_STORE_FM_MMR6<0b000110>;
+class SBE_MMR6_ENC : POOL32C_STORE_EVA_FM_MMR6<0b100>;
+class SCE_MMR6_ENC : POOL32C_STORE_EVA_FM_MMR6<0b110>;
+class SH_MMR6_ENC : SB32_SH32_STORE_FM_MMR6<0b001110>;
+class SHE_MMR6_ENC : POOL32C_STORE_EVA_FM_MMR6<0b101>;
+class LLE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b110>;
+class LWE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b111>;
+class LW_MMR6_ENC : LOAD_WORD_FM_MMR6;
+class LUI_MMR6_ENC : LOAD_UPPER_IMM_FM_MMR6;
+class RECIP_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"recip.s", 0, 0b01001000>;
+class RECIP_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"recip.d", 1, 0b01001000>;
+class RINT_S_MMR6_ENC : POOL32F_RINT_FM_MMR6<"rint.s", 0>;
+class RINT_D_MMR6_ENC : POOL32F_RINT_FM_MMR6<"rint.d", 1>;
+class ROUND_L_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.l.s", 0,
+ 0b11001100>;
+class ROUND_L_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.l.d", 1,
+ 0b11001100>;
+class ROUND_W_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.w.s", 0,
+ 0b11101100>;
+class ROUND_W_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.w.d", 1,
+ 0b11101100>;
+class SEL_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"sel.s", 0, 0b010111000>;
+class SEL_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"sel.d", 1, 0b010111000>;
+class SELEQZ_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"seleqz.s", 0, 0b000111000>;
+class SELEQZ_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"seleqz.d", 1, 0b000111000>;
+class SELENZ_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"selenz.s", 0, 0b001111000>;
+class SELENZ_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"selenz.d", 1, 0b001111000>;
+class CLASS_S_MMR6_ENC : POOL32F_CLASS_FM_MMR6<"class.s", 0, 0b001100000>;
+class CLASS_D_MMR6_ENC : POOL32F_CLASS_FM_MMR6<"class.d", 1, 0b001100000>;
+
+class ADDU16_MMR6_ENC : POOL16A_ADDU16_FM_MMR6;
+class AND16_MMR6_ENC : POOL16C_AND16_FM_MMR6;
+class ANDI16_MMR6_ENC : ANDI_FM_MM16<0b001011>, MicroMipsR6Inst16;
+class NOT16_MMR6_ENC : POOL16C_NOT16_FM_MMR6;
+class OR16_MMR6_ENC : POOL16C_OR16_XOR16_FM_MMR6<0b1001>;
+class SLL16_MMR6_ENC : SHIFT_FM_MM16<0>, MicroMipsR6Inst16;
+class SRL16_MMR6_ENC : SHIFT_FM_MM16<1>, MicroMipsR6Inst16;
+class BREAK16_MMR6_ENC : POOL16C_BREAKPOINT_FM_MMR6<0b011011>;
+class LI16_MMR6_ENC : LI_FM_MM16;
+class MOVE16_MMR6_ENC : MOVE_FM_MM16<0b000011>;
+class SDBBP16_MMR6_ENC : POOL16C_BREAKPOINT_FM_MMR6<0b111011>;
+class SUBU16_MMR6_ENC : POOL16A_SUBU16_FM_MMR6;
+class XOR16_MMR6_ENC : POOL16C_OR16_XOR16_FM_MMR6<0b1000>;
class CMP_CBR_RT_Z_MMR6_DESC_BASE<string instr_asm, DAGOperand opnd,
RegisterOperand GPROpnd>
@@ -108,6 +208,43 @@ class BNEZALC_MMR6_DESC : CMP_CBR_RT_Z_MMR6_DESC_BASE<"bnezalc", brtarget_mm,
list<Register> Defs = [RA];
}
+/// Floating Point Instructions
+class FADD_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"add.s", 0, 0b00110000>;
+class FADD_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"add.d", 1, 0b00110000>;
+class FSUB_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"sub.s", 0, 0b01110000>;
+class FSUB_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"sub.d", 1, 0b01110000>;
+class FMUL_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"mul.s", 0, 0b10110000>;
+class FMUL_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"mul.d", 1, 0b10110000>;
+class FDIV_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"div.s", 0, 0b11110000>;
+class FDIV_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"div.d", 1, 0b11110000>;
+class MADDF_S_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"maddf.s", 0, 0b110111000>;
+class MADDF_D_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"maddf.d", 1, 0b110111000>;
+class MSUBF_S_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"msubf.s", 0, 0b111111000>;
+class MSUBF_D_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"msubf.d", 1, 0b111111000>;
+class FMOV_S_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"mov.s", 0, 0b0000001>;
+class FMOV_D_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"mov.d", 1, 0b0000001>;
+class FNEG_S_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"neg.s", 0, 0b0101101>;
+class FNEG_D_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"neg.d", 1, 0b0101101>;
+class MAX_S_MMR6_ENC : POOL32F_MINMAX_FM<"max.s", 0, 0b000001011>;
+class MAX_D_MMR6_ENC : POOL32F_MINMAX_FM<"max.d", 1, 0b000001011>;
+class MAXA_S_MMR6_ENC : POOL32F_MINMAX_FM<"maxa.s", 0, 0b000101011>;
+class MAXA_D_MMR6_ENC : POOL32F_MINMAX_FM<"maxa.d", 1, 0b000101011>;
+class MIN_S_MMR6_ENC : POOL32F_MINMAX_FM<"min.s", 0, 0b000000011>;
+class MIN_D_MMR6_ENC : POOL32F_MINMAX_FM<"min.d", 1, 0b000000011>;
+class MINA_S_MMR6_ENC : POOL32F_MINMAX_FM<"mina.s", 0, 0b000100011>;
+class MINA_D_MMR6_ENC : POOL32F_MINMAX_FM<"mina.d", 1, 0b000100011>;
+
+class CVT_L_S_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.l.s", 0, 0b00000100>;
+class CVT_L_D_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.l.d", 1, 0b00000100>;
+class CVT_W_S_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.w.s", 0, 0b00100100>;
+class CVT_W_D_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.w.d", 1, 0b00100100>;
+class CVT_D_S_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.d.s", 0, 0b1001101>;
+class CVT_D_W_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.d.w", 1, 0b1001101>;
+class CVT_D_L_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.d.l", 2, 0b1001101>;
+class CVT_S_D_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.s.d", 0, 0b1101101>;
+class CVT_S_W_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.s.w", 1, 0b1101101>;
+class CVT_S_L_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.s.l", 2, 0b1101101>;
+
//===----------------------------------------------------------------------===//
//
// Instruction Descriptions
@@ -130,11 +267,34 @@ class BC_MMR6_DESC_BASE<string instr_asm, DAGOperand opnd>
bit isBarrier = 1;
}
-class BALC_MMR6_DESC : BC_MMR6_DESC_BASE<"balc", brtarget26> {
+class BALC_MMR6_DESC : BC_MMR6_DESC_BASE<"balc", brtarget26_mm> {
bit isCall = 1;
list<Register> Defs = [RA];
}
-class BC_MMR6_DESC : BC_MMR6_DESC_BASE<"bc", brtarget26>;
+class BC_MMR6_DESC : BC_MMR6_DESC_BASE<"bc", brtarget26_mm>;
+
+class BC16_MMR6_DESC : MicroMipsInst16<(outs), (ins brtarget10_mm:$offset),
+ !strconcat("bc16", "\t$offset"), [],
+ II_BC, FrmI>,
+ MMR6Arch<"bc16">, MicroMipsR6Inst16 {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let hasDelaySlot = 0;
+ let AdditionalPredicates = [RelocPIC];
+ let Defs = [AT];
+}
+
+class BEQZC_BNEZC_MM16R6_DESC_BASE<string instr_asm>
+ : CBranchZeroMM<instr_asm, brtarget7_mm, GPRMM16Opnd>, MMR6Arch<instr_asm> {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let hasDelaySlot = 0;
+ let Defs = [AT];
+}
+class BEQZC16_MMR6_DESC : BEQZC_BNEZC_MM16R6_DESC_BASE<"beqzc16">;
+class BNEZC16_MMR6_DESC : BEQZC_BNEZC_MM16R6_DESC_BASE<"bnezc16">;
+
class SUB_MMR6_DESC : ArithLogicR<"sub", GPR32Opnd>;
class SUBU_MMR6_DESC : ArithLogicR<"subu", GPR32Opnd>;
@@ -162,6 +322,35 @@ class CACHE_HINT_MMR6_DESC<string instr_asm, Operand MemOpnd,
class CACHE_MMR6_DESC : CACHE_HINT_MMR6_DESC<"cache", mem_mm_12, GPR32Opnd>;
class PREF_MMR6_DESC : CACHE_HINT_MMR6_DESC<"pref", mem_mm_12, GPR32Opnd>;
+class PREFE_CACHEE_MMR6_DESC_BASE<string instr_asm, Operand MemOpnd,
+ RegisterOperand GPROpnd> :
+ CACHE_HINT_MMR6_DESC<instr_asm, MemOpnd,
+ GPROpnd> {
+ string DecoderMethod = "DecodePrefeOpMM";
+}
+
+class PREFE_MMR6_DESC : PREFE_CACHEE_MMR6_DESC_BASE<"prefe", mem_mm_9, GPR32Opnd>;
+class CACHEE_MMR6_DESC : PREFE_CACHEE_MMR6_DESC_BASE<"cachee", mem_mm_9, GPR32Opnd>;
+
+class LB_LBU_MMR6_DESC_BASE<string instr_asm, Operand MemOpnd,
+ RegisterOperand GPROpnd> : MMR6Arch<instr_asm> {
+ dag OutOperandList = (outs GPROpnd:$rt);
+ dag InOperandList = (ins MemOpnd:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ string DecoderMethod = "DecodeLoadByte15";
+ bit mayLoad = 1;
+}
+class LB_MMR6_DESC : LB_LBU_MMR6_DESC_BASE<"lb", mem_mm_16, GPR32Opnd>;
+class LBU_MMR6_DESC : LB_LBU_MMR6_DESC_BASE<"lbu", mem_mm_16, GPR32Opnd>;
+
+class LBE_LBUE_MMR6_DESC_BASE<string instr_asm, Operand MemOpnd,
+ RegisterOperand GPROpnd>
+ : LB_LBU_MMR6_DESC_BASE<instr_asm, MemOpnd, GPROpnd> {
+ let DecoderMethod = "DecodeLoadByte9";
+}
+class LBE_MMR6_DESC : LBE_LBUE_MMR6_DESC_BASE<"lbe", mem_mm_9, GPR32Opnd>;
+class LBUE_MMR6_DESC : LBE_LBUE_MMR6_DESC_BASE<"lbue", mem_mm_9, GPR32Opnd>;
+
class CLO_CLZ_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd>
: MMR6Arch<instr_asm> {
dag OutOperandList = (outs GPROpnd:$rt);
@@ -174,10 +363,22 @@ class CLZ_MMR6_DESC : CLO_CLZ_MMR6_DESC_BASE<"clz", GPR32Opnd>;
class EHB_MMR6_DESC : Barrier<"ehb">;
class EI_MMR6_DESC : DEI_FT<"ei", GPR32Opnd>;
+class DI_MMR6_DESC : DEI_FT<"di", GPR32Opnd>;
class ERET_MMR6_DESC : ER_FT<"eret">;
+class DERET_MMR6_DESC : ER_FT<"deret">;
class ERETNC_MMR6_DESC : ER_FT<"eretnc">;
+class JALRC16_MMR6_DESC_BASE<string opstr, RegisterOperand RO>
+ : MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
+ [(MipsJmpLink RO:$rs)], II_JALR, FrmR>,
+ MMR6Arch<opstr>, MicroMipsR6Inst16 {
+ let isCall = 1;
+ let hasDelaySlot = 0;
+ let Defs = [RA];
+}
+class JALRC16_MMR6_DESC : JALRC16_MMR6_DESC_BASE<"jalr", GPR32Opnd>;
+
class JMP_MMR6_IDX_COMPACT_DESC_BASE<string opstr, DAGOperand opnd,
RegisterOperand GPROpnd>
: MMR6Arch<opstr> {
@@ -200,6 +401,27 @@ class JIC_MMR6_DESC : JMP_MMR6_IDX_COMPACT_DESC_BASE<"jic", jmpoffset16,
list<Register> Defs = [AT];
}
+class JRC16_MMR6_DESC_BASE<string opstr, RegisterOperand RO>
+ : MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
+ [], II_JR, FrmR>,
+ MMR6Arch<opstr>, MicroMipsR6Inst16 {
+ let hasDelaySlot = 0;
+ let isBranch = 1;
+ let isIndirectBranch = 1;
+}
+class JRC16_MMR6_DESC : JRC16_MMR6_DESC_BASE<"jrc16", GPR32Opnd>;
+
+class JRCADDIUSP_MMR6_DESC
+ : MicroMipsInst16<(outs), (ins uimm5_lsl2:$imm), "jrcaddiusp\t$imm",
+ [], II_JRADDIUSP, FrmR>,
+ MMR6Arch<"jrcaddiusp">, MicroMipsR6Inst16 {
+ let hasDelaySlot = 0;
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let isBranch = 1;
+ let isIndirectBranch = 1;
+}
+
class ALIGN_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
Operand ImmOpnd> : MMR6Arch<instr_asm> {
dag OutOperandList = (outs GPROpnd:$rd);
@@ -241,7 +463,7 @@ class LSA_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
list<dag> Pattern = [];
}
-class LSA_MMR6_DESC : LSA_MMR6_DESC_BASE<"lsa", GPR32Opnd, uimm2>;
+class LSA_MMR6_DESC : LSA_MMR6_DESC_BASE<"lsa", GPR32Opnd, uimm2_plus1>;
class PCREL_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
Operand ImmOpnd> : MMR6Arch<instr_asm> {
@@ -264,6 +486,18 @@ class SELEQNE_Z_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd>
class SELEQZ_MMR6_DESC : SELEQNE_Z_MMR6_DESC_BASE<"seleqz", GPR32Opnd>;
class SELNEZ_MMR6_DESC : SELEQNE_Z_MMR6_DESC_BASE<"selnez", GPR32Opnd>;
+class PAUSE_MMR6_DESC : Barrier<"pause">;
+class RDHWR_MMR6_DESC : MMR6Arch<"rdhwr">, MipsR6Inst {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins HWRegsOpnd:$rs, uimm3:$sel);
+ string AsmString = !strconcat("rdhwr", "\t$rt, $rs, $sel");
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = II_RDHWR;
+ Format Form = FrmR;
+}
+
+class WAIT_MMR6_DESC : WaitMM<"wait">;
+class SSNOP_MMR6_DESC : Barrier<"ssnop">;
class SLL_MMR6_DESC : shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL>;
class DIV_MMR6_DESC : ArithLogicR<"div", GPR32Opnd>;
class DIVU_MMR6_DESC : ArithLogicR<"divu", GPR32Opnd>;
@@ -277,13 +511,426 @@ class ORI_MMR6_DESC : ArithLogicI<"ori", simm16, GPR32Opnd>;
class XOR_MMR6_DESC : ArithLogicR<"xor", GPR32Opnd>;
class XORI_MMR6_DESC : ArithLogicI<"xori", simm16, GPR32Opnd>;
+class SWE_MMR6_DESC_BASE<string opstr, DAGOperand RO, DAGOperand MO,
+ SDPatternOperator OpNode = null_frag,
+ InstrItinClass Itin = NoItinerary,
+ ComplexPattern Addr = addr> :
+ InstSE<(outs), (ins RO:$rt, MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
+ [(OpNode RO:$rt, Addr:$addr)], Itin, FrmI, opstr> {
+ let DecoderMethod = "DecodeMem";
+ let mayStore = 1;
+}
+class SW_MMR6_DESC : Store<"sw", GPR32Opnd>;
+class SWE_MMR6_DESC : SWE_MMR6_DESC_BASE<"swe", GPR32Opnd, mem_simm9>;
+
+class WRPGPR_WSBH_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO>
+ : MMR6Arch<instr_asm> {
+ dag InOperandList = (ins RO:$rs);
+ dag OutOperandList = (outs RO:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs");
+ list<dag> Pattern = [];
+ Format f = FrmR;
+ string BaseOpcode = instr_asm;
+ bit hasSideEffects = 0;
+}
+class WRPGPR_MMR6_DESC : WRPGPR_WSBH_MMR6_DESC_BASE<"wrpgpr", GPR32Opnd>;
+class WSBH_MMR6_DESC : WRPGPR_WSBH_MMR6_DESC_BASE<"wsbh", GPR32Opnd>;
+
+/// Floating Point Instructions
+class FARITH_MMR6_DESC_BASE<string instr_asm, RegisterOperand RC,
+ InstrItinClass Itin, bit isComm,
+ SDPatternOperator OpNode = null_frag> : HARDFLOAT {
+ dag OutOperandList = (outs RC:$fd);
+ dag InOperandList = (ins RC:$ft, RC:$fs);
+ string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft");
+ list<dag> Pattern = [(set RC:$fd, (OpNode RC:$fs, RC:$ft))];
+ InstrItinClass Itinerary = Itin;
+ bit isCommutable = isComm;
+}
+class FADD_S_MMR6_DESC
+ : FARITH_MMR6_DESC_BASE<"add.s", FGR32Opnd, II_ADD_S, 1, fadd>;
+class FADD_D_MMR6_DESC
+ : FARITH_MMR6_DESC_BASE<"add.d", AFGR64Opnd, II_ADD_D, 1, fadd>;
+class FSUB_S_MMR6_DESC
+ : FARITH_MMR6_DESC_BASE<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>;
+class FSUB_D_MMR6_DESC
+ : FARITH_MMR6_DESC_BASE<"sub.d", AFGR64Opnd, II_SUB_D, 0, fsub>;
+class FMUL_S_MMR6_DESC
+ : FARITH_MMR6_DESC_BASE<"mul.s", FGR32Opnd, II_MUL_S, 1, fmul>;
+class FMUL_D_MMR6_DESC
+ : FARITH_MMR6_DESC_BASE<"mul.d", AFGR64Opnd, II_MUL_D, 1, fmul>;
+class FDIV_S_MMR6_DESC
+ : FARITH_MMR6_DESC_BASE<"div.s", FGR32Opnd, II_DIV_S, 0, fdiv>;
+class FDIV_D_MMR6_DESC
+ : FARITH_MMR6_DESC_BASE<"div.d", AFGR64Opnd, II_DIV_D, 0, fdiv>;
+class MADDF_S_MMR6_DESC : COP1_4R_DESC_BASE<"maddf.s", FGR32Opnd>, HARDFLOAT;
+class MADDF_D_MMR6_DESC : COP1_4R_DESC_BASE<"maddf.d", FGR64Opnd>, HARDFLOAT;
+class MSUBF_S_MMR6_DESC : COP1_4R_DESC_BASE<"msubf.s", FGR32Opnd>, HARDFLOAT;
+class MSUBF_D_MMR6_DESC : COP1_4R_DESC_BASE<"msubf.d", FGR64Opnd>, HARDFLOAT;
+
+class FMOV_FNEG_MMR6_DESC_BASE<string instr_asm, RegisterOperand DstRC,
+ RegisterOperand SrcRC, InstrItinClass Itin,
+ SDPatternOperator OpNode = null_frag>
+ : HARDFLOAT, NeverHasSideEffects {
+ dag OutOperandList = (outs DstRC:$ft);
+ dag InOperandList = (ins SrcRC:$fs);
+ string AsmString = !strconcat(instr_asm, "\t$ft, $fs");
+ list<dag> Pattern = [(set DstRC:$ft, (OpNode SrcRC:$fs))];
+ InstrItinClass Itinerary = Itin;
+ Format Form = FrmFR;
+}
+class FMOV_S_MMR6_DESC
+ : FMOV_FNEG_MMR6_DESC_BASE<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>;
+class FMOV_D_MMR6_DESC
+ : FMOV_FNEG_MMR6_DESC_BASE<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>;
+class FNEG_S_MMR6_DESC
+ : FMOV_FNEG_MMR6_DESC_BASE<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>;
+class FNEG_D_MMR6_DESC
+ : FMOV_FNEG_MMR6_DESC_BASE<"neg.d", AFGR64Opnd, AFGR64Opnd, II_NEG, fneg>;
+
+class MAX_S_MMR6_DESC : MAX_MIN_DESC_BASE<"max.s", FGR32Opnd>, HARDFLOAT;
+class MAX_D_MMR6_DESC : MAX_MIN_DESC_BASE<"max.d", FGR64Opnd>, HARDFLOAT;
+class MIN_S_MMR6_DESC : MAX_MIN_DESC_BASE<"min.s", FGR32Opnd>, HARDFLOAT;
+class MIN_D_MMR6_DESC : MAX_MIN_DESC_BASE<"min.d", FGR64Opnd>, HARDFLOAT;
+
+class MAXA_S_MMR6_DESC : MAX_MIN_DESC_BASE<"maxa.s", FGR32Opnd>, HARDFLOAT;
+class MAXA_D_MMR6_DESC : MAX_MIN_DESC_BASE<"maxa.d", FGR64Opnd>, HARDFLOAT;
+class MINA_S_MMR6_DESC : MAX_MIN_DESC_BASE<"mina.s", FGR32Opnd>, HARDFLOAT;
+class MINA_D_MMR6_DESC : MAX_MIN_DESC_BASE<"mina.d", FGR64Opnd>, HARDFLOAT;
+
+class CVT_MMR6_DESC_BASE<
+ string instr_asm, RegisterOperand DstRC, RegisterOperand SrcRC,
+ InstrItinClass Itin, SDPatternOperator OpNode = null_frag>
+ : HARDFLOAT, NeverHasSideEffects {
+ dag OutOperandList = (outs DstRC:$ft);
+ dag InOperandList = (ins SrcRC:$fs);
+ string AsmString = !strconcat(instr_asm, "\t$ft, $fs");
+ list<dag> Pattern = [(set DstRC:$ft, (OpNode SrcRC:$fs))];
+ InstrItinClass Itinerary = Itin;
+ Format Form = FrmFR;
+}
+
+class CVT_L_S_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.l.s", FGR64Opnd, FGR32Opnd,
+ II_CVT>;
+class CVT_L_D_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.l.d", FGR64Opnd, FGR64Opnd,
+ II_CVT>;
+class CVT_W_S_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.w.s", FGR32Opnd, FGR32Opnd,
+ II_CVT>;
+class CVT_W_D_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.w.d", FGR32Opnd, AFGR64Opnd,
+ II_CVT>;
+class CVT_D_S_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.d.s", FGR32Opnd, AFGR64Opnd,
+ II_CVT>;
+class CVT_D_W_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.d.w", FGR32Opnd, AFGR64Opnd,
+ II_CVT>;
+class CVT_D_L_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.d.l", FGR64Opnd, FGR64Opnd,
+ II_CVT>, FGR_64;
+class CVT_S_D_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.s.d", AFGR64Opnd, FGR32Opnd,
+ II_CVT>;
+class CVT_S_W_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.s.w", FGR32Opnd, FGR32Opnd,
+ II_CVT>;
+class CVT_S_L_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.s.l", FGR64Opnd, FGR32Opnd,
+ II_CVT>, FGR_64;
+
+multiclass CMP_CC_MMR6<bits<6> format, string Typestr,
+ RegisterOperand FGROpnd> {
+ def CMP_AF_#NAME : POOL32F_CMP_FM<
+ !strconcat("cmp.af.", Typestr), format, FIELD_CMP_COND_AF>,
+ CMP_CONDN_DESC_BASE<"af", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ ISA_MICROMIPS32R6;
+ def CMP_UN_#NAME : POOL32F_CMP_FM<
+ !strconcat("cmp.un.", Typestr), format, FIELD_CMP_COND_UN>,
+ CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ ISA_MICROMIPS32R6;
+ def CMP_EQ_#NAME : POOL32F_CMP_FM<
+ !strconcat("cmp.eq.", Typestr), format, FIELD_CMP_COND_EQ>,
+ CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ ISA_MICROMIPS32R6;
+ def CMP_UEQ_#NAME : POOL32F_CMP_FM<
+ !strconcat("cmp.ueq.", Typestr), format, FIELD_CMP_COND_UEQ>,
+ CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ ISA_MICROMIPS32R6;
+ def CMP_LT_#NAME : POOL32F_CMP_FM<
+ !strconcat("cmp.lt.", Typestr), format, FIELD_CMP_COND_LT>,
+ CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ ISA_MICROMIPS32R6;
+ def CMP_ULT_#NAME : POOL32F_CMP_FM<
+ !strconcat("cmp.ult.", Typestr), format, FIELD_CMP_COND_ULT>,
+ CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ ISA_MICROMIPS32R6;
+ def CMP_LE_#NAME : POOL32F_CMP_FM<
+ !strconcat("cmp.le.", Typestr), format, FIELD_CMP_COND_LE>,
+ CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ ISA_MICROMIPS32R6;
+ def CMP_ULE_#NAME : POOL32F_CMP_FM<
+ !strconcat("cmp.ule.", Typestr), format, FIELD_CMP_COND_ULE>,
+ CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ ISA_MICROMIPS32R6;
+ def CMP_SAF_#NAME : POOL32F_CMP_FM<
+ !strconcat("cmp.saf.", Typestr), format, FIELD_CMP_COND_SAF>,
+ CMP_CONDN_DESC_BASE<"saf", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ ISA_MICROMIPS32R6;
+ def CMP_SUN_#NAME : POOL32F_CMP_FM<
+ !strconcat("cmp.sun.", Typestr), format, FIELD_CMP_COND_SUN>,
+ CMP_CONDN_DESC_BASE<"sun", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ ISA_MICROMIPS32R6;
+ def CMP_SEQ_#NAME : POOL32F_CMP_FM<
+ !strconcat("cmp.seq.", Typestr), format, FIELD_CMP_COND_SEQ>,
+ CMP_CONDN_DESC_BASE<"seq", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ ISA_MICROMIPS32R6;
+ def CMP_SUEQ_#NAME : POOL32F_CMP_FM<
+ !strconcat("cmp.sueq.", Typestr), format, FIELD_CMP_COND_SUEQ>,
+ CMP_CONDN_DESC_BASE<"sueq", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ ISA_MICROMIPS32R6;
+ def CMP_SLT_#NAME : POOL32F_CMP_FM<
+ !strconcat("cmp.slt.", Typestr), format, FIELD_CMP_COND_SLT>,
+ CMP_CONDN_DESC_BASE<"slt", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ ISA_MICROMIPS32R6;
+ def CMP_SULT_#NAME : POOL32F_CMP_FM<
+ !strconcat("cmp.sult.", Typestr), format, FIELD_CMP_COND_SULT>,
+ CMP_CONDN_DESC_BASE<"sult", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ ISA_MICROMIPS32R6;
+ def CMP_SLE_#NAME : POOL32F_CMP_FM<
+ !strconcat("cmp.sle.", Typestr), format, FIELD_CMP_COND_SLE>,
+ CMP_CONDN_DESC_BASE<"sle", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ ISA_MICROMIPS32R6;
+ def CMP_SULE_#NAME : POOL32F_CMP_FM<
+ !strconcat("cmp.sule.", Typestr), format, FIELD_CMP_COND_SULE>,
+ CMP_CONDN_DESC_BASE<"sule", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel,
+ ISA_MICROMIPS32R6;
+}
+
+class ABSS_FT_MMR6_DESC_BASE<string instr_asm, RegisterOperand DstRC,
+ RegisterOperand SrcRC, InstrItinClass Itin,
+ SDPatternOperator OpNode = null_frag>
+ : HARDFLOAT, NeverHasSideEffects {
+ dag OutOperandList = (outs DstRC:$ft);
+ dag InOperandList = (ins SrcRC:$fs);
+ string AsmString = !strconcat(instr_asm, "\t$ft, $fs");
+ list<dag> Pattern = [(set DstRC:$ft, (OpNode SrcRC:$fs))];
+ InstrItinClass Itinerary = Itin;
+ Format Form = FrmFR;
+ list<Predicate> EncodingPredicates = [HasStdEnc];
+}
+
+class ABS_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"abs.s", FGR32Opnd, FGR32Opnd,
+ II_ABS, fabs>;
+class ABS_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"abs.d", AFGR64Opnd, AFGR64Opnd,
+ II_ABS, fabs>;
+class FLOOR_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.l.s", FGR64Opnd,
+ FGR32Opnd, II_FLOOR>;
+class FLOOR_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.l.d", FGR64Opnd,
+ FGR64Opnd, II_FLOOR>;
+class FLOOR_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.w.s", FGR32Opnd,
+ FGR32Opnd, II_FLOOR>;
+class FLOOR_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.w.d", FGR32Opnd,
+ AFGR64Opnd, II_FLOOR>;
+class CEIL_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"ceil.l.s", FGR64Opnd,
+ FGR32Opnd, II_CEIL>;
+class CEIL_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"ceil.l.d", FGR64Opnd,
+ FGR64Opnd, II_CEIL>;
+class CEIL_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"ceil.w.s", FGR32Opnd,
+ FGR32Opnd, II_CEIL>;
+class CEIL_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"ceil.w.d", FGR32Opnd,
+ AFGR64Opnd, II_CEIL>;
+class TRUNC_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.l.s", FGR64Opnd,
+ FGR32Opnd, II_TRUNC>;
+class TRUNC_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.l.d", FGR64Opnd,
+ FGR64Opnd, II_TRUNC>;
+class TRUNC_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.w.s", FGR32Opnd,
+ FGR32Opnd, II_TRUNC>;
+class TRUNC_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.w.d", FGR32Opnd,
+ AFGR64Opnd, II_TRUNC>;
+class SQRT_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"sqrt.s", FGR32Opnd, FGR32Opnd,
+ II_SQRT_S, fsqrt>;
+class SQRT_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"sqrt.d", AFGR64Opnd, AFGR64Opnd,
+ II_SQRT_D, fsqrt>;
+class RSQRT_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"rsqrt.s", FGR32Opnd,
+ FGR32Opnd, II_TRUNC>;
+class RSQRT_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"rsqrt.d", FGR32Opnd,
+ AFGR64Opnd, II_TRUNC>;
+class RECIP_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"recip.s", FGR32Opnd,
+ FGR32Opnd, II_ROUND>;
+class RECIP_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"recip.d", FGR32Opnd, FGR32Opnd,
+ II_ROUND>;
+class ROUND_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.l.s", FGR64Opnd,
+ FGR32Opnd, II_ROUND>;
+class ROUND_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.l.d", FGR64Opnd,
+ FGR64Opnd, II_ROUND>;
+class ROUND_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.w.s", FGR32Opnd,
+ FGR32Opnd, II_ROUND>;
+class ROUND_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.w.d", FGR64Opnd,
+ FGR64Opnd, II_ROUND>;
+
+class SEL_S_MMR6_DESC : COP1_SEL_DESC_BASE<"sel.s", FGR32Opnd>;
+class SEL_D_MMR6_DESC : COP1_SEL_DESC_BASE<"sel.d", FGR64Opnd> {
+ // We must insert a SUBREG_TO_REG around $fd_in
+ bit usesCustomInserter = 1;
+}
+
+class SELEQZ_S_MMR6_DESC : SELEQNEZ_DESC_BASE<"seleqz.s", FGR32Opnd>;
+class SELEQZ_D_MMR6_DESC : SELEQNEZ_DESC_BASE<"seleqz.d", FGR64Opnd>;
+class SELENZ_S_MMR6_DESC : SELEQNEZ_DESC_BASE<"selnez.s", FGR32Opnd>;
+class SELENZ_D_MMR6_DESC : SELEQNEZ_DESC_BASE<"selnez.d", FGR64Opnd>;
+class RINT_S_MMR6_DESC : CLASS_RINT_DESC_BASE<"rint.s", FGR32Opnd>;
+class RINT_D_MMR6_DESC : CLASS_RINT_DESC_BASE<"rint.d", FGR64Opnd>;
+class CLASS_S_MMR6_DESC : CLASS_RINT_DESC_BASE<"class.s", FGR32Opnd>;
+class CLASS_D_MMR6_DESC : CLASS_RINT_DESC_BASE<"class.d", FGR64Opnd>;
+
+class STORE_MMR6_DESC_BASE<string opstr, DAGOperand RO>
+ : Store<opstr, RO>, MMR6Arch<opstr> {
+ let DecoderMethod = "DecodeMemMMImm16";
+}
+class SB_MMR6_DESC : STORE_MMR6_DESC_BASE<"sb", GPR32Opnd>;
+
+class STORE_EVA_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins RO:$rt, mem_mm_9:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ string DecoderMethod = "DecodeStoreEvaOpMM";
+ bit mayStore = 1;
+}
+class SBE_MMR6_DESC : STORE_EVA_MMR6_DESC_BASE<"sbe", GPR32Opnd>;
+class SCE_MMR6_DESC : STORE_EVA_MMR6_DESC_BASE<"sce", GPR32Opnd>;
+class SH_MMR6_DESC : STORE_MMR6_DESC_BASE<"sh", GPR32Opnd>;
+class SHE_MMR6_DESC : STORE_EVA_MMR6_DESC_BASE<"she", GPR32Opnd>;
+class LOAD_WORD_EVA_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO> :
+ MMR6Arch<instr_asm>, MipsR6Inst {
+ dag OutOperandList = (outs RO:$rt);
+ dag InOperandList = (ins mem_mm_12:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ string DecoderMethod = "DecodeMemMMImm9";
+ bit mayLoad = 1;
+}
+class LLE_MMR6_DESC : LOAD_WORD_EVA_MMR6_DESC_BASE<"lle", GPR32Opnd>;
+class LWE_MMR6_DESC : LOAD_WORD_EVA_MMR6_DESC_BASE<"lwe", GPR32Opnd>;
+class ADDU16_MMR6_DESC : ArithRMM16<"addu16", GPRMM16Opnd, 1, II_ADDU, add>,
+ MMR6Arch<"addu16">;
+class AND16_MMR6_DESC : LogicRMM16<"and16", GPRMM16Opnd, II_AND, and>,
+ MMR6Arch<"and16">;
+class ANDI16_MMR6_DESC : AndImmMM16<"andi16", GPRMM16Opnd, II_AND>,
+ MMR6Arch<"andi16">;
+class NOT16_MMR6_DESC : NotMM16<"not16", GPRMM16Opnd>, MMR6Arch<"not16">;
+class OR16_MMR6_DESC : LogicRMM16<"or16", GPRMM16Opnd, II_OR, or>,
+ MMR6Arch<"or16">;
+class SLL16_MMR6_DESC : ShiftIMM16<"sll16", uimm3_shift, GPRMM16Opnd, II_SLL>,
+ MMR6Arch<"sll16">;
+class SRL16_MMR6_DESC : ShiftIMM16<"srl16", uimm3_shift, GPRMM16Opnd, II_SRL>,
+ MMR6Arch<"srl16">;
+class BREAK16_MMR6_DESC : BrkSdbbp16MM<"break16">, MMR6Arch<"break16">,
+ MicroMipsR6Inst16;
+class LI16_MMR6_DESC : LoadImmMM16<"li16", li_simm7, GPRMM16Opnd>,
+ MMR6Arch<"li16">, MicroMipsR6Inst16, IsAsCheapAsAMove;
+class MOVE16_MMR6_DESC : MoveMM16<"move16", GPR32Opnd>, MMR6Arch<"move16">,
+ MicroMipsR6Inst16;
+class SDBBP16_MMR6_DESC : BrkSdbbp16MM<"sdbbp16">, MMR6Arch<"sdbbp16">,
+ MicroMipsR6Inst16;
+class SUBU16_MMR6_DESC : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>,
+ MMR6Arch<"subu16">, MicroMipsR6Inst16;
+class XOR16_MMR6_DESC : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR, xor>,
+ MMR6Arch<"xor16">, MicroMipsR6Inst16;
+
+class LW_MMR6_DESC : MMR6Arch<"lw">, MipsR6Inst {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins mem:$addr);
+ string AsmString = "lw\t$rt, $addr";
+ let DecoderMethod = "DecodeMemMMImm16";
+ let canFoldAsLoad = 1;
+ let mayLoad = 1;
+ list<dag> Pattern = [(set GPR32Opnd:$rt, (load addrDefault:$addr))];
+ InstrItinClass Itinerary = II_LW;
+}
+
+class LUI_MMR6_DESC : IsAsCheapAsAMove, MMR6Arch<"lui">, MipsR6Inst {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins uimm16:$imm16);
+ string AsmString = "lui\t$rt, $imm16";
+ list<dag> Pattern = [];
+ bit hasSideEffects = 0;
+ bit isReMaterializable = 1;
+ InstrItinClass Itinerary = II_LUI;
+ Format Form = FrmI;
+}
+
+class SYNC_MMR6_DESC : MMR6Arch<"sync">, MipsR6Inst {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins i32imm:$stype);
+ string AsmString = !strconcat("sync", "\t$stype");
+ list<dag> Pattern = [(MipsSync imm:$stype)];
+ InstrItinClass Itinerary = NoItinerary;
+ bit hasSideEffects = 1;
+}
+
+class SYNCI_MMR6_DESC : SYNCI_FT<"synci"> {
+ let DecoderMethod = "DecodeSynciR6";
+}
+
+class RDPGPR_MMR6_DESC : MMR6Arch<"rdpgpr">, MipsR6Inst {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins GPR32Opnd:$rd);
+ string AsmString = !strconcat("rdpgpr", "\t$rt, $rd");
+}
+
+class SDBBP_MMR6_DESC : MipsR6Inst {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins uimm20:$code_);
+ string AsmString = !strconcat("sdbbp", "\t$code_");
+ list<dag> Pattern = [];
+}
+
+class LWM16_MMR6_DESC
+ : MicroMipsInst16<(outs reglist16:$rt), (ins mem_mm_4sp:$addr),
+ !strconcat("lwm16", "\t$rt, $addr"), [],
+ NoItinerary, FrmI>,
+ MMR6Arch<"lwm16">, MicroMipsR6Inst16 {
+ let DecoderMethod = "DecodeMemMMReglistImm4Lsl2";
+ let mayLoad = 1;
+ InstrItinClass Itin = NoItinerary;
+ ComplexPattern Addr = addr;
+}
+
+class SWM16_MMR6_DESC
+ : MicroMipsInst16<(outs), (ins reglist16:$rt, mem_mm_4sp:$addr),
+ !strconcat("swm16", "\t$rt, $addr"), [],
+ NoItinerary, FrmI>,
+ MMR6Arch<"swm16">, MicroMipsR6Inst16 {
+ let DecoderMethod = "DecodeMemMMReglistImm4Lsl2";
+ let mayStore = 1;
+ InstrItinClass Itin = NoItinerary;
+ ComplexPattern Addr = addr;
+}
+
+class SB16_MMR6_DESC_BASE<string opstr, DAGOperand RTOpnd, DAGOperand RO,
+ SDPatternOperator OpNode, InstrItinClass Itin,
+ Operand MemOpnd>
+ : MicroMipsInst16<(outs), (ins RTOpnd:$rt, MemOpnd:$addr),
+ !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI>,
+ MMR6Arch<opstr>, MicroMipsR6Inst16 {
+ let DecoderMethod = "DecodeMemMMImm4";
+ let mayStore = 1;
+}
+class SB16_MMR6_DESC : SB16_MMR6_DESC_BASE<"sb16", GPRMM16OpndZero, GPRMM16Opnd,
+ truncstorei8, II_SB, mem_mm_4>;
+class SH16_MMR6_DESC : SB16_MMR6_DESC_BASE<"sh16", GPRMM16OpndZero, GPRMM16Opnd,
+ truncstorei16, II_SH, mem_mm_4_lsl1>;
+class SW16_MMR6_DESC : SB16_MMR6_DESC_BASE<"sw16", GPRMM16OpndZero, GPRMM16Opnd,
+ store, II_SW, mem_mm_4_lsl2>;
+
+class SWSP_MMR6_DESC
+ : MicroMipsInst16<(outs), (ins GPR32Opnd:$rt, mem_mm_sp_imm5_lsl2:$offset),
+ !strconcat("sw", "\t$rt, $offset"), [], II_SW, FrmI>,
+ MMR6Arch<"sw">, MicroMipsR6Inst16 {
+ let DecoderMethod = "DecodeMemMMSPImm5Lsl2";
+ let mayStore = 1;
+}
+
//===----------------------------------------------------------------------===//
//
// Instruction Definitions
//
//===----------------------------------------------------------------------===//
-let DecoderNamespace = "MicroMips32r6" in {
+let DecoderNamespace = "MicroMipsR6" in {
def ADD_MMR6 : StdMMR6Rel, ADD_MMR6_DESC, ADD_MMR6_ENC, ISA_MICROMIPS32R6;
def ADDIU_MMR6 : StdMMR6Rel, ADDIU_MMR6_DESC, ADDIU_MMR6_ENC, ISA_MICROMIPS32R6;
def ADDU_MMR6 : StdMMR6Rel, ADDU_MMR6_DESC, ADDU_MMR6_ENC, ISA_MICROMIPS32R6;
@@ -298,6 +945,11 @@ def ALIGN_MMR6 : R6MMR6Rel, ALIGN_MMR6_ENC, ALIGN_MMR6_DESC, ISA_MICROMIPS32R6;
def AUI_MMR6 : R6MMR6Rel, AUI_MMR6_ENC, AUI_MMR6_DESC, ISA_MICROMIPS32R6;
def BALC_MMR6 : R6MMR6Rel, BALC_MMR6_ENC, BALC_MMR6_DESC, ISA_MICROMIPS32R6;
def BC_MMR6 : R6MMR6Rel, BC_MMR6_ENC, BC_MMR6_DESC, ISA_MICROMIPS32R6;
+def BC16_MMR6 : StdMMR6Rel, BC16_MMR6_DESC, BC16_MMR6_ENC, ISA_MICROMIPS32R6;
+def BEQZC16_MMR6 : StdMMR6Rel, BEQZC16_MMR6_DESC, BEQZC16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def BNEZC16_MMR6 : StdMMR6Rel, BNEZC16_MMR6_DESC, BNEZC16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
def BITSWAP_MMR6 : R6MMR6Rel, BITSWAP_MMR6_ENC, BITSWAP_MMR6_DESC,
ISA_MICROMIPS32R6;
def BEQZALC_MMR6 : R6MMR6Rel, BEQZALC_MMR6_ENC, BEQZALC_MMR6_DESC,
@@ -320,13 +972,21 @@ def DIV_MMR6 : R6MMR6Rel, DIV_MMR6_DESC, DIV_MMR6_ENC, ISA_MICROMIPS32R6;
def DIVU_MMR6 : R6MMR6Rel, DIVU_MMR6_DESC, DIVU_MMR6_ENC, ISA_MICROMIPS32R6;
def EHB_MMR6 : StdMMR6Rel, EHB_MMR6_DESC, EHB_MMR6_ENC, ISA_MICROMIPS32R6;
def EI_MMR6 : StdMMR6Rel, EI_MMR6_DESC, EI_MMR6_ENC, ISA_MICROMIPS32R6;
-def ERET_MMR6 : R6MMR6Rel, ERET_MMR6_DESC, ERET_MMR6_ENC, ISA_MICROMIPS32R6;
+def DI_MMR6 : StdMMR6Rel, DI_MMR6_DESC, DI_MMR6_ENC, ISA_MICROMIPS32R6;
+def ERET_MMR6 : StdMMR6Rel, ERET_MMR6_DESC, ERET_MMR6_ENC, ISA_MICROMIPS32R6;
+def DERET_MMR6 : StdMMR6Rel, DERET_MMR6_DESC, DERET_MMR6_ENC, ISA_MICROMIPS32R6;
def ERETNC_MMR6 : R6MMR6Rel, ERETNC_MMR6_DESC, ERETNC_MMR6_ENC,
ISA_MICROMIPS32R6;
+def JALRC16_MMR6 : R6MMR6Rel, JALRC16_MMR6_DESC, JALRC16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
def JIALC_MMR6 : R6MMR6Rel, JIALC_MMR6_ENC, JIALC_MMR6_DESC, ISA_MICROMIPS32R6;
def JIC_MMR6 : R6MMR6Rel, JIC_MMR6_ENC, JIC_MMR6_DESC, ISA_MICROMIPS32R6;
+def JRC16_MMR6 : R6MMR6Rel, JRC16_MMR6_DESC, JRC16_MMR6_ENC, ISA_MICROMIPS32R6;
+def JRCADDIUSP_MMR6 : R6MMR6Rel, JRCADDIUSP_MMR6_DESC, JRCADDIUSP_MMR6_ENC,
+ ISA_MICROMIPS32R6;
def LSA_MMR6 : R6MMR6Rel, LSA_MMR6_ENC, LSA_MMR6_DESC, ISA_MICROMIPS32R6;
def LWPC_MMR6 : R6MMR6Rel, LWPC_MMR6_ENC, LWPC_MMR6_DESC, ISA_MICROMIPS32R6;
+def LWM16_MMR6 : StdMMR6Rel, LWM16_MMR6_DESC, LWM16_MMR6_ENC, ISA_MICROMIPS32R6;
def MOD_MMR6 : R6MMR6Rel, MOD_MMR6_DESC, MOD_MMR6_ENC, ISA_MICROMIPS32R6;
def MODU_MMR6 : R6MMR6Rel, MODU_MMR6_DESC, MODU_MMR6_ENC, ISA_MICROMIPS32R6;
def MUL_MMR6 : R6MMR6Rel, MUL_MMR6_DESC, MUL_MMR6_ENC, ISA_MICROMIPS32R6;
@@ -337,17 +997,211 @@ def NOR_MMR6 : StdMMR6Rel, NOR_MMR6_DESC, NOR_MMR6_ENC, ISA_MICROMIPS32R6;
def OR_MMR6 : StdMMR6Rel, OR_MMR6_DESC, OR_MMR6_ENC, ISA_MICROMIPS32R6;
def ORI_MMR6 : StdMMR6Rel, ORI_MMR6_DESC, ORI_MMR6_ENC, ISA_MICROMIPS32R6;
def PREF_MMR6 : R6MMR6Rel, PREF_MMR6_ENC, PREF_MMR6_DESC, ISA_MICROMIPS32R6;
+def SB16_MMR6 : StdMMR6Rel, SB16_MMR6_DESC, SB16_MMR6_ENC, ISA_MICROMIPS32R6;
def SEB_MMR6 : StdMMR6Rel, SEB_MMR6_DESC, SEB_MMR6_ENC, ISA_MICROMIPS32R6;
def SEH_MMR6 : StdMMR6Rel, SEH_MMR6_DESC, SEH_MMR6_ENC, ISA_MICROMIPS32R6;
def SELEQZ_MMR6 : R6MMR6Rel, SELEQZ_MMR6_ENC, SELEQZ_MMR6_DESC,
ISA_MICROMIPS32R6;
def SELNEZ_MMR6 : R6MMR6Rel, SELNEZ_MMR6_ENC, SELNEZ_MMR6_DESC,
ISA_MICROMIPS32R6;
+def SH16_MMR6 : StdMMR6Rel, SH16_MMR6_DESC, SH16_MMR6_ENC, ISA_MICROMIPS32R6;
def SLL_MMR6 : StdMMR6Rel, SLL_MMR6_DESC, SLL_MMR6_ENC, ISA_MICROMIPS32R6;
def SUB_MMR6 : StdMMR6Rel, SUB_MMR6_DESC, SUB_MMR6_ENC, ISA_MICROMIPS32R6;
def SUBU_MMR6 : StdMMR6Rel, SUBU_MMR6_DESC, SUBU_MMR6_ENC, ISA_MICROMIPS32R6;
+def SW16_MMR6 : StdMMR6Rel, SW16_MMR6_DESC, SW16_MMR6_ENC, ISA_MICROMIPS32R6;
+def SWM16_MMR6 : StdMMR6Rel, SWM16_MMR6_DESC, SWM16_MMR6_ENC, ISA_MICROMIPS32R6;
+def SWSP_MMR6 : StdMMR6Rel, SWSP_MMR6_DESC, SWSP_MMR6_ENC, ISA_MICROMIPS32R6;
+def PREFE_MMR6 : StdMMR6Rel, PREFE_MMR6_ENC, PREFE_MMR6_DESC, ISA_MICROMIPS32R6;
+def CACHEE_MMR6 : StdMMR6Rel, CACHEE_MMR6_ENC, CACHEE_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def WRPGPR_MMR6 : StdMMR6Rel, WRPGPR_MMR6_ENC, WRPGPR_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def WSBH_MMR6 : StdMMR6Rel, WSBH_MMR6_ENC, WSBH_MMR6_DESC, ISA_MICROMIPS32R6;
+def LB_MMR6 : R6MMR6Rel, LB_MMR6_ENC, LB_MMR6_DESC, ISA_MICROMIPS32R6;
+def LBU_MMR6 : R6MMR6Rel, LBU_MMR6_ENC, LBU_MMR6_DESC, ISA_MICROMIPS32R6;
+def LBE_MMR6 : R6MMR6Rel, LBE_MMR6_ENC, LBE_MMR6_DESC, ISA_MICROMIPS32R6;
+def LBUE_MMR6 : R6MMR6Rel, LBUE_MMR6_ENC, LBUE_MMR6_DESC, ISA_MICROMIPS32R6;
+def PAUSE_MMR6 : StdMMR6Rel, PAUSE_MMR6_DESC, PAUSE_MMR6_ENC, ISA_MICROMIPS32R6;
+def RDHWR_MMR6 : R6MMR6Rel, RDHWR_MMR6_DESC, RDHWR_MMR6_ENC, ISA_MICROMIPS32R6;
+def WAIT_MMR6 : StdMMR6Rel, WAIT_MMR6_DESC, WAIT_MMR6_ENC, ISA_MICROMIPS32R6;
+def SSNOP_MMR6 : StdMMR6Rel, SSNOP_MMR6_DESC, SSNOP_MMR6_ENC, ISA_MICROMIPS32R6;
+def SYNC_MMR6 : StdMMR6Rel, SYNC_MMR6_DESC, SYNC_MMR6_ENC, ISA_MICROMIPS32R6;
+def SYNCI_MMR6 : StdMMR6Rel, SYNCI_MMR6_DESC, SYNCI_MMR6_ENC, ISA_MICROMIPS32R6;
+def RDPGPR_MMR6 : R6MMR6Rel, RDPGPR_MMR6_DESC, RDPGPR_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def SDBBP_MMR6 : R6MMR6Rel, SDBBP_MMR6_DESC, SDBBP_MMR6_ENC, ISA_MICROMIPS32R6;
def XOR_MMR6 : StdMMR6Rel, XOR_MMR6_DESC, XOR_MMR6_ENC, ISA_MICROMIPS32R6;
def XORI_MMR6 : StdMMR6Rel, XORI_MMR6_DESC, XORI_MMR6_ENC, ISA_MICROMIPS32R6;
+let DecoderMethod = "DecodeMemMMImm16" in {
+ def SW_MMR6 : StdMMR6Rel, SW_MMR6_DESC, SW_MMR6_ENC, ISA_MICROMIPS32R6;
+}
+let DecoderMethod = "DecodeMemMMImm9" in {
+ def SWE_MMR6 : StdMMR6Rel, SWE_MMR6_DESC, SWE_MMR6_ENC, ISA_MICROMIPS32R6;
+}
+/// Floating Point Instructions
+def FADD_S_MMR6 : StdMMR6Rel, FADD_S_MMR6_ENC, FADD_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FADD_D_MMR6 : StdMMR6Rel, FADD_D_MMR6_ENC, FADD_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FSUB_S_MMR6 : StdMMR6Rel, FSUB_S_MMR6_ENC, FSUB_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FSUB_D_MMR6 : StdMMR6Rel, FSUB_D_MMR6_ENC, FSUB_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FMUL_S_MMR6 : StdMMR6Rel, FMUL_S_MMR6_ENC, FMUL_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FMUL_D_MMR6 : StdMMR6Rel, FMUL_D_MMR6_ENC, FMUL_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FDIV_S_MMR6 : StdMMR6Rel, FDIV_S_MMR6_ENC, FDIV_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FDIV_D_MMR6 : StdMMR6Rel, FDIV_D_MMR6_ENC, FDIV_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def MADDF_S_MMR6 : R6MMR6Rel, MADDF_S_MMR6_ENC, MADDF_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def MADDF_D_MMR6 : R6MMR6Rel, MADDF_D_MMR6_ENC, MADDF_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def MSUBF_S_MMR6 : R6MMR6Rel, MSUBF_S_MMR6_ENC, MSUBF_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def MSUBF_D_MMR6 : R6MMR6Rel, MSUBF_D_MMR6_ENC, MSUBF_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FMOV_S_MMR6 : StdMMR6Rel, FMOV_S_MMR6_ENC, FMOV_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FMOV_D_MMR6 : StdMMR6Rel, FMOV_D_MMR6_ENC, FMOV_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FNEG_S_MMR6 : StdMMR6Rel, FNEG_S_MMR6_ENC, FNEG_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FNEG_D_MMR6 : StdMMR6Rel, FNEG_D_MMR6_ENC, FNEG_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def MAX_S_MMR6 : R6MMR6Rel, MAX_S_MMR6_ENC, MAX_S_MMR6_DESC, ISA_MICROMIPS32R6;
+def MAX_D_MMR6 : R6MMR6Rel, MAX_D_MMR6_ENC, MAX_D_MMR6_DESC, ISA_MICROMIPS32R6;
+def MIN_S_MMR6 : R6MMR6Rel, MIN_S_MMR6_ENC, MIN_S_MMR6_DESC, ISA_MICROMIPS32R6;
+def MIN_D_MMR6 : R6MMR6Rel, MIN_D_MMR6_ENC, MIN_D_MMR6_DESC, ISA_MICROMIPS32R6;
+def MAXA_S_MMR6 : R6MMR6Rel, MAXA_S_MMR6_ENC, MAXA_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def MAXA_D_MMR6 : R6MMR6Rel, MAXA_D_MMR6_ENC, MAXA_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def MINA_S_MMR6 : R6MMR6Rel, MINA_S_MMR6_ENC, MINA_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def MINA_D_MMR6 : R6MMR6Rel, MINA_D_MMR6_ENC, MINA_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CVT_L_S_MMR6 : StdMMR6Rel, CVT_L_S_MMR6_ENC, CVT_L_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CVT_L_D_MMR6 : StdMMR6Rel, CVT_L_D_MMR6_ENC, CVT_L_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CVT_W_S_MMR6 : StdMMR6Rel, CVT_W_S_MMR6_ENC, CVT_W_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CVT_W_D_MMR6 : StdMMR6Rel, CVT_W_D_MMR6_ENC, CVT_W_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CVT_D_S_MMR6 : StdMMR6Rel, CVT_D_S_MMR6_ENC, CVT_D_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CVT_D_W_MMR6 : StdMMR6Rel, CVT_D_W_MMR6_ENC, CVT_D_W_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CVT_D_L_MMR6 : StdMMR6Rel, CVT_D_L_MMR6_ENC, CVT_D_L_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CVT_S_D_MMR6 : StdMMR6Rel, CVT_S_D_MMR6_ENC, CVT_S_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CVT_S_W_MMR6 : StdMMR6Rel, CVT_S_W_MMR6_ENC, CVT_S_W_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CVT_S_L_MMR6 : StdMMR6Rel, CVT_S_L_MMR6_ENC, CVT_S_L_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+defm S_MMR6 : CMP_CC_MMR6<0b000101, "s", FGR32Opnd>;
+defm D_MMR6 : CMP_CC_MMR6<0b010101, "d", FGR64Opnd>;
+def ABS_S_MMR6 : StdMMR6Rel, ABS_S_MMR6_ENC, ABS_S_MMR6_DESC, ISA_MICROMIPS32R6;
+def ABS_D_MMR6 : StdMMR6Rel, ABS_D_MMR6_ENC, ABS_D_MMR6_DESC, ISA_MICROMIPS32R6;
+def FLOOR_L_S_MMR6 : StdMMR6Rel, FLOOR_L_S_MMR6_ENC, FLOOR_L_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FLOOR_L_D_MMR6 : StdMMR6Rel, FLOOR_L_D_MMR6_ENC, FLOOR_L_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FLOOR_W_S_MMR6 : StdMMR6Rel, FLOOR_W_S_MMR6_ENC, FLOOR_W_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def FLOOR_W_D_MMR6 : StdMMR6Rel, FLOOR_W_D_MMR6_ENC, FLOOR_W_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CEIL_L_S_MMR6 : StdMMR6Rel, CEIL_L_S_MMR6_ENC, CEIL_L_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CEIL_L_D_MMR6 : StdMMR6Rel, CEIL_L_D_MMR6_ENC, CEIL_L_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CEIL_W_S_MMR6 : StdMMR6Rel, CEIL_W_S_MMR6_ENC, CEIL_W_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CEIL_W_D_MMR6 : StdMMR6Rel, CEIL_W_D_MMR6_ENC, CEIL_W_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def TRUNC_L_S_MMR6 : StdMMR6Rel, TRUNC_L_S_MMR6_ENC, TRUNC_L_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def TRUNC_L_D_MMR6 : StdMMR6Rel, TRUNC_L_D_MMR6_ENC, TRUNC_L_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def TRUNC_W_S_MMR6 : StdMMR6Rel, TRUNC_W_S_MMR6_ENC, TRUNC_W_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def TRUNC_W_D_MMR6 : StdMMR6Rel, TRUNC_W_D_MMR6_ENC, TRUNC_W_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def SQRT_S_MMR6 : StdMMR6Rel, SQRT_S_MMR6_ENC, SQRT_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def SQRT_D_MMR6 : StdMMR6Rel, SQRT_D_MMR6_ENC, SQRT_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def RSQRT_S_MMR6 : StdMMR6Rel, RSQRT_S_MMR6_ENC, RSQRT_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def RSQRT_D_MMR6 : StdMMR6Rel, RSQRT_D_MMR6_ENC, RSQRT_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def SB_MMR6 : StdMMR6Rel, SB_MMR6_DESC, SB_MMR6_ENC, ISA_MICROMIPS32R6;
+def SBE_MMR6 : StdMMR6Rel, SBE_MMR6_DESC, SBE_MMR6_ENC, ISA_MICROMIPS32R6;
+def SCE_MMR6 : StdMMR6Rel, SCE_MMR6_DESC, SCE_MMR6_ENC, ISA_MICROMIPS32R6;
+def SH_MMR6 : StdMMR6Rel, SH_MMR6_DESC, SH_MMR6_ENC, ISA_MICROMIPS32R6;
+def SHE_MMR6 : StdMMR6Rel, SHE_MMR6_DESC, SHE_MMR6_ENC, ISA_MICROMIPS32R6;
+def LLE_MMR6 : StdMMR6Rel, LLE_MMR6_DESC, LLE_MMR6_ENC, ISA_MICROMIPS32R6;
+def LWE_MMR6 : StdMMR6Rel, LWE_MMR6_DESC, LWE_MMR6_ENC, ISA_MICROMIPS32R6;
+def LW_MMR6 : StdMMR6Rel, LW_MMR6_DESC, LW_MMR6_ENC, ISA_MICROMIPS32R6;
+def LUI_MMR6 : R6MMR6Rel, LUI_MMR6_DESC, LUI_MMR6_ENC, ISA_MICROMIPS32R6;
+def ADDU16_MMR6 : StdMMR6Rel, ADDU16_MMR6_DESC, ADDU16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def AND16_MMR6 : StdMMR6Rel, AND16_MMR6_DESC, AND16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def ANDI16_MMR6 : StdMMR6Rel, ANDI16_MMR6_DESC, ANDI16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def NOT16_MMR6 : StdMMR6Rel, NOT16_MMR6_DESC, NOT16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def OR16_MMR6 : StdMMR6Rel, OR16_MMR6_DESC, OR16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def SLL16_MMR6 : StdMMR6Rel, SLL16_MMR6_DESC, SLL16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def SRL16_MMR6 : StdMMR6Rel, SRL16_MMR6_DESC, SRL16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def BREAK16_MMR6 : StdMMR6Rel, BREAK16_MMR6_DESC, BREAK16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def LI16_MMR6 : StdMMR6Rel, LI16_MMR6_DESC, LI16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def MOVE16_MMR6 : StdMMR6Rel, MOVE16_MMR6_DESC, MOVE16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def SDBBP16_MMR6 : StdMMR6Rel, SDBBP16_MMR6_DESC, SDBBP16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def SUBU16_MMR6 : StdMMR6Rel, SUBU16_MMR6_DESC, SUBU16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def XOR16_MMR6 : StdMMR6Rel, XOR16_MMR6_DESC, XOR16_MMR6_ENC,
+ ISA_MICROMIPS32R6;
+def RECIP_S_MMR6 : StdMMR6Rel, RECIP_S_MMR6_ENC, RECIP_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def RECIP_D_MMR6 : StdMMR6Rel, RECIP_D_MMR6_ENC, RECIP_D_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def RINT_S_MMR6 : StdMMR6Rel, RINT_S_MMR6_ENC, RINT_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def RINT_D_MMR6 : StdMMR6Rel, RINT_D_MMR6_ENC, RINT_D_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def ROUND_L_S_MMR6 : StdMMR6Rel, ROUND_L_S_MMR6_ENC, ROUND_L_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def ROUND_L_D_MMR6 : StdMMR6Rel, ROUND_L_D_MMR6_ENC, ROUND_L_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def ROUND_W_S_MMR6 : StdMMR6Rel, ROUND_W_S_MMR6_ENC, ROUND_W_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def ROUND_W_D_MMR6 : StdMMR6Rel, ROUND_W_D_MMR6_ENC, ROUND_W_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def SEL_S_MMR6 : StdMMR6Rel, SEL_S_MMR6_ENC, SEL_S_MMR6_DESC, ISA_MICROMIPS32R6;
+def SEL_D_MMR6 : StdMMR6Rel, SEL_D_MMR6_ENC, SEL_D_MMR6_DESC, ISA_MICROMIPS32R6;
+def SELEQZ_S_MMR6 : StdMMR6Rel, SELEQZ_S_MMR6_ENC, SELEQZ_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def SELEQZ_D_MMR6 : StdMMR6Rel, SELEQZ_D_MMR6_ENC, SELEQZ_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def SELENZ_S_MMR6 : StdMMR6Rel, SELENZ_S_MMR6_ENC, SELENZ_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def SELENZ_D_MMR6 : StdMMR6Rel, SELENZ_D_MMR6_ENC, SELENZ_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CLASS_S_MMR6 : StdMMR6Rel, CLASS_S_MMR6_ENC, CLASS_S_MMR6_DESC,
+ ISA_MICROMIPS32R6;
+def CLASS_D_MMR6 : StdMMR6Rel, CLASS_D_MMR6_ENC, CLASS_D_MMR6_DESC,
+ ISA_MICROMIPS32R6;
}
//===----------------------------------------------------------------------===//
@@ -357,4 +1211,23 @@ def XORI_MMR6 : StdMMR6Rel, XORI_MMR6_DESC, XORI_MMR6_ENC, ISA_MICROMIPS32R6;
//===----------------------------------------------------------------------===//
def : MipsInstAlias<"ei", (EI_MMR6 ZERO), 1>, ISA_MICROMIPS32R6;
+def : MipsInstAlias<"di", (DI_MMR6 ZERO), 1>, ISA_MICROMIPS32R6;
def : MipsInstAlias<"nop", (SLL_MMR6 ZERO, ZERO, 0), 1>, ISA_MICROMIPS32R6;
+def B_MMR6_Pseudo : MipsAsmPseudoInst<(outs), (ins brtarget_mm:$offset),
+ !strconcat("b", "\t$offset")> {
+ string DecoderNamespace = "MicroMipsR6";
+}
+def : MipsInstAlias<"sync", (SYNC_MMR6 0), 1>, ISA_MICROMIPS32R6;
+def : MipsInstAlias<"sdbbp", (SDBBP_MMR6 0), 1>, ISA_MICROMIPS32R6;
+def : MipsInstAlias<"rdhwr $rt, $rs",
+ (RDHWR_MMR6 GPR32Opnd:$rt, HWRegsOpnd:$rs, 0), 1>,
+ ISA_MICROMIPS32R6;
+
+//===----------------------------------------------------------------------===//
+//
+// MicroMips arbitrary patterns that map to one or more instructions
+//
+//===----------------------------------------------------------------------===//
+
+def : MipsPat<(store GPRMM16:$src, addrimm4lsl2:$addr),
+ (SW16_MMR6 GPRMM16:$src, addrimm4lsl2:$addr)>, ISA_MICROMIPS32R6;
diff --git a/lib/Target/Mips/MicroMips64r6InstrFormats.td b/lib/Target/Mips/MicroMips64r6InstrFormats.td
new file mode 100644
index 000000000000..da305a2d508a
--- /dev/null
+++ b/lib/Target/Mips/MicroMips64r6InstrFormats.td
@@ -0,0 +1,86 @@
+//=- MicroMips64r6InstrFormats.td - Instruction Formats -*- tablegen -*- -=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes microMIPS64r6 instruction formats.
+//
+//===----------------------------------------------------------------------===//
+
+class DAUI_FM_MMR6 {
+ bits<5> rt;
+ bits<5> rs;
+ bits<16> imm;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b111100;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-0} = imm;
+}
+
+class POOL32I_ADD_IMM_FM_MMR6<bits<5> funct> {
+ bits<5> rs;
+ bits<16> imm;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010000;
+ let Inst{25-21} = funct;
+ let Inst{20-16} = rs;
+ let Inst{15-0} = imm;
+}
+
+class POOL32S_EXTBITS_FM_MMR6<bits<6> funct> {
+ bits<5> rt;
+ bits<5> rs;
+ bits<5> size;
+ bits<5> pos;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010110;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = size;
+ let Inst{10-6} = pos;
+ let Inst{5-0} = funct;
+}
+
+class POOL32S_DALIGN_FM_MMR6 {
+ bits<5> rs;
+ bits<5> rt;
+ bits<5> rd;
+ bits<3> bp;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010110;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-8} = bp;
+ let Inst{7-6} = 0b00;
+ let Inst{5-0} = 0b011100;
+}
+
+class POOL32A_DIVMOD_FM_MMR6<string instr_asm, bits<9> funct>
+ : MMR6Arch<instr_asm> {
+ bits<5> rd;
+ bits<5> rs;
+ bits<5> rt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b010110;
+ let Inst{25-21} = rd;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = rt;
+ let Inst{10-9} = 0b00;
+ let Inst{8-0} = funct;
+}
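
As an aside on how these format classes are consumed: TableGen turns the field
assignments above into encoder/decoder tables, so each class fixes which bits of
the 32-bit microMIPS word carry each operand. A minimal stand-alone C++ sketch of
the DAUI_FM_MMR6 layout (encodeDAUI and its signature are illustrative
assumptions, not an LLVM API):

// Illustrative sketch of the DAUI_FM_MMR6 bit layout defined above; the
// helper name is hypothetical and does not exist in LLVM.
#include <cstdint>

uint32_t encodeDAUI(uint32_t rt, uint32_t rs, uint32_t imm) {
  uint32_t Inst = 0;
  Inst |= 0x3Cu << 26;         // Inst{31-26} = 0b111100 (DAUI major opcode)
  Inst |= (rt & 0x1Fu) << 21;  // Inst{25-21} = rt
  Inst |= (rs & 0x1Fu) << 16;  // Inst{20-16} = rs
  Inst |= imm & 0xFFFFu;       // Inst{15-0}  = imm
  return Inst;
}
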
diff --git a/lib/Target/Mips/MicroMips64r6InstrInfo.td b/lib/Target/Mips/MicroMips64r6InstrInfo.td
new file mode 100644
index 000000000000..ec1aef86a942
--- /dev/null
+++ b/lib/Target/Mips/MicroMips64r6InstrInfo.td
@@ -0,0 +1,119 @@
+//=- MicroMips64r6InstrInfo.td - Instruction Information -*- tablegen -*- -=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes microMIPS64r6 instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Encodings
+//
+//===----------------------------------------------------------------------===//
+
+class DAUI_MMR6_ENC : DAUI_FM_MMR6;
+class DAHI_MMR6_ENC : POOL32I_ADD_IMM_FM_MMR6<0b10001>;
+class DATI_MMR6_ENC : POOL32I_ADD_IMM_FM_MMR6<0b10000>;
+class DEXT_MMR6_ENC : POOL32S_EXTBITS_FM_MMR6<0b101100>;
+class DEXTM_MMR6_ENC : POOL32S_EXTBITS_FM_MMR6<0b100100>;
+class DEXTU_MMR6_ENC : POOL32S_EXTBITS_FM_MMR6<0b010100>;
+class DALIGN_MMR6_ENC : POOL32S_DALIGN_FM_MMR6;
+class DDIV_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"ddiv", 0b100011000>;
+class DMOD_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"dmod", 0b101011000>;
+class DDIVU_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"ddivu", 0b110011000>;
+class DMODU_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"dmodu", 0b111011000>;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Descriptions
+//
+//===----------------------------------------------------------------------===//
+
+class DAUI_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ dag OutOperandList = (outs GPROpnd:$rt);
+ dag InOperandList = (ins GPROpnd:$rs, simm16:$imm);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $imm");
+ list<dag> Pattern = [];
+}
+class DAUI_MMR6_DESC : DAUI_MMR6_DESC_BASE<"daui", GPR64Opnd>;
+
+class DAHI_DATI_DESC_BASE<string instr_asm, RegisterOperand GPROpnd>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ dag OutOperandList = (outs GPROpnd:$rs);
+ dag InOperandList = (ins GPROpnd:$rt, simm16:$imm);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $imm");
+ string Constraints = "$rs = $rt";
+}
+class DAHI_MMR6_DESC : DAHI_DATI_DESC_BASE<"dahi", GPR64Opnd>;
+class DATI_MMR6_DESC : DAHI_DATI_DESC_BASE<"dati", GPR64Opnd>;
+
+class EXTBITS_DESC_BASE<string instr_asm, RegisterOperand RO, Operand PosOpnd,
+ Operand SizeOpnd, SDPatternOperator Op = null_frag>
+ : MMR6Arch<instr_asm>, MipsR6Inst {
+ dag OutOperandList = (outs RO:$rt);
+ dag InOperandList = (ins RO:$rs, PosOpnd:$pos, SizeOpnd:$size);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $pos, $size");
+ list<dag> Pattern = [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size))];
+ InstrItinClass Itinerary = II_EXT;
+ Format Form = FrmR;
+ string BaseOpcode = instr_asm;
+}
+// TODO: Add 'pos + size' constraint check to dext* instructions
+// DEXT: 0 < pos + size <= 63
+// DEXTM, DEXTU: 32 < pos + size <= 64
+class DEXT_MMR6_DESC : EXTBITS_DESC_BASE<"dext", GPR64Opnd, uimm5,
+ uimm5_plus1, MipsExt>;
+class DEXTM_MMR6_DESC : EXTBITS_DESC_BASE<"dextm", GPR64Opnd, uimm5,
+ uimm5_plus33, MipsExt>;
+class DEXTU_MMR6_DESC : EXTBITS_DESC_BASE<"dextu", GPR64Opnd, uimm5_plus32,
+ uimm5_plus1, MipsExt>;
+
+class DALIGN_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ Operand ImmOpnd> : MMR6Arch<instr_asm>, MipsR6Inst {
+ dag OutOperandList = (outs GPROpnd:$rd);
+ dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, ImmOpnd:$bp);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $bp");
+ list<dag> Pattern = [];
+}
+
+class DALIGN_MMR6_DESC : DALIGN_DESC_BASE<"dalign", GPR64Opnd, uimm3>;
+
+class DDIV_MM64R6_DESC : ArithLogicR<"ddiv", GPR32Opnd>;
+class DMOD_MM64R6_DESC : ArithLogicR<"dmod", GPR32Opnd>;
+class DDIVU_MM64R6_DESC : ArithLogicR<"ddivu", GPR32Opnd>;
+class DMODU_MM64R6_DESC : ArithLogicR<"dmodu", GPR32Opnd>;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Definitions
+//
+//===----------------------------------------------------------------------===//
+
+let DecoderNamespace = "MicroMipsR6" in {
+ def DAUI_MM64R6 : StdMMR6Rel, DAUI_MMR6_DESC, DAUI_MMR6_ENC, ISA_MICROMIPS64R6;
+ def DAHI_MM64R6 : StdMMR6Rel, DAHI_MMR6_DESC, DAHI_MMR6_ENC, ISA_MICROMIPS64R6;
+ def DATI_MM64R6 : StdMMR6Rel, DATI_MMR6_DESC, DATI_MMR6_ENC, ISA_MICROMIPS64R6;
+ def DEXT_MM64R6 : StdMMR6Rel, DEXT_MMR6_DESC, DEXT_MMR6_ENC,
+ ISA_MICROMIPS64R6;
+ def DEXTM_MM64R6 : StdMMR6Rel, DEXTM_MMR6_DESC, DEXTM_MMR6_ENC,
+ ISA_MICROMIPS64R6;
+ def DEXTU_MM64R6 : StdMMR6Rel, DEXTU_MMR6_DESC, DEXTU_MMR6_ENC,
+ ISA_MICROMIPS64R6;
+ def DALIGN_MM64R6 : StdMMR6Rel, DALIGN_MMR6_DESC, DALIGN_MMR6_ENC,
+ ISA_MICROMIPS64R6;
+ def DDIV_MM64R6 : R6MMR6Rel, DDIV_MM64R6_DESC, DDIV_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DMOD_MM64R6 : R6MMR6Rel, DMOD_MM64R6_DESC, DMOD_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DDIVU_MM64R6 : R6MMR6Rel, DDIVU_MM64R6_DESC, DDIVU_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+ def DMODU_MM64R6 : R6MMR6Rel, DMODU_MM64R6_DESC, DMODU_MM64R6_ENC,
+ ISA_MICROMIPS64R6;
+}
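
Regarding the 'pos + size' TODO in the DEXT/DEXTM/DEXTU descriptions above, the
documented ranges (DEXT: 0 < pos + size <= 63; DEXTM, DEXTU: 32 < pos + size <= 64)
amount to a simple arithmetic check. A hedged C++ sketch of that check (the helper
below is hypothetical; the patch itself does not add it):

// Hypothetical range check for the dext/dextm/dextu TODO above; not an
// existing LLVM function. Returns true when pos + size is in range.
#include <cstdint>
#include <string>

bool isValidDextPosSize(const std::string &Mnemonic, uint64_t Pos,
                        uint64_t Size) {
  uint64_t Sum = Pos + Size;
  if (Mnemonic == "dext")
    return Sum > 0 && Sum <= 63;   // DEXT: 0 < pos + size <= 63
  // DEXTM, DEXTU: 32 < pos + size <= 64
  return Sum > 32 && Sum <= 64;
}
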
diff --git a/lib/Target/Mips/MicroMipsDSPInstrFormats.td b/lib/Target/Mips/MicroMipsDSPInstrFormats.td
new file mode 100644
index 000000000000..f11c09abfc36
--- /dev/null
+++ b/lib/Target/Mips/MicroMipsDSPInstrFormats.td
@@ -0,0 +1,244 @@
+//===-- MicroMipsDSPInstrFormats.td - Instruction Formats --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class MMDSPInst<string opstr = "">
+ : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, PredicateControl {
+ let InsnPredicates = [HasDSP];
+ let AdditionalPredicates = [InMicroMips];
+ string BaseOpcode = opstr;
+ string Arch = "mmdsp";
+ let DecoderNamespace = "MicroMips";
+}
+
+class MMDSPInstAlias<string Asm, dag Result, bit Emit = 0b1>
+ : InstAlias<Asm, Result, Emit>, PredicateControl {
+ let InsnPredicates = [HasDSP];
+ let AdditionalPredicates = [InMicroMips];
+}
+
+class POOL32A_3R_FMT<string opstr, bits<11> op> : MMDSPInst<opstr> {
+ bits<5> rd;
+ bits<5> rs;
+ bits<5> rt;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = rd;
+ let Inst{10-0} = op;
+}
+
+class POOL32A_2R_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> {
+ bits<5> rt;
+ bits<5> rs;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-6} = op;
+ let Inst{5-0} = 0b111100;
+}
+
+class POOL32A_2RAC_FMT<string opstr, bits<8> op> : MMDSPInst<opstr> {
+ bits<5> rt;
+ bits<5> rs;
+ bits<2> ac;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-14} = ac;
+ let Inst{13-6} = op;
+ let Inst{5-0} = 0b111100;
+}
+
+class POOL32A_3RB0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> {
+ bits<5> rd;
+ bits<5> rs;
+ bits<5> rt;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = rd;
+ let Inst{10} = 0b0;
+ let Inst{9-0} = op;
+}
+
+class POOL32A_2RSA4_FMT<string opstr, bits<12> op> : MMDSPInst<opstr> {
+ bits<5> rt;
+ bits<5> rs;
+ bits<4> sa;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-12} = sa;
+ let Inst{11-0} = op;
+}
+
+class POOL32A_2RSA3_FMT<string opstr, bits<7> op> : MMDSPInst<opstr> {
+ bits<5> rt;
+ bits<5> rs;
+ bits<3> sa;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-13} = sa;
+ let Inst{12-6} = op;
+ let Inst{5-0} = 0b111100;
+}
+
+class POOL32A_2RSA5B0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> {
+ bits<5> rt;
+ bits<5> rs;
+ bits<5> sa;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = sa;
+ let Inst{10} = 0b0;
+ let Inst{9-0} = op;
+}
+
+class POOL32A_2RSA4B0_FMT<string opstr, bits<11> op> : MMDSPInst<opstr> {
+ bits<5> rt;
+ bits<5> rs;
+ bits<4> sa;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-12} = sa;
+ let Inst{11} = 0b0;
+ let Inst{10-0} = op;
+}
+
+class POOL32A_2RSA4OP6_FMT<string opstr, bits<6> op> : MMDSPInst<opstr> {
+ bits<5> rt;
+ bits<5> rs;
+ bits<4> sa;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-12} = sa;
+ let Inst{11-6} = op;
+ let Inst{5-0} = 0b111100;
+}
+
+class POOL32A_1RIMM5AC_FMT<string opstr, bits<8> funct> : MMDSPInst<opstr> {
+ bits<5> rt;
+ bits<5> imm;
+ bits<2> ac;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = imm;
+ let Inst{15-14} = ac;
+ let Inst{13-6} = funct;
+ let Inst{5-0} = 0b111100;
+}
+
+class POOL32A_2RSA5_FMT<string opstr, bits<11> op> : MMDSPInst<opstr> {
+ bits<5> rt;
+ bits<5> rs;
+ bits<5> sa;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = sa;
+ let Inst{10-0} = op;
+}
+
+class POOL32A_1RMEMB0_FMT<string opstr, bits<10> funct> : MMDSPInst<opstr> {
+ bits<5> index;
+ bits<5> base;
+ bits<5> rd;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = index;
+ let Inst{20-16} = base;
+ let Inst{15-11} = rd;
+ let Inst{10} = 0b0;
+ let Inst{9-0} = funct;
+}
+
+class POOL32A_1RAC_FMT<string instr_asm, bits<8> funct> : MMDSPInst<instr_asm> {
+ bits<5> rs;
+ bits<2> ac;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = 0;
+ let Inst{20-16} = rs;
+ let Inst{15-14} = ac;
+ let Inst{13-6} = funct;
+ let Inst{5-0} = 0b111100;
+}
+
+class POOL32A_1RMASK7_FMT<string opstr, bits<8> op> : MMDSPInst<opstr> {
+ bits<5> rt;
+ bits<7> mask;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = rt;
+ let Inst{20-14} = mask;
+ let Inst{13-6} = op;
+ let Inst{5-0} = 0b111100;
+}
+
+class POOL32A_1RIMM10_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> {
+ bits<5> rd;
+ bits<10> imm;
+
+ let Inst{31-26} = 0;
+ let Inst{25-16} = imm;
+ let Inst{15-11} = rd;
+ let Inst{10} = 0;
+ let Inst{9-0} = op;
+}
+
+class POOL32A_1RIMM8_FMT<string opstr, bits<6> op> : MMDSPInst<opstr> {
+ bits<5> rt;
+ bits<8> imm;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rt;
+ let Inst{20-13} = imm;
+ let Inst{12} = 0;
+ let Inst{11-6} = op;
+ let Inst{5-0} = 0b111100;
+}
+
+class POOL32A_4B0SHIFT6AC4B0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> {
+ bits<6> shift;
+ bits<2> ac;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-22} = 0b0000;
+ let Inst{21-16} = shift;
+ let Inst{15-14} = ac;
+ let Inst{13-10} = 0b0000;
+ let Inst{9-0} = op;
+}
+
+class POOL32A_5B01RAC_FMT<string opstr, bits<8> op> : MMDSPInst<opstr> {
+ bits<5> rs;
+ bits<2> ac;
+
+ let Inst{31-26} = 0b000000;
+ let Inst{25-21} = 0b00000;
+ let Inst{20-16} = rs;
+ let Inst{15-14} = ac;
+ let Inst{13-6} = op;
+ let Inst{5-0} = 0b111100;
+}
diff --git a/lib/Target/Mips/MicroMipsDSPInstrInfo.td b/lib/Target/Mips/MicroMipsDSPInstrInfo.td
new file mode 100644
index 000000000000..b342e2371df4
--- /dev/null
+++ b/lib/Target/Mips/MicroMipsDSPInstrInfo.td
@@ -0,0 +1,528 @@
+//===- MicroMipsDSPInstrInfo.td - MicroMips DSP instructions -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes MicroMips DSP instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// Instruction encoding.
+class ADDQ_PH_MM_ENC : POOL32A_3R_FMT<"addq.ph", 0b00000001101>;
+class ADDQ_S_PH_MM_ENC : POOL32A_3R_FMT<"addq_s.ph", 0b10000001101>;
+class ADDQ_S_W_MM_ENC : POOL32A_3RB0_FMT<"addq_s.w", 0b1100000101>;
+class ADDQH_PH_MMR2_ENC : POOL32A_3R_FMT<"addqh.ph", 0b00001001101>;
+class ADDQH_R_PH_MMR2_ENC : POOL32A_3R_FMT<"addqh_r.ph", 0b10001001101>;
+class ADDQH_W_MMR2_ENC : POOL32A_3R_FMT<"addqh.w", 0b00010001101>;
+class ADDQH_R_W_MMR2_ENC : POOL32A_3R_FMT<"addqh_r.w", 0b10010001101>;
+class ADDU_PH_MMR2_ENC : POOL32A_3R_FMT<"addu.ph", 0b00100001101>;
+class ADDU_S_PH_MMR2_ENC : POOL32A_3R_FMT<"addu_s.ph", 0b10100001101>;
+class ADDU_QB_MM_ENC : POOL32A_3R_FMT<"addu.qb", 0b00011001101>;
+class ADDU_S_QB_MM_ENC : POOL32A_3R_FMT<"addu_s.qb", 0b10011001101>;
+class ADDUH_QB_MMR2_ENC : POOL32A_3R_FMT<"adduh.qb", 0b00101001101>;
+class ADDUH_R_QB_MMR2_ENC : POOL32A_3R_FMT<"adduh_r.qb", 0b10101001101>;
+class ADDSC_MM_ENC : POOL32A_3RB0_FMT<"addsc", 0b1110000101>;
+class ADDWC_MM_ENC : POOL32A_3RB0_FMT<"addwc", 0b1111000101>;
+class DPA_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpa.w.ph", 0b00000010>;
+class DPAQ_S_W_PH_MM_ENC : POOL32A_2RAC_FMT<"dpaq_s.w.ph", 0b00001010>;
+class DPAQ_SA_L_W_MM_ENC : POOL32A_2RAC_FMT<"dpaq_sa.l.w", 0b01001010>;
+class DPAQX_S_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpaqx_s.w.ph", 0b10001010>;
+class DPAQX_SA_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpaqx_sa.w.ph", 0b11001010>;
+class DPAU_H_QBL_MM_ENC : POOL32A_2RAC_FMT<"dpau.h.qbl", 0b10000010>;
+class DPAU_H_QBR_MM_ENC : POOL32A_2RAC_FMT<"dpau.h.qbr", 0b11000010>;
+class DPAX_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpax.w.ph", 0b01000010>;
+class ABSQ_S_PH_MM_ENC : POOL32A_2R_FMT<"absq_s.ph", 0b0001000100>;
+class ABSQ_S_W_MM_ENC : POOL32A_2R_FMT<"absq_s.w", 0b0010000100>;
+class ABSQ_S_QB_MMR2_ENC : POOL32A_2R_FMT<"absq_s.qb", 0b0000000100>;
+class INSV_MM_ENC : POOL32A_2R_FMT<"insv", 0b0100000100>;
+class MADD_DSP_MM_ENC : POOL32A_2RAC_FMT<"madd", 0b00101010>;
+class MADDU_DSP_MM_ENC : POOL32A_2RAC_FMT<"maddu", 0b01101010>;
+class MSUB_DSP_MM_ENC : POOL32A_2RAC_FMT<"msub", 0b10101010>;
+class MSUBU_DSP_MM_ENC : POOL32A_2RAC_FMT<"msubu", 0b11101010>;
+class MULT_DSP_MM_ENC : POOL32A_2RAC_FMT<"mult", 0b00110010>;
+class MULTU_DSP_MM_ENC : POOL32A_2RAC_FMT<"multu", 0b01110010>;
+class SHLL_PH_MM_ENC : POOL32A_2RSA4_FMT<"shll.ph", 0b001110110101>;
+class SHLL_S_PH_MM_ENC : POOL32A_2RSA4_FMT<"shll_s.ph", 0b101110110101>;
+class SHLL_QB_MM_ENC : POOL32A_2RSA3_FMT<"shll.qb", 0b0100001>;
+class SHLLV_PH_MM_ENC : POOL32A_3R_FMT<"shllv.ph", 0b00000001110>;
+class SHLLV_S_PH_MM_ENC : POOL32A_3R_FMT<"shllv_s.ph", 0b10000001110>;
+class SHLLV_QB_MM_ENC : POOL32A_3RB0_FMT<"shllv.qb", 0b1110010101>;
+class SHLLV_S_W_MM_ENC : POOL32A_3RB0_FMT<"shllv_s.w", 0b1111010101>;
+class SHLL_S_W_MM_ENC : POOL32A_2RSA5B0_FMT<"shll_s.w", 0b1111110101>;
+class SHRA_QB_MMR2_ENC : POOL32A_2RSA3_FMT<"shra.qb", 0b0000111>;
+class SHRA_R_QB_MMR2_ENC : POOL32A_2RSA3_FMT<"shra_r.qb", 0b1000111>;
+class SHRA_PH_MM_ENC : POOL32A_2RSA4B0_FMT<"shra.ph", 0b01100110101>;
+class SHRA_R_PH_MM_ENC : POOL32A_2RSA4B0_FMT<"shra_r.ph", 0b11100110101>;
+class SHRAV_PH_MM_ENC : POOL32A_3R_FMT<"shrav.ph", 0b00110001101>;
+class SHRAV_R_PH_MM_ENC : POOL32A_3R_FMT<"shrav_r.ph", 0b10110001101>;
+class SHRAV_QB_MMR2_ENC : POOL32A_3R_FMT<"shrav.qb", 0b00111001101>;
+class SHRAV_R_QB_MMR2_ENC : POOL32A_3R_FMT<"shrav_r.qb", 0b10111001101>;
+class SHRAV_R_W_MM_ENC : POOL32A_3RB0_FMT<"shrav_r.w", 0b1011010101>;
+class SHRA_R_W_MM_ENC : POOL32A_2RSA5B0_FMT<"shra_r.w", 0b1011110101>;
+class SHRL_PH_MMR2_ENC : POOL32A_2RSA4OP6_FMT<"shrl.ph", 0b001111>;
+class SHRL_QB_MM_ENC : POOL32A_2RSA3_FMT<"shrl.qb", 0b1100001>;
+class SHRLV_PH_MMR2_ENC : POOL32A_3RB0_FMT<"shrlv.ph", 0b1100010101>;
+class SHRLV_QB_MM_ENC : POOL32A_3RB0_FMT<"shrlv.qb", 0b1101010101>;
+class PRECEQ_W_PHL_MM_ENC : POOL32A_2R_FMT<"preceq.w.phl", 0b0101000100>;
+class PRECEQ_W_PHR_MM_ENC : POOL32A_2R_FMT<"preceq.w.phr", 0b0110000100>;
+class PRECEQU_PH_QBL_MM_ENC : POOL32A_2R_FMT<"precequ.ph.qbl", 0b0111000100>;
+class PRECEQU_PH_QBLA_MM_ENC : POOL32A_2R_FMT<"precequ.ph.qbla", 0b0111001100>;
+class PRECEQU_PH_QBR_MM_ENC : POOL32A_2R_FMT<"precequ.ph.qbr", 0b1001000100>;
+class PRECEQU_PH_QBRA_MM_ENC : POOL32A_2R_FMT<"precequ.ph.qbra", 0b1001001100>;
+class PRECEU_PH_QBL_MM_ENC : POOL32A_2R_FMT<"preceu.ph.qbl", 0b1011000100>;
+class PRECEU_PH_QBLA_MM_ENC : POOL32A_2R_FMT<"preceu.ph.qbla", 0b1011001100>;
+class PRECEU_PH_QBR_MM_ENC : POOL32A_2R_FMT<"preceu.ph.qbr", 0b1101000100>;
+class PRECEU_PH_QBRA_MM_ENC : POOL32A_2R_FMT<"preceu.ph.qbra", 0b1101001100>;
+class SUBQ_PH_MM_ENC : POOL32A_3R_FMT<"subq.ph", 0b01000001101>;
+class SUBQ_S_PH_MM_ENC : POOL32A_3R_FMT<"subq_s.ph", 0b11000001101>;
+class SUBQ_S_W_MM_ENC : POOL32A_3RB0_FMT<"subq_s.w", 0b1101000101>;
+class SUBQH_PH_MMR2_ENC : POOL32A_3R_FMT<"subqh.ph", 0b01001001101>;
+class SUBQH_R_PH_MMR2_ENC : POOL32A_3R_FMT<"subqh_r.ph", 0b11001001101>;
+class SUBQH_W_MMR2_ENC : POOL32A_3R_FMT<"subqh.w", 0b01010001101>;
+class SUBQH_R_W_MMR2_ENC : POOL32A_3R_FMT<"subqh_r.w", 0b11010001101>;
+class SUBU_PH_MMR2_ENC : POOL32A_3R_FMT<"subu.ph", 0b01100001101>;
+class SUBU_S_PH_MMR2_ENC : POOL32A_3R_FMT<"subu_s.ph", 0b11100001101>;
+class SUBU_QB_MM_ENC : POOL32A_3R_FMT<"subu.qb", 0b01011001101>;
+class SUBU_S_QB_MM_ENC : POOL32A_3R_FMT<"subu_s.qb", 0b11011001101>;
+class SUBUH_QB_MMR2_ENC : POOL32A_3R_FMT<"subuh.qb", 0b01101001101>;
+class SUBUH_R_QB_MMR2_ENC : POOL32A_3R_FMT<"subuh_r.qb", 0b11101001101>;
+class EXTP_MM_ENC : POOL32A_1RIMM5AC_FMT<"extp", 0b10011001>;
+class EXTPDP_MM_ENC : POOL32A_1RIMM5AC_FMT<"extpdp", 0b11011001>;
+class EXTPDPV_MM_ENC : POOL32A_2RAC_FMT<"extpdpv", 0b11100010>;
+class EXTPV_MM_ENC : POOL32A_2RAC_FMT<"extpv", 0b10100010>;
+class EXTR_W_MM_ENC : POOL32A_1RIMM5AC_FMT<"extr.w", 0b00111001>;
+class EXTR_R_W_MM_ENC : POOL32A_1RIMM5AC_FMT<"extr_r.w", 0b01111001>;
+class EXTR_RS_W_MM_ENC : POOL32A_1RIMM5AC_FMT<"extr_rs.w", 0b10111001>;
+class EXTR_S_H_MM_ENC : POOL32A_1RIMM5AC_FMT<"extr_s.h", 0b11111001>;
+class EXTRV_W_MM_ENC : POOL32A_2RAC_FMT<"extrv.w", 0b00111010>;
+class EXTRV_R_W_MM_ENC : POOL32A_2RAC_FMT<"extrv_r.w", 0b01111010>;
+class EXTRV_RS_W_MM_ENC : POOL32A_2RAC_FMT<"extrv_rs.w", 0b10111010>;
+class EXTRV_S_H_MM_ENC : POOL32A_2RAC_FMT<"extrv_s.h", 0b11111010>;
+class DPS_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dps.w.ph", 0b00010010>;
+class DPSQ_S_W_PH_MM_ENC : POOL32A_2RAC_FMT<"dpsq_s.w.ph", 0b00011010>;
+class DPSQ_SA_L_W_MM_ENC : POOL32A_2RAC_FMT<"dpsq_sa.l.w", 0b01011010>;
+class DPSQX_S_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpsqx_s.w.ph", 0b10011010>;
+class DPSQX_SA_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpsqx_sa.w.ph", 0b11011010>;
+class DPSU_H_QBL_MM_ENC : POOL32A_2RAC_FMT<"dpsu.h.qbl", 0b10010010>;
+class DPSU_H_QBR_MM_ENC : POOL32A_2RAC_FMT<"dpsu.h.qbr", 0b11010010>;
+class DPSX_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpsx.w.ph", 0b01010010>;
+class MUL_PH_MMR2_ENC : POOL32A_3R_FMT<"mul.ph", 0b00000101101>;
+class MUL_S_PH_MMR2_ENC : POOL32A_3R_FMT<"mul_s.ph", 0b10000101101>;
+class MULEQ_S_W_PHL_MM_ENC : POOL32A_3RB0_FMT<"muleq_s.w.phl", 0b0000100101>;
+class MULEQ_S_W_PHR_MM_ENC : POOL32A_3RB0_FMT<"muleq_s.w.phr", 0b0001100101>;
+class MULEU_S_PH_QBL_MM_ENC : POOL32A_3RB0_FMT<"muleu_s.ph.qbl", 0b0010010101>;
+class MULEU_S_PH_QBR_MM_ENC : POOL32A_3RB0_FMT<"muleu_s.ph.qbr", 0b0011010101>;
+class MULQ_RS_PH_MM_ENC : POOL32A_3RB0_FMT<"mulq_rs.ph", 0b0100010101>;
+class MULQ_RS_W_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_rs.w", 0b0110010101>;
+class MULQ_S_PH_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_s.ph", 0b0101010101>;
+class MULQ_S_W_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_s.w", 0b0111010101>;
+class PRECR_QB_PH_MMR2_ENC : POOL32A_3RB0_FMT<"precr.qb.ph", 0b0001101101>;
+class PRECR_SRA_PH_W_MMR2_ENC
+ : POOL32A_2RSA5_FMT<"precr_sra.ph.w", 0b01111001101>;
+class PRECR_SRA_R_PH_W_MMR2_ENC
+ : POOL32A_2RSA5_FMT<"precr_sra_r.ph.w", 0b11111001101>;
+class PRECRQ_PH_W_MM_ENC : POOL32A_3RB0_FMT<"precrq.ph.w", 0b0011101101>;
+class PRECRQ_QB_PH_MM_ENC : POOL32A_3RB0_FMT<"precrq.qb.ph", 0b0010101101>;
+class PRECRQU_S_QB_PH_MM_ENC
+ : POOL32A_3RB0_FMT<"precrqu_s.qb.ph", 0b0101101101>;
+class PRECRQ_RS_PH_W_MM_ENC : POOL32A_3RB0_FMT<"precrq_rs.ph.w", 0b0100101101>;
+class LBUX_MM_ENC : POOL32A_1RMEMB0_FMT<"lbux", 0b1000100101>;
+class LHX_MM_ENC : POOL32A_1RMEMB0_FMT<"lhx", 0b0101100101>;
+class LWX_MM_ENC : POOL32A_1RMEMB0_FMT<"lwx", 0b0110100101>;
+class MAQ_S_W_PHL_MM_ENC : POOL32A_2RAC_FMT<"maq_s.w.phl", 0b01101001>;
+class MAQ_SA_W_PHL_MM_ENC : POOL32A_2RAC_FMT<"maq_sa.w.phl", 0b11101001>;
+class MAQ_S_W_PHR_MM_ENC : POOL32A_2RAC_FMT<"maq_s.w.phr", 0b00101001>;
+class MAQ_SA_W_PHR_MM_ENC : POOL32A_2RAC_FMT<"maq_sa.w.phr", 0b10101001>;
+class MFHI_MM_ENC : POOL32A_1RAC_FMT<"mfhi", 0b00000001>;
+class MFLO_MM_ENC : POOL32A_1RAC_FMT<"mflo", 0b01000001>;
+class MTHI_MM_ENC : POOL32A_1RAC_FMT<"mthi", 0b10000001>;
+class MTLO_MM_ENC : POOL32A_1RAC_FMT<"mtlo", 0b11000001>;
+class PREPEND_MMR2_ENC : POOL32A_2RSA5B0_FMT<"prepend", 0b1001010101>;
+class RADDU_W_QB_MM_ENC : POOL32A_2R_FMT<"raddu.w.qb", 0b1111000100>;
+class RDDSP_MM_ENC : POOL32A_1RMASK7_FMT<"rddsp", 0b00011001>;
+class REPL_PH_MM_ENC : POOL32A_1RIMM10_FMT<"repl.ph", 0b0000111101>;
+class REPL_QB_MM_ENC : POOL32A_1RIMM8_FMT<"repl.qb", 0b010111>;
+class REPLV_PH_MM_ENC : POOL32A_2R_FMT<"replv.ph", 0b0000001100>;
+class REPLV_QB_MM_ENC : POOL32A_2R_FMT<"replv.qb", 0b0001001100>;
+class MTHLIP_MM_ENC : POOL32A_1RAC_FMT<"mthlip", 0b00001001>;
+class PACKRL_PH_MM_ENC : POOL32A_3RB0_FMT<"packrl.ph", 0b0110101101>;
+class PICK_PH_MM_ENC : POOL32A_3RB0_FMT<"pick.ph", 0b1000101101>;
+class PICK_QB_MM_ENC : POOL32A_3RB0_FMT<"pick.qb", 0b0111101101>;
+class SHILO_MM_ENC : POOL32A_4B0SHIFT6AC4B0_FMT<"shilo", 0b0000011101>;
+class SHILOV_MM_ENC : POOL32A_5B01RAC_FMT<"shilov", 0b01001001>;
+class WRDSP_MM_ENC : POOL32A_1RMASK7_FMT<"wrdsp", 0b01011001>;
+
+// Instruction desc.
+class ABSQ_S_PH_MM_R2_DESC_BASE<string opstr, SDPatternOperator OpNode,
+ InstrItinClass itin, RegisterOperand ROD,
+ RegisterOperand ROS = ROD> {
+ dag OutOperandList = (outs ROD:$rt);
+ dag InOperandList = (ins ROS:$rs);
+ string AsmString = !strconcat(opstr, "\t$rt, $rs");
+ list<dag> Pattern = [(set ROD:$rt, (OpNode ROS:$rs))];
+ InstrItinClass Itinerary = itin;
+}
+class ABSQ_S_PH_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "absq_s.ph", int_mips_absq_s_ph, NoItinerary, DSPROpnd>, Defs<[DSPOutFlag20]>;
+class ABSQ_S_W_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "absq_s.w", int_mips_absq_s_w, NoItinerary, GPR32Opnd>, Defs<[DSPOutFlag20]>;
+class ABSQ_S_QB_MMR2_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "absq_s.qb", int_mips_absq_s_qb, NoItinerary, DSPROpnd>, Defs<[DSPOutFlag20]>;
+class PRECEQ_W_PHL_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "preceq.w.phl", int_mips_preceq_w_phl, NoItinerary, GPR32Opnd, DSPROpnd>;
+class PRECEQ_W_PHR_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "preceq.w.phr", int_mips_preceq_w_phr, NoItinerary, GPR32Opnd, DSPROpnd>;
+class PRECEQU_PH_QBL_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "precequ.ph.qbl", int_mips_precequ_ph_qbl, NoItinerary, DSPROpnd>;
+class PRECEQU_PH_QBLA_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "precequ.ph.qbla", int_mips_precequ_ph_qbla, NoItinerary, DSPROpnd>;
+class PRECEQU_PH_QBR_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "precequ.ph.qbr", int_mips_precequ_ph_qbr, NoItinerary, DSPROpnd>;
+class PRECEQU_PH_QBRA_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "precequ.ph.qbra", int_mips_precequ_ph_qbra, NoItinerary, DSPROpnd>;
+class PRECEU_PH_QBL_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "preceu.ph.qbl", int_mips_preceu_ph_qbl, NoItinerary, DSPROpnd>;
+class PRECEU_PH_QBLA_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "preceu.ph.qbla", int_mips_preceu_ph_qbla, NoItinerary, DSPROpnd>;
+class PRECEU_PH_QBR_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "preceu.ph.qbr", int_mips_preceu_ph_qbr, NoItinerary, DSPROpnd>;
+class PRECEU_PH_QBRA_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+ "preceu.ph.qbra", int_mips_preceu_ph_qbra, NoItinerary, DSPROpnd>;
+
+class SHLL_R2_MM_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ SDPatternOperator ImmPat, InstrItinClass itin,
+ RegisterOperand RO, Operand ImmOpnd> {
+ dag OutOperandList = (outs RO:$rt);
+ dag InOperandList = (ins RO:$rs, ImmOpnd:$sa);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa");
+ list<dag> Pattern = [(set RO:$rt, (OpNode RO:$rs, ImmPat:$sa))];
+ InstrItinClass Itinerary = itin;
+ bit hasSideEffects = 1;
+}
+class SHLL_PH_MM_DESC : SHLL_R2_MM_DESC_BASE<
+ "shll.ph", null_frag, immZExt4, NoItinerary, DSPROpnd, uimm4>,
+ Defs<[DSPOutFlag22]>;
+class SHLL_S_PH_MM_DESC : SHLL_R2_MM_DESC_BASE<
+ "shll_s.ph", int_mips_shll_s_ph, immZExt4, NoItinerary, DSPROpnd, uimm4>,
+ Defs<[DSPOutFlag22]>;
+class SHLL_QB_MM_DESC : SHLL_R2_MM_DESC_BASE<
+ "shll.qb", null_frag, immZExt3, NoItinerary, DSPROpnd, uimm3>,
+ Defs<[DSPOutFlag22]>;
+class SHLL_S_W_MM_DESC : SHLL_R2_MM_DESC_BASE<
+ "shll_s.w", int_mips_shll_s_w, immZExt5, NoItinerary, GPR32Opnd, uimm5>,
+ Defs<[DSPOutFlag22]>;
+class SHRA_QB_MMR2_DESC : SHLL_R2_MM_DESC_BASE<
+ "shra.qb", null_frag, immZExt3, NoItinerary, DSPROpnd, uimm3>;
+class SHRA_R_QB_MMR2_DESC : SHLL_R2_MM_DESC_BASE<
+ "shra_r.qb", int_mips_shra_r_qb, immZExt3, NoItinerary, DSPROpnd, uimm3>;
+class SHRA_PH_MM_DESC : SHLL_R2_MM_DESC_BASE<
+ "shra.ph", null_frag, immZExt4, NoItinerary, DSPROpnd, uimm4>;
+class SHRA_R_PH_MM_DESC : SHLL_R2_MM_DESC_BASE<
+ "shra_r.ph", int_mips_shra_r_ph, immZExt4, NoItinerary, DSPROpnd, uimm4>;
+class SHRA_R_W_MM_DESC : SHLL_R2_MM_DESC_BASE<
+ "shra_r.w", int_mips_shra_r_w, immZExt5, NoItinerary, GPR32Opnd, uimm5>;
+class SHRL_QB_MM_DESC : SHLL_R2_MM_DESC_BASE<
+ "shrl.qb", null_frag, immZExt3, NoItinerary, DSPROpnd, uimm3>;
+class SHRL_PH_MMR2_DESC : SHLL_R2_MM_DESC_BASE<
+ "shrl.ph", null_frag, immZExt4, NoItinerary, DSPROpnd, uimm4>;
+
+class SHLLV_R3_MM_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ InstrItinClass itin, RegisterOperand RO> {
+ dag OutOperandList = (outs RO:$rd);
+ dag InOperandList = (ins RO:$rt, GPR32Opnd:$rs);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs");
+ list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs))];
+ InstrItinClass Itinerary = itin;
+}
+class SHLLV_PH_MM_DESC : SHLLV_R3_MM_DESC_BASE<
+ "shllv.ph", int_mips_shll_ph, NoItinerary, DSPROpnd>, Defs<[DSPOutFlag22]>;
+class SHLLV_S_PH_MM_DESC : SHLLV_R3_MM_DESC_BASE<
+ "shllv_s.ph", int_mips_shll_s_ph, NoItinerary, DSPROpnd>,
+ Defs<[DSPOutFlag22]>;
+class SHLLV_QB_MM_DESC : SHLLV_R3_MM_DESC_BASE<
+ "shllv.qb", int_mips_shll_qb, NoItinerary, DSPROpnd>, Defs<[DSPOutFlag22]>;
+class SHLLV_S_W_MM_DESC : SHLLV_R3_MM_DESC_BASE<
+ "shllv_s.w", int_mips_shll_s_w, NoItinerary, GPR32Opnd>, Defs<[DSPOutFlag22]>;
+class SHRAV_PH_MM_DESC : SHLLV_R3_MM_DESC_BASE<
+ "shrav.ph", int_mips_shra_ph, NoItinerary, DSPROpnd>;
+class SHRAV_R_PH_MM_DESC : SHLLV_R3_MM_DESC_BASE<
+ "shrav_r.ph", int_mips_shra_r_ph, NoItinerary, DSPROpnd>;
+class SHRAV_QB_MMR2_DESC : SHLLV_R3_MM_DESC_BASE<
+ "shrav.qb", int_mips_shra_qb, NoItinerary, DSPROpnd>;
+class SHRAV_R_QB_MMR2_DESC : SHLLV_R3_MM_DESC_BASE<
+ "shrav_r.qb", int_mips_shra_r_qb, NoItinerary, DSPROpnd>;
+class SHRAV_R_W_MM_DESC : SHLLV_R3_MM_DESC_BASE<
+ "shrav_r.w", int_mips_shra_r_w, NoItinerary, GPR32Opnd>;
+class SHRLV_PH_MMR2_DESC : SHLLV_R3_MM_DESC_BASE<
+ "shrlv.ph", int_mips_shrl_ph, NoItinerary, DSPROpnd>;
+class SHRLV_QB_MM_DESC : SHLLV_R3_MM_DESC_BASE<
+ "shrlv.qb", int_mips_shrl_qb, NoItinerary, DSPROpnd>;
+
+class EXT_MM_2R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ InstrItinClass itin> {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins ACC64DSPOpnd:$ac, GPR32Opnd:$rs);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $rs");
+ InstrItinClass Itinerary = itin;
+}
+class EXT_MM_1R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ InstrItinClass itin> {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins ACC64DSPOpnd:$ac, uimm5:$imm);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $imm");
+ InstrItinClass Itinerary = itin;
+}
+
+class EXTP_MM_DESC
+ : EXT_MM_1R_DESC_BASE<"extp", MipsEXTP, NoItinerary>,
+ Uses<[DSPPos]>, Defs<[DSPEFI]>;
+class EXTPDP_MM_DESC
+ : EXT_MM_1R_DESC_BASE<"extpdp", MipsEXTPDP, NoItinerary>,
+ Uses<[DSPPos]>, Defs<[DSPPos, DSPEFI]>;
+class EXTPDPV_MM_DESC
+ : EXT_MM_2R_DESC_BASE<"extpdpv", MipsEXTPDP, NoItinerary>,
+ Uses<[DSPPos]>, Defs<[DSPPos, DSPEFI]>;
+class EXTPV_MM_DESC
+ : EXT_MM_2R_DESC_BASE<"extpv", MipsEXTP, NoItinerary>,
+ Uses<[DSPPos]>, Defs<[DSPEFI]>;
+class EXTR_W_MM_DESC
+ : EXT_MM_1R_DESC_BASE<"extr.w", MipsEXTR_W, NoItinerary>,
+ Defs<[DSPOutFlag23]>;
+class EXTR_R_W_MM_DESC
+ : EXT_MM_1R_DESC_BASE<"extr_r.w", MipsEXTR_R_W, NoItinerary>,
+ Defs<[DSPOutFlag23]>;
+class EXTR_RS_W_MM_DESC
+ : EXT_MM_1R_DESC_BASE<"extr_rs.w", MipsEXTR_RS_W, NoItinerary>,
+ Defs<[DSPOutFlag23]>;
+class EXTR_S_H_MM_DESC
+ : EXT_MM_1R_DESC_BASE<"extr_s.h", MipsEXTR_S_H, NoItinerary>,
+ Defs<[DSPOutFlag23]>;
+class EXTRV_W_MM_DESC
+ : EXT_MM_2R_DESC_BASE<"extrv.w", MipsEXTR_W, NoItinerary>,
+ Defs<[DSPOutFlag23]>;
+class EXTRV_R_W_MM_DESC
+ : EXT_MM_2R_DESC_BASE<"extrv_r.w", MipsEXTR_R_W, NoItinerary>,
+ Defs<[DSPOutFlag23]>;
+class EXTRV_RS_W_MM_DESC
+ : EXT_MM_2R_DESC_BASE<"extrv_rs.w", MipsEXTR_RS_W, NoItinerary>,
+ Defs<[DSPOutFlag23]>;
+class EXTRV_S_H_MM_DESC
+ : EXT_MM_2R_DESC_BASE<"extrv_s.h", MipsEXTR_S_H, NoItinerary>,
+ Defs<[DSPOutFlag23]>;
+
+class MFHI_MM_DESC_BASE<string instr_asm, RegisterOperand RO, SDNode OpNode,
+ InstrItinClass itin> {
+ dag OutOperandList = (outs GPR32Opnd:$rs);
+ dag InOperandList = (ins RO:$ac);
+ string AsmString = !strconcat(instr_asm, "\t$rs, $ac");
+ list<dag> Pattern = [(set GPR32Opnd:$rs, (OpNode RO:$ac))];
+ InstrItinClass Itinerary = itin;
+}
+
+class MFHI_MM_DESC : MFHI_MM_DESC_BASE<"mfhi", ACC64DSPOpnd, MipsMFHI,
+ NoItinerary>;
+class MFLO_MM_DESC : MFHI_MM_DESC_BASE<"mflo", ACC64DSPOpnd, MipsMFLO,
+ NoItinerary>;
+
+class RADDU_W_QB_MM_DESC {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins DSPROpnd:$rs);
+ string AsmString = !strconcat("raddu.w.qb", "\t$rt, $rs");
+ list<dag> Pattern = [(set GPR32Opnd:$rt, (int_mips_raddu_w_qb DSPROpnd:$rs))];
+ InstrItinClass Itinerary = NoItinerary;
+ string BaseOpcode = "raddu.w.qb";
+}
+
+class RDDSP_MM_DESC {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins uimm16:$mask);
+ string AsmString = !strconcat("rddsp", "\t$rt, $mask");
+ list<dag> Pattern = [(set GPR32Opnd:$rt, (int_mips_rddsp immZExt10:$mask))];
+ InstrItinClass Itinerary = NoItinerary;
+}
+
+class REPL_QB_MM_DESC {
+ dag OutOperandList = (outs DSPROpnd:$rt);
+ dag InOperandList = (ins uimm16:$imm);
+ string AsmString = !strconcat("repl.qb", "\t$rt, $imm");
+ list<dag> Pattern = [(set DSPROpnd:$rt, (int_mips_repl_qb immZExt8:$imm))];
+ InstrItinClass Itinerary = NoItinerary;
+}
+
+class REPLV_PH_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<"replv.ph", int_mips_repl_ph,
+ NoItinerary, DSPROpnd,
+ GPR32Opnd>;
+class REPLV_QB_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<"replv.qb", int_mips_repl_qb,
+ NoItinerary, DSPROpnd,
+ GPR32Opnd>;
+
+class WRDSP_MM_DESC {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins GPR32Opnd:$rt, uimm7:$mask);
+ string AsmString = !strconcat("wrdsp", "\t$rt, $mask");
+ list<dag> Pattern = [(int_mips_wrdsp GPR32Opnd:$rt, immZExt7:$mask)];
+ InstrItinClass Itinerary = NoItinerary;
+}
+
+// Instruction defs.
+// microMIPS DSP Rev 1
+def ADDQ_PH_MM : DspMMRel, ADDQ_PH_MM_ENC, ADDQ_PH_DESC;
+def ADDQ_S_PH_MM : DspMMRel, ADDQ_S_PH_MM_ENC, ADDQ_S_PH_DESC;
+def ADDQ_S_W_MM : DspMMRel, ADDQ_S_W_MM_ENC, ADDQ_S_W_DESC;
+def ADDU_QB_MM : DspMMRel, ADDU_QB_MM_ENC, ADDU_QB_DESC;
+def ADDU_S_QB_MM : DspMMRel, ADDU_S_QB_MM_ENC, ADDU_S_QB_DESC;
+def ADDSC_MM : DspMMRel, ADDSC_MM_ENC, ADDSC_DESC;
+def ADDWC_MM : DspMMRel, ADDWC_MM_ENC, ADDWC_DESC;
+def DPAQ_S_W_PH_MM : DspMMRel, DPAQ_S_W_PH_MM_ENC, DPAQ_S_W_PH_DESC;
+def DPAQ_SA_L_W_MM : DspMMRel, DPAQ_SA_L_W_MM_ENC, DPAQ_SA_L_W_DESC;
+def DPAU_H_QBL_MM : DspMMRel, DPAU_H_QBL_MM_ENC, DPAU_H_QBL_DESC;
+def DPAU_H_QBR_MM : DspMMRel, DPAU_H_QBR_MM_ENC, DPAU_H_QBR_DESC;
+def ABSQ_S_PH_MM : DspMMRel, ABSQ_S_PH_MM_ENC, ABSQ_S_PH_MM_DESC;
+def ABSQ_S_W_MM : DspMMRel, ABSQ_S_W_MM_ENC, ABSQ_S_W_MM_DESC;
+def INSV_MM : DspMMRel, INSV_MM_ENC, INSV_DESC;
+def MADD_DSP_MM : DspMMRel, MADD_DSP_MM_ENC, MADD_DSP_DESC;
+def MADDU_DSP_MM : DspMMRel, MADDU_DSP_MM_ENC, MADDU_DSP_DESC;
+def MSUB_DSP_MM : DspMMRel, MSUB_DSP_MM_ENC, MSUB_DSP_DESC;
+def MSUBU_DSP_MM : DspMMRel, MSUBU_DSP_MM_ENC, MSUBU_DSP_DESC;
+def MULT_DSP_MM : DspMMRel, MULT_DSP_MM_ENC, MULT_DSP_DESC;
+def MULTU_DSP_MM : DspMMRel, MULTU_DSP_MM_ENC, MULTU_DSP_DESC;
+def SHLL_PH_MM : DspMMRel, SHLL_PH_MM_ENC, SHLL_PH_MM_DESC;
+def SHLL_S_PH_MM : DspMMRel, SHLL_S_PH_MM_ENC, SHLL_S_PH_MM_DESC;
+def SHLL_QB_MM : DspMMRel, SHLL_QB_MM_ENC, SHLL_QB_MM_DESC;
+def SHLLV_PH_MM : DspMMRel, SHLLV_PH_MM_ENC, SHLLV_PH_MM_DESC;
+def SHLLV_S_PH_MM : DspMMRel, SHLLV_S_PH_MM_ENC, SHLLV_S_PH_MM_DESC;
+def SHLLV_QB_MM : DspMMRel, SHLLV_QB_MM_ENC, SHLLV_QB_MM_DESC;
+def SHLLV_S_W_MM : DspMMRel, SHLLV_S_W_MM_ENC, SHLLV_S_W_MM_DESC;
+def SHLL_S_W_MM : DspMMRel, SHLL_S_W_MM_ENC, SHLL_S_W_MM_DESC;
+def SHRA_PH_MM : DspMMRel, SHRA_PH_MM_ENC, SHRA_PH_MM_DESC;
+def SHRA_R_PH_MM : DspMMRel, SHRA_R_PH_MM_ENC, SHRA_R_PH_MM_DESC;
+def SHRAV_PH_MM : DspMMRel, SHRAV_PH_MM_ENC, SHRAV_PH_MM_DESC;
+def SHRAV_R_PH_MM : DspMMRel, SHRAV_R_PH_MM_ENC, SHRAV_R_PH_MM_DESC;
+def SHRAV_R_W_MM : DspMMRel, SHRAV_R_W_MM_ENC, SHRAV_R_W_MM_DESC;
+def SHRA_R_W_MM : DspMMRel, SHRA_R_W_MM_ENC, SHRA_R_W_MM_DESC;
+def SHRL_QB_MM : DspMMRel, SHRL_QB_MM_ENC, SHRL_QB_MM_DESC;
+def SHRLV_QB_MM : DspMMRel, SHRLV_QB_MM_ENC, SHRLV_QB_MM_DESC;
+def PRECEQ_W_PHL_MM : DspMMRel, PRECEQ_W_PHL_MM_ENC, PRECEQ_W_PHL_MM_DESC;
+def PRECEQ_W_PHR_MM : DspMMRel, PRECEQ_W_PHR_MM_ENC, PRECEQ_W_PHR_MM_DESC;
+def PRECEQU_PH_QBL_MM : DspMMRel, PRECEQU_PH_QBL_MM_ENC, PRECEQU_PH_QBL_MM_DESC;
+def PRECEQU_PH_QBLA_MM : DspMMRel, PRECEQU_PH_QBLA_MM_ENC,
+ PRECEQU_PH_QBLA_MM_DESC;
+def PRECEQU_PH_QBR_MM : DspMMRel, PRECEQU_PH_QBR_MM_ENC, PRECEQU_PH_QBR_MM_DESC;
+def PRECEQU_PH_QBRA_MM : DspMMRel, PRECEQU_PH_QBRA_MM_ENC,
+ PRECEQU_PH_QBRA_MM_DESC;
+def PRECEU_PH_QBL_MM : DspMMRel, PRECEU_PH_QBL_MM_ENC, PRECEU_PH_QBL_MM_DESC;
+def PRECEU_PH_QBLA_MM : DspMMRel, PRECEU_PH_QBLA_MM_ENC, PRECEU_PH_QBLA_MM_DESC;
+def PRECEU_PH_QBR_MM : DspMMRel, PRECEU_PH_QBR_MM_ENC, PRECEU_PH_QBR_MM_DESC;
+def PRECEU_PH_QBRA_MM : DspMMRel, PRECEU_PH_QBRA_MM_ENC, PRECEU_PH_QBRA_MM_DESC;
+def SUBQ_PH_MM : DspMMRel, SUBQ_PH_MM_ENC, SUBQ_PH_DESC;
+def SUBQ_S_PH_MM : DspMMRel, SUBQ_S_PH_MM_ENC, SUBQ_S_PH_DESC;
+def SUBQ_S_W_MM : DspMMRel, SUBQ_S_W_MM_ENC, SUBQ_S_W_DESC;
+def SUBU_QB_MM : DspMMRel, SUBU_QB_MM_ENC, SUBU_QB_DESC;
+def SUBU_S_QB_MM : DspMMRel, SUBU_S_QB_MM_ENC, SUBU_S_QB_DESC;
+def EXTP_MM : DspMMRel, EXTP_MM_ENC, EXTP_MM_DESC;
+def EXTPDP_MM : DspMMRel, EXTPDP_MM_ENC, EXTPDP_MM_DESC;
+def EXTPDPV_MM : DspMMRel, EXTPDPV_MM_ENC, EXTPDPV_MM_DESC;
+def EXTPV_MM : DspMMRel, EXTPV_MM_ENC, EXTPV_MM_DESC;
+def EXTR_W_MM : DspMMRel, EXTR_W_MM_ENC, EXTR_W_MM_DESC;
+def EXTR_R_W_MM : DspMMRel, EXTR_R_W_MM_ENC, EXTR_R_W_MM_DESC;
+def EXTR_RS_W_MM : DspMMRel, EXTR_RS_W_MM_ENC, EXTR_RS_W_MM_DESC;
+def EXTR_S_H_MM : DspMMRel, EXTR_S_H_MM_ENC, EXTR_S_H_MM_DESC;
+def EXTRV_W_MM : DspMMRel, EXTRV_W_MM_ENC, EXTRV_W_MM_DESC;
+def EXTRV_R_W_MM : DspMMRel, EXTRV_R_W_MM_ENC, EXTRV_R_W_MM_DESC;
+def EXTRV_RS_W_MM : DspMMRel, EXTRV_RS_W_MM_ENC, EXTRV_RS_W_MM_DESC;
+def EXTRV_S_H_MM : DspMMRel, EXTRV_S_H_MM_ENC, EXTRV_S_H_MM_DESC;
+def DPSQ_S_W_PH_MM : DspMMRel, DPSQ_S_W_PH_MM_ENC, DPSQ_S_W_PH_DESC;
+def DPSQ_SA_L_W_MM : DspMMRel, DPSQ_SA_L_W_MM_ENC, DPSQ_SA_L_W_DESC;
+def DPSU_H_QBL_MM : DspMMRel, DPSU_H_QBL_MM_ENC, DPSU_H_QBL_DESC;
+def DPSU_H_QBR_MM : DspMMRel, DPSU_H_QBR_MM_ENC, DPSU_H_QBR_DESC;
+def MULEQ_S_W_PHL_MM : DspMMRel, MULEQ_S_W_PHL_MM_ENC, MULEQ_S_W_PHL_DESC;
+def MULEQ_S_W_PHR_MM : DspMMRel, MULEQ_S_W_PHR_MM_ENC, MULEQ_S_W_PHR_DESC;
+def MULEU_S_PH_QBL_MM : DspMMRel, MULEU_S_PH_QBL_MM_ENC, MULEU_S_PH_QBL_DESC;
+def MULEU_S_PH_QBR_MM : DspMMRel, MULEU_S_PH_QBR_MM_ENC, MULEU_S_PH_QBR_DESC;
+def MULQ_RS_PH_MM : DspMMRel, MULQ_RS_PH_MM_ENC, MULQ_RS_PH_DESC;
+def PRECRQ_PH_W_MM : DspMMRel, PRECRQ_PH_W_MM_ENC, PRECRQ_PH_W_DESC;
+def PRECRQ_QB_PH_MM : DspMMRel, PRECRQ_QB_PH_MM_ENC, PRECRQ_QB_PH_DESC;
+def PRECRQU_S_QB_PH_MM : DspMMRel, PRECRQU_S_QB_PH_MM_ENC, PRECRQU_S_QB_PH_DESC;
+def PRECRQ_RS_PH_W_MM : DspMMRel, PRECRQ_RS_PH_W_MM_ENC, PRECRQ_RS_PH_W_DESC;
+def LBUX_MM : DspMMRel, LBUX_MM_ENC, LBUX_DESC;
+def LHX_MM : DspMMRel, LHX_MM_ENC, LHX_DESC;
+def LWX_MM : DspMMRel, LWX_MM_ENC, LWX_DESC;
+def MAQ_S_W_PHL_MM : DspMMRel, MAQ_S_W_PHL_MM_ENC, MAQ_S_W_PHL_DESC;
+def MAQ_SA_W_PHL_MM : DspMMRel, MAQ_SA_W_PHL_MM_ENC, MAQ_SA_W_PHL_DESC;
+def MAQ_S_W_PHR_MM : DspMMRel, MAQ_S_W_PHR_MM_ENC, MAQ_S_W_PHR_DESC;
+def MAQ_SA_W_PHR_MM : DspMMRel, MAQ_SA_W_PHR_MM_ENC, MAQ_SA_W_PHR_DESC;
+def MFHI_DSP_MM : DspMMRel, MFHI_MM_ENC, MFHI_MM_DESC;
+def MFLO_DSP_MM : DspMMRel, MFLO_MM_ENC, MFLO_MM_DESC;
+def MTHI_DSP_MM : DspMMRel, MTHI_MM_ENC, MTHI_DESC;
+def MTLO_DSP_MM : DspMMRel, MTLO_MM_ENC, MTLO_DESC;
+def RADDU_W_QB_MM : DspMMRel, RADDU_W_QB_MM_ENC, RADDU_W_QB_MM_DESC;
+def RDDSP_MM : DspMMRel, RDDSP_MM_ENC, RDDSP_MM_DESC;
+def REPL_PH_MM : DspMMRel, REPL_PH_MM_ENC, REPL_PH_DESC;
+def REPL_QB_MM : DspMMRel, REPL_QB_MM_ENC, REPL_QB_MM_DESC;
+def REPLV_PH_MM : DspMMRel, REPLV_PH_MM_ENC, REPLV_PH_MM_DESC;
+def REPLV_QB_MM : DspMMRel, REPLV_QB_MM_ENC, REPLV_QB_MM_DESC;
+def MTHLIP_MM : DspMMRel, MTHLIP_MM_ENC, MTHLIP_DESC;
+def PACKRL_PH_MM : DspMMRel, PACKRL_PH_MM_ENC, PACKRL_PH_DESC;
+def PICK_PH_MM : DspMMRel, PICK_PH_MM_ENC, PICK_PH_DESC;
+def PICK_QB_MM : DspMMRel, PICK_QB_MM_ENC, PICK_QB_DESC;
+def SHILO_MM : DspMMRel, SHILO_MM_ENC, SHILO_DESC;
+def SHILOV_MM : DspMMRel, SHILOV_MM_ENC, SHILOV_DESC;
+def WRDSP_MM : DspMMRel, WRDSP_MM_ENC, WRDSP_MM_DESC;
+// microMIPS DSP Rev 2
+def ABSQ_S_QB_MMR2 : DspMMRel, ABSQ_S_QB_MMR2_ENC, ABSQ_S_QB_MMR2_DESC,
+ ISA_DSPR2;
+def ADDQH_PH_MMR2 : DspMMRel, ADDQH_PH_MMR2_ENC, ADDQH_PH_DESC, ISA_DSPR2;
+def ADDQH_R_PH_MMR2 : DspMMRel, ADDQH_R_PH_MMR2_ENC, ADDQH_R_PH_DESC, ISA_DSPR2;
+def ADDQH_W_MMR2 : DspMMRel, ADDQH_W_MMR2_ENC, ADDQH_W_DESC, ISA_DSPR2;
+def ADDQH_R_W_MMR2 : DspMMRel, ADDQH_R_W_MMR2_ENC, ADDQH_R_W_DESC, ISA_DSPR2;
+def ADDU_PH_MMR2 : DspMMRel, ADDU_PH_MMR2_ENC, ADDU_PH_DESC, ISA_DSPR2;
+def ADDU_S_PH_MMR2 : DspMMRel, ADDU_S_PH_MMR2_ENC, ADDU_S_PH_DESC, ISA_DSPR2;
+def ADDUH_QB_MMR2 : DspMMRel, ADDUH_QB_MMR2_ENC, ADDUH_QB_DESC, ISA_DSPR2;
+def ADDUH_R_QB_MMR2 : DspMMRel, ADDUH_R_QB_MMR2_ENC, ADDUH_R_QB_DESC, ISA_DSPR2;
+def DPA_W_PH_MMR2 : DspMMRel, DPA_W_PH_MMR2_ENC, DPA_W_PH_DESC, ISA_DSPR2;
+def DPAQX_S_W_PH_MMR2 : DspMMRel, DPAQX_S_W_PH_MMR2_ENC, DPAQX_S_W_PH_DESC,
+ ISA_DSPR2;
+def DPAQX_SA_W_PH_MMR2 : DspMMRel, DPAQX_SA_W_PH_MMR2_ENC, DPAQX_SA_W_PH_DESC,
+ ISA_DSPR2;
+def DPAX_W_PH_MMR2 : DspMMRel, DPAX_W_PH_MMR2_ENC, DPAX_W_PH_DESC, ISA_DSPR2;
+def SHRA_QB_MMR2 : DspMMRel, SHRA_QB_MMR2_ENC, SHRA_QB_MMR2_DESC, ISA_DSPR2;
+def SHRA_R_QB_MMR2 : DspMMRel, SHRA_R_QB_MMR2_ENC, SHRA_R_QB_MMR2_DESC,
+ ISA_DSPR2;
+def SHRAV_QB_MMR2 : DspMMRel, SHRAV_QB_MMR2_ENC, SHRAV_QB_MMR2_DESC, ISA_DSPR2;
+def SHRAV_R_QB_MMR2 : DspMMRel, SHRAV_R_QB_MMR2_ENC, SHRAV_R_QB_MMR2_DESC,
+ ISA_DSPR2;
+def SHRL_PH_MMR2 : DspMMRel, SHRL_PH_MMR2_ENC, SHRL_PH_MMR2_DESC, ISA_DSPR2;
+def SHRLV_PH_MMR2 : DspMMRel, SHRLV_PH_MMR2_ENC, SHRLV_PH_MMR2_DESC, ISA_DSPR2;
+def SUBQH_PH_MMR2 : DspMMRel, SUBQH_PH_MMR2_ENC, SUBQH_PH_DESC, ISA_DSPR2;
+def SUBQH_R_PH_MMR2 : DspMMRel, SUBQH_R_PH_MMR2_ENC, SUBQH_R_PH_DESC, ISA_DSPR2;
+def SUBQH_W_MMR2 : DspMMRel, SUBQH_W_MMR2_ENC, SUBQH_W_DESC, ISA_DSPR2;
+def SUBQH_R_W_MMR2 : DspMMRel, SUBQH_R_W_MMR2_ENC, SUBQH_R_W_DESC, ISA_DSPR2;
+def SUBU_PH_MMR2 : DspMMRel, SUBU_PH_MMR2_ENC, SUBU_PH_DESC, ISA_DSPR2;
+def SUBU_S_PH_MMR2 : DspMMRel, SUBU_S_PH_MMR2_ENC, SUBU_S_PH_DESC, ISA_DSPR2;
+def SUBUH_QB_MMR2 : DspMMRel, SUBUH_QB_MMR2_ENC, SUBUH_QB_DESC, ISA_DSPR2;
+def SUBUH_R_QB_MMR2 : DspMMRel, SUBUH_R_QB_MMR2_ENC, SUBUH_R_QB_DESC, ISA_DSPR2;
+def DPS_W_PH_MMR2 : DspMMRel, DPS_W_PH_MMR2_ENC, DPS_W_PH_DESC, ISA_DSPR2;
+def DPSQX_S_W_PH_MMR2 : DspMMRel, DPSQX_S_W_PH_MMR2_ENC, DPSQX_S_W_PH_DESC,
+ ISA_DSPR2;
+def DPSQX_SA_W_PH_MMR2 : DspMMRel, DPSQX_SA_W_PH_MMR2_ENC, DPSQX_SA_W_PH_DESC,
+ ISA_DSPR2;
+def DPSX_W_PH_MMR2 : DspMMRel, DPSX_W_PH_MMR2_ENC, DPSX_W_PH_DESC, ISA_DSPR2;
+def MUL_PH_MMR2 : DspMMRel, MUL_PH_MMR2_ENC, MUL_PH_DESC, ISA_DSPR2;
+def MUL_S_PH_MMR2 : DspMMRel, MUL_S_PH_MMR2_ENC, MUL_S_PH_DESC, ISA_DSPR2;
+def MULQ_RS_W_MMR2 : DspMMRel, MULQ_RS_W_MMR2_ENC, MULQ_RS_W_DESC, ISA_DSPR2;
+def MULQ_S_PH_MMR2 : DspMMRel, MULQ_S_PH_MMR2_ENC, MULQ_S_PH_DESC, ISA_DSPR2;
+def MULQ_S_W_MMR2 : DspMMRel, MULQ_S_W_MMR2_ENC, MULQ_S_W_DESC, ISA_DSPR2;
+def PRECR_QB_PH_MMR2 : DspMMRel, PRECR_QB_PH_MMR2_ENC, PRECR_QB_PH_DESC,
+ ISA_DSPR2;
+def PRECR_SRA_PH_W_MMR2 : DspMMRel, PRECR_SRA_PH_W_MMR2_ENC,
+ PRECR_SRA_PH_W_DESC, ISA_DSPR2;
+def PRECR_SRA_R_PH_W_MMR2 : DspMMRel, PRECR_SRA_R_PH_W_MMR2_ENC,
+ PRECR_SRA_R_PH_W_DESC, ISA_DSPR2;
+def PREPEND_MMR2 : DspMMRel, PREPEND_MMR2_ENC, PREPEND_DESC, ISA_DSPR2;
+
+// Instruction alias.
+def : MMDSPInstAlias<"wrdsp $rt", (WRDSP_MM GPR32Opnd:$rt, 0x1F), 1>;
diff --git a/lib/Target/Mips/MicroMipsInstrFPU.td b/lib/Target/Mips/MicroMipsInstrFPU.td
index 004b0d51f4b4..756e6c92c1d1 100644
--- a/lib/Target/Mips/MicroMipsInstrFPU.td
+++ b/lib/Target/Mips/MicroMipsInstrFPU.td
@@ -37,23 +37,14 @@ def FCMP_S32_MM : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>,
def FCMP_D32_MM : MMRel, CEQS_FT<"d", AFGR64, II_C_CC_D, MipsFPCmp>,
CEQS_FM_MM<1>;
-def BC1F_MM : MMRel, BC1F_FT<"bc1f", brtarget_mm, IIBranch, MIPS_BRANCH_F>,
+def BC1F_MM : MMRel, BC1F_FT<"bc1f", brtarget_mm, II_BC1F, MIPS_BRANCH_F>,
BC1F_FM_MM<0x1c>, ISA_MIPS1_NOT_32R6_64R6;
-def BC1T_MM : MMRel, BC1F_FT<"bc1t", brtarget_mm, IIBranch, MIPS_BRANCH_T>,
+def BC1T_MM : MMRel, BC1F_FT<"bc1t", brtarget_mm, II_BC1T, MIPS_BRANCH_T>,
BC1F_FM_MM<0x1d>, ISA_MIPS1_NOT_32R6_64R6;
-
-def CEIL_W_S_MM : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>,
- ROUND_W_FM_MM<0, 0x6c>;
def CVT_W_S_MM : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>,
ROUND_W_FM_MM<0, 0x24>;
-def FLOOR_W_S_MM : MMRel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>,
- ROUND_W_FM_MM<0, 0x2c>;
-def ROUND_W_S_MM : MMRel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>,
+def ROUND_W_S_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd,
+                                              FGR32Opnd, II_ROUND>,
ROUND_W_FM_MM<0, 0xec>;
-def TRUNC_W_S_MM : MMRel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>,
- ROUND_W_FM_MM<0, 0xac>;
-def FSQRT_S_MM : MMRel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S,
- fsqrt>, ROUND_W_FM_MM<0, 0x28>;
def CEIL_W_MM : MMRel, ABSS_FT<"ceil.w.d", FGR32Opnd, AFGR64Opnd, II_CEIL>,
ROUND_W_FM_MM<1, 0x6c>;
@@ -61,7 +52,7 @@ def CVT_W_MM : MMRel, ABSS_FT<"cvt.w.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
ROUND_W_FM_MM<1, 0x24>;
def FLOOR_W_MM : MMRel, ABSS_FT<"floor.w.d", FGR32Opnd, AFGR64Opnd, II_FLOOR>,
ROUND_W_FM_MM<1, 0x2c>;
-def ROUND_W_MM : MMRel, ABSS_FT<"round.w.d", FGR32Opnd, AFGR64Opnd, II_ROUND>,
+def ROUND_W_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.d", FGR32Opnd, AFGR64Opnd,
+                                            II_ROUND>,
ROUND_W_FM_MM<1, 0xec>;
def TRUNC_W_MM : MMRel, ABSS_FT<"trunc.w.d", FGR32Opnd, AFGR64Opnd, II_TRUNC>,
ROUND_W_FM_MM<1, 0xac>;
@@ -146,3 +137,14 @@ def NMADD_D32_MM : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D, fadd>,
def NMSUB_D32_MM : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D, fsub>,
MADDS_FM_MM<0x2a>;
}
+
+let AdditionalPredicates = [InMicroMips] in {
+ def FLOOR_W_S_MM : MMRel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd,
+ II_FLOOR>, ROUND_W_FM_MM<0, 0x2c>;
+ def TRUNC_W_S_MM : MMRel, StdMMR6Rel, ABSS_FT<"trunc.w.s", FGR32Opnd,
+ FGR32Opnd, II_TRUNC>, ROUND_W_FM_MM<0, 0xac>;
+ def CEIL_W_S_MM : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>,
+ ROUND_W_FM_MM<0, 0x6c>;
+ def FSQRT_S_MM : MMRel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S,
+ fsqrt>, ROUND_W_FM_MM<0, 0x28>;
+}
diff --git a/lib/Target/Mips/MicroMipsInstrFormats.td b/lib/Target/Mips/MicroMipsInstrFormats.td
index 560afa48908c..b736367ee5fa 100644
--- a/lib/Target/Mips/MicroMipsInstrFormats.td
+++ b/lib/Target/Mips/MicroMipsInstrFormats.td
@@ -389,6 +389,22 @@ class LW_FM_MM<bits<6> op> : MMArch {
let Inst{15-0} = addr{15-0};
}
+class POOL32C_LHUE_FM_MM<bits<6> op, bits<4> fmt, bits<3> funct> : MMArch {
+ bits<5> rt;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<9> offset = addr{8-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-12} = fmt;
+ let Inst{11-9} = funct;
+ let Inst{8-0} = offset;
+}
+
class LWL_FM_MM<bits<4> funct> {
bits<5> rt;
bits<21> addr;
@@ -402,6 +418,22 @@ class LWL_FM_MM<bits<4> funct> {
let Inst{11-0} = addr{11-0};
}
+class POOL32C_STEVA_LDEVA_FM_MM<bits<4> type, bits<3> funct> {
+ bits<5> rt;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<9> offset = addr{8-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x18;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-12} = type;
+ let Inst{11-9} = funct;
+ let Inst{8-0} = offset;
+}
+
class CMov_F_I_FM_MM<bits<7> func> : MMArch {
bits<5> rd;
bits<5> rs;
@@ -655,6 +687,22 @@ class LL_FM_MM<bits<4> funct> {
let Inst{11-0} = addr{11-0};
}
+class LLE_FM_MM<bits<4> funct> {
+ bits<5> rt;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<9> offset = addr{8-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x18;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = base;
+ let Inst{15-12} = funct;
+ let Inst{11-9} = 0x6;
+ let Inst{8-0} = offset;
+}
+
class ADDS_FM_MM<bits<2> fmt, bits<8> funct> : MMArch {
bits<5> ft;
bits<5> fs;
@@ -895,7 +943,7 @@ class LWM_FM_MM<bits<4> funct> : MMArch {
let Inst{11-0} = addr{11-0};
}
-class LWM_FM_MM16<bits<4> funct> : MMArch {
+class LWM_FM_MM16<bits<4> funct> : MMArch, PredicateControl {
bits<2> rt;
bits<4> addr;
@@ -922,6 +970,37 @@ class CACHE_PREF_FM_MM<bits<6> op, bits<4> funct> : MMArch {
let Inst{11-0} = offset;
}
+class CACHE_PREFE_FM_MM<bits<6> op, bits<3> funct> : MMArch {
+ bits<21> addr;
+ bits<5> hint;
+ bits<5> base = addr{20-16};
+ bits<9> offset = addr{8-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = hint;
+ let Inst{20-16} = base;
+ let Inst{15-12} = 0xA;
+ let Inst{11-9} = funct;
+ let Inst{8-0} = offset;
+}
+
+class POOL32F_PREFX_FM_MM<bits<6> op, bits<9> funct> : MMArch {
+ bits<5> index;
+ bits<5> base;
+ bits<5> hint;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = index;
+ let Inst{20-16} = base;
+ let Inst{15-11} = hint;
+ let Inst{10-9} = 0x0;
+ let Inst{8-0} = funct;
+}
+
class BARRIER_FM_MM<bits<5> op> : MMArch {
bits<32> Inst;
diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td
index 39393840c6f2..99f0f446deab 100644
--- a/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -13,11 +13,6 @@ def simm12 : Operand<i32> {
let DecoderMethod = "DecodeSimm12";
}
-def uimm5_lsl2 : Operand<OtherVT> {
- let EncoderMethod = "getUImm5Lsl2Encoding";
- let DecoderMethod = "DecodeUImm5lsl2";
-}
-
def uimm6_lsl2 : Operand<i32> {
let EncoderMethod = "getUImm6Lsl2Encoding";
let DecoderMethod = "DecodeUImm6Lsl2";
@@ -30,6 +25,7 @@ def simm9_addiusp : Operand<i32> {
def uimm3_shift : Operand<i32> {
let EncoderMethod = "getUImm3Mod8Encoding";
+ let DecoderMethod = "DecodePOOL16BEncodedField";
}
def simm3_lsa2 : Operand<i32> {
@@ -105,6 +101,14 @@ def mem_mm_gp_imm7_lsl2 : Operand<i32> {
let EncoderMethod = "getMemEncodingMMGPImm7Lsl2";
}
+def mem_mm_9 : Operand<i32> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops GPR32, simm9);
+ let EncoderMethod = "getMemEncodingMMImm9";
+ let ParserMatchClass = MipsMemAsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
def mem_mm_12 : Operand<i32> {
let PrintMethod = "printMemOperand";
let MIOperandInfo = (ops GPR32, simm12);
@@ -113,6 +117,14 @@ def mem_mm_12 : Operand<i32> {
let OperandType = "OPERAND_MEMORY";
}
+def mem_mm_16 : Operand<i32> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops GPR32, simm16);
+ let EncoderMethod = "getMemEncodingMMImm16";
+ let ParserMatchClass = MipsMemAsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
def MipsMemUimm4AsmOperand : AsmOperandClass {
let Name = "MemOffsetUimm4";
let SuperClasses = [MipsMemAsmOperand];
@@ -166,7 +178,7 @@ def simm23_lsl2 : Operand<i32> {
class CompactBranchMM<string opstr, DAGOperand opnd, PatFrag cond_op,
RegisterOperand RO> :
InstSE<(outs), (ins RO:$rs, opnd:$offset),
- !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI> {
+ !strconcat(opstr, "\t$rs, $offset"), [], II_BCCZC, FrmI> {
let isBranch = 1;
let isTerminator = 1;
let hasDelaySlot = 0;
@@ -251,6 +263,13 @@ class LLBaseMM<string opstr, RegisterOperand RO> :
let mayLoad = 1;
}
+class LLEBaseMM<string opstr, RegisterOperand RO> :
+ InstSE<(outs RO:$rt), (ins mem_mm_12:$addr),
+ !strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> {
+ let DecoderMethod = "DecodeMemMMImm9";
+ let mayLoad = 1;
+}
+
class SCBaseMM<string opstr, RegisterOperand RO> :
InstSE<(outs RO:$dst), (ins RO:$rt, mem_mm_12:$addr),
!strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> {
@@ -259,6 +278,14 @@ class SCBaseMM<string opstr, RegisterOperand RO> :
let Constraints = "$rt = $dst";
}
+class SCEBaseMM<string opstr, RegisterOperand RO> :
+ InstSE<(outs RO:$dst), (ins RO:$rt, mem_mm_12:$addr),
+ !strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> {
+ let DecoderMethod = "DecodeMemMMImm9";
+ let mayStore = 1;
+ let Constraints = "$rt = $dst";
+}
+
class LoadMM<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag,
InstrItinClass Itin = NoItinerary> :
InstSE<(outs RO:$rt), (ins mem_mm_12:$addr),
@@ -392,7 +419,7 @@ class LoadImmMM16<string opstr, Operand Od, RegisterOperand RO> :
// 16-bit Jump and Link (Call)
class JumpLinkRegMM16<string opstr, RegisterOperand RO> :
MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
- [(MipsJmpLink RO:$rs)], IIBranch, FrmR> {
+ [(MipsJmpLink RO:$rs)], II_JALR, FrmR>, PredicateControl {
let isCall = 1;
let hasDelaySlot = 1;
let Defs = [RA];
@@ -401,7 +428,7 @@ class JumpLinkRegMM16<string opstr, RegisterOperand RO> :
// 16-bit Jump Reg
class JumpRegMM16<string opstr, RegisterOperand RO> :
MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
- [], IIBranch, FrmR> {
+ [], II_JR, FrmR> {
let hasDelaySlot = 1;
let isBranch = 1;
let isIndirectBranch = 1;
@@ -410,7 +437,7 @@ class JumpRegMM16<string opstr, RegisterOperand RO> :
// Base class for JRADDIUSP instruction.
class JumpRAddiuStackMM16 :
MicroMipsInst16<(outs), (ins uimm5_lsl2:$imm), "jraddiusp\t$imm",
- [], IIBranch, FrmR> {
+ [], II_JRADDIUSP, FrmR> {
let isTerminator = 1;
let isBarrier = 1;
let isBranch = 1;
@@ -420,7 +447,7 @@ class JumpRAddiuStackMM16 :
// 16-bit Jump and Link (Call) - Short Delay Slot
class JumpLinkRegSMM16<string opstr, RegisterOperand RO> :
MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
- [], IIBranch, FrmR> {
+ [], II_JALRS, FrmR> {
let isCall = 1;
let hasDelaySlot = 1;
let Defs = [RA];
@@ -429,7 +456,7 @@ class JumpLinkRegSMM16<string opstr, RegisterOperand RO> :
// 16-bit Jump Register Compact - No delay slot
class JumpRegCMM16<string opstr, RegisterOperand RO> :
MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
- [], IIBranch, FrmR> {
+ [], II_JRC, FrmR> {
let isTerminator = 1;
let isBarrier = 1;
let isBranch = 1;
@@ -444,7 +471,7 @@ class BrkSdbbp16MM<string opstr> :
class CBranchZeroMM<string opstr, DAGOperand opnd, RegisterOperand RO> :
MicroMipsInst16<(outs), (ins RO:$rs, opnd:$offset),
- !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI> {
+ !strconcat(opstr, "\t$rs, $offset"), [], II_BCCZ, FrmI> {
let isBranch = 1;
let isTerminator = 1;
let hasDelaySlot = 1;
@@ -455,18 +482,18 @@ class CBranchZeroMM<string opstr, DAGOperand opnd, RegisterOperand RO> :
let isCall = 1, hasDelaySlot = 1, Defs = [RA] in {
class JumpLinkMM<string opstr, DAGOperand opnd> :
InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"),
- [], IIBranch, FrmJ, opstr> {
+ [], II_JALS, FrmJ, opstr> {
let DecoderMethod = "DecodeJumpTargetMM";
}
class JumpLinkRegMM<string opstr, RegisterOperand RO>:
InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
- [], IIBranch, FrmR>;
+ [], II_JALRS, FrmR>;
class BranchCompareToZeroLinkMM<string opstr, DAGOperand opnd,
RegisterOperand RO> :
InstSE<(outs), (ins RO:$rs, opnd:$offset),
- !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI, opstr>;
+ !strconcat(opstr, "\t$rs, $offset"), [], II_BCCZALS, FrmI, opstr>;
}
class LoadWordIndexedScaledMM<string opstr, RegisterOperand RO,
@@ -475,6 +502,10 @@ class LoadWordIndexedScaledMM<string opstr, RegisterOperand RO,
InstSE<(outs RO:$rd), (ins PtrRC:$base, PtrRC:$index),
!strconcat(opstr, "\t$rd, ${index}(${base})"), [], Itin, FrmFI>;
+class PrefetchIndexed<string opstr> :
+ InstSE<(outs), (ins PtrRC:$base, PtrRC:$index, uimm5:$hint),
+ !strconcat(opstr, "\t$hint, ${index}(${base})"), [], NoItinerary, FrmOther>;
+
class AddImmUPC<string opstr, RegisterOperand RO> :
InstSE<(outs RO:$rs), (ins simm23_lsl2:$imm),
!strconcat(opstr, "\t$rs, $imm"), [], NoItinerary, FrmR>;
@@ -543,7 +574,7 @@ class LoadMultMM16<string opstr,
class UncondBranchMM16<string opstr> :
MicroMipsInst16<(outs), (ins brtarget10_mm:$offset),
!strconcat(opstr, "\t$offset"),
- [], IIBranch, FrmI> {
+ [], II_B, FrmI> {
let isBranch = 1;
let isTerminator = 1;
let isBarrier = 1;
@@ -553,21 +584,24 @@ class UncondBranchMM16<string opstr> :
}
def ADDU16_MM : ArithRMM16<"addu16", GPRMM16Opnd, 1, II_ADDU, add>,
- ARITH_FM_MM16<0>;
-def SUBU16_MM : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>,
- ARITH_FM_MM16<1>;
-def ANDI16_MM : AndImmMM16<"andi16", GPRMM16Opnd, II_AND>, ANDI_FM_MM16<0x0b>;
+ ARITH_FM_MM16<0>, ISA_MICROMIPS_NOT_32R6_64R6;
def AND16_MM : LogicRMM16<"and16", GPRMM16Opnd, II_AND, and>,
- LOGIC_FM_MM16<0x2>;
-def OR16_MM : LogicRMM16<"or16", GPRMM16Opnd, II_OR, or>,
- LOGIC_FM_MM16<0x3>;
-def XOR16_MM : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR, xor>,
- LOGIC_FM_MM16<0x1>;
-def NOT16_MM : NotMM16<"not16", GPRMM16Opnd>, LOGIC_FM_MM16<0x0>;
+ LOGIC_FM_MM16<0x2>, ISA_MICROMIPS_NOT_32R6_64R6;
+def ANDI16_MM : AndImmMM16<"andi16", GPRMM16Opnd, II_AND>, ANDI_FM_MM16<0x0b>,
+ ISA_MICROMIPS_NOT_32R6_64R6;
+def NOT16_MM : NotMM16<"not16", GPRMM16Opnd>, LOGIC_FM_MM16<0x0>,
+ ISA_MICROMIPS_NOT_32R6_64R6;
+def OR16_MM : LogicRMM16<"or16", GPRMM16Opnd, II_OR, or>, LOGIC_FM_MM16<0x3>,
+ ISA_MICROMIPS_NOT_32R6_64R6;
def SLL16_MM : ShiftIMM16<"sll16", uimm3_shift, GPRMM16Opnd, II_SLL>,
- SHIFT_FM_MM16<0>;
+ SHIFT_FM_MM16<0>, ISA_MICROMIPS_NOT_32R6_64R6;
def SRL16_MM : ShiftIMM16<"srl16", uimm3_shift, GPRMM16Opnd, II_SRL>,
- SHIFT_FM_MM16<1>;
+ SHIFT_FM_MM16<1>, ISA_MICROMIPS_NOT_32R6_64R6;
+
+def SUBU16_MM : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>,
+ ARITH_FM_MM16<1>, ISA_MICROMIPS_NOT_32R6_64R6;
+def XOR16_MM : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR, xor>,
+ LOGIC_FM_MM16<0x1>, ISA_MICROMIPS_NOT_32R6_64R6;
def LBU16_MM : LoadMM16<"lbu16", GPRMM16Opnd, zextloadi8, II_LBU,
mem_mm_4>, LOAD_STORE_FM_MM16<0x02>;
def LHU16_MM : LoadMM16<"lhu16", GPRMM16Opnd, zextloadi16, II_LHU,
@@ -597,7 +631,8 @@ def MOVE16_MM : MoveMM16<"move", GPR32Opnd>, MOVE_FM_MM16<0x03>;
def MOVEP_MM : MovePMM16<"movep", GPRMM16OpndMoveP>, MOVEP_FM_MM16;
def LI16_MM : LoadImmMM16<"li16", li_simm7, GPRMM16Opnd>, LI_FM_MM16,
IsAsCheapAsAMove;
-def JALR16_MM : JumpLinkRegMM16<"jalr", GPR32Opnd>, JALR_FM_MM16<0x0e>;
+def JALR16_MM : JumpLinkRegMM16<"jalr", GPR32Opnd>, JALR_FM_MM16<0x0e>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def JALRS16_MM : JumpLinkRegSMM16<"jalrs16", GPR32Opnd>, JALR_FM_MM16<0x0f>;
def JRC16_MM : JumpRegCMM16<"jrc", GPR32Opnd>, JALR_FM_MM16<0x0d>;
def JRADDIUSP : JumpRAddiuStackMM16, JRADDIUSP_FM_MM16<0x18>;
@@ -607,8 +642,18 @@ def BEQZ16_MM : CBranchZeroMM<"beqz16", brtarget7_mm, GPRMM16Opnd>,
def BNEZ16_MM : CBranchZeroMM<"bnez16", brtarget7_mm, GPRMM16Opnd>,
BEQNEZ_FM_MM16<0x2b>;
def B16_MM : UncondBranchMM16<"b16">, B16_FM;
-def BREAK16_MM : BrkSdbbp16MM<"break16">, BRKSDBBP16_FM_MM<0x28>;
-def SDBBP16_MM : BrkSdbbp16MM<"sdbbp16">, BRKSDBBP16_FM_MM<0x2C>;
+def BREAK16_MM : BrkSdbbp16MM<"break16">, BRKSDBBP16_FM_MM<0x28>,
+ ISA_MICROMIPS_NOT_32R6_64R6;
+def SDBBP16_MM : BrkSdbbp16MM<"sdbbp16">, BRKSDBBP16_FM_MM<0x2C>,
+ ISA_MICROMIPS_NOT_32R6_64R6;
+
+let DecoderNamespace = "MicroMips" in {
+ /// Load and Store Instructions - multiple
+ def SWM16_MM : StoreMultMM16<"swm16">, LWM_FM_MM16<0x5>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+ def LWM16_MM : LoadMultMM16<"lwm16">, LWM_FM_MM16<0x4>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+}
class WaitMM<string opstr> :
InstSE<(outs), (ins uimm10:$code_), !strconcat(opstr, "\t$code_"), [],
@@ -701,6 +746,18 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
def SW_MM : Store<"sw", GPR32Opnd>, MMRel, LW_FM_MM<0x3e>;
}
+ let DecoderMethod = "DecodeMemMMImm9" in {
+ def LBE_MM : Load<"lbe", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x4>;
+ def LBuE_MM : Load<"lbue", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x0>;
+ def LHE_MM : Load<"lhe", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x5>;
+ def LHuE_MM : Load<"lhue", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x1>;
+ def LWE_MM : Load<"lwe", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x7>;
+ def SBE_MM : Store<"sbe", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0xa, 0x4>;
+ def SHE_MM : Store<"she", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0xa, 0x5>;
+ def SWE_MM : StoreMemory<"swe", GPR32Opnd, mem_simm9gpr>,
+ POOL32C_LHUE_FM_MM<0x18, 0xa, 0x7>;
+ }
+
def LWXS_MM : LoadWordIndexedScaledMM<"lwxs", GPR32Opnd>, LWXS_FM_MM<0x118>;
def LWU_MM : LoadMM<"lwu", GPR32Opnd, zextloadi32, II_LWU>, LL_FM_MM<0xe>;
@@ -714,12 +771,20 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
LWL_FM_MM<0x8>;
def SWR_MM : StoreLeftRightMM<"swr", MipsSWR, GPR32Opnd, mem_mm_12>,
LWL_FM_MM<0x9>;
+ let DecoderMethod = "DecodeMemMMImm9" in {
+ def LWLE_MM : LoadLeftRightMM<"lwle", MipsLWL, GPR32Opnd, mem_mm_12>,
+ POOL32C_STEVA_LDEVA_FM_MM<0x6, 0x2>;
+ def LWRE_MM : LoadLeftRightMM<"lwre", MipsLWR, GPR32Opnd, mem_mm_12>,
+ POOL32C_STEVA_LDEVA_FM_MM<0x6, 0x3>;
+ def SWLE_MM : StoreLeftRightMM<"swle", MipsSWL, GPR32Opnd, mem_mm_12>,
+ POOL32C_STEVA_LDEVA_FM_MM<0xa, 0x0>;
+ def SWRE_MM : StoreLeftRightMM<"swre", MipsSWR, GPR32Opnd, mem_mm_12>,
+ POOL32C_STEVA_LDEVA_FM_MM<0xa, 0x1>, ISA_MIPS1_NOT_32R6_64R6;
+ }
/// Load and Store Instructions - multiple
def SWM32_MM : StoreMultMM<"swm32">, LWM_FM_MM<0xd>;
def LWM32_MM : LoadMultMM<"lwm32">, LWM_FM_MM<0x5>;
- def SWM16_MM : StoreMultMM16<"swm16">, LWM_FM_MM16<0x5>;
- def LWM16_MM : LoadMultMM16<"lwm16">, LWM_FM_MM16<0x4>;
/// Load and Store Pair Instructions
def SWP_MM : StorePairMM<"swp">, LWM_FM_MM<0x9>;
@@ -777,11 +842,11 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
SEB_FM_MM<0x0ec>, ISA_MIPS32R2;
/// Word Swap Bytes Within Halfwords
- def WSBH_MM : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM_MM<0x1ec>,
- ISA_MIPS32R2;
-
- def EXT_MM : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, MipsExt>,
- EXT_FM_MM<0x2c>;
+ def WSBH_MM : MMRel, SubwordSwap<"wsbh", GPR32Opnd, II_WSBH>,
+ SEB_FM_MM<0x1ec>, ISA_MIPS32R2;
+ // TODO: Add '0 < pos+size <= 32' constraint check to ext instruction
+ def EXT_MM : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, uimm5_plus1,
+ MipsExt>, EXT_FM_MM<0x2c>;
def INS_MM : MMRel, InsBase<"ins", GPR32Opnd, uimm5, MipsIns>,
EXT_FM_MM<0x0c>;
@@ -854,12 +919,22 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
def LL_MM : LLBaseMM<"ll", GPR32Opnd>, LL_FM_MM<0x3>;
def SC_MM : SCBaseMM<"sc", GPR32Opnd>, LL_FM_MM<0xb>;
+ def LLE_MM : LLEBaseMM<"lle", GPR32Opnd>, LLE_FM_MM<0x6>;
+ def SCE_MM : SCEBaseMM<"sce", GPR32Opnd>, LLE_FM_MM<0xA>;
+
let DecoderMethod = "DecodeCacheOpMM" in {
def CACHE_MM : MMRel, CacheOp<"cache", mem_mm_12>,
CACHE_PREF_FM_MM<0x08, 0x6>;
def PREF_MM : MMRel, CacheOp<"pref", mem_mm_12>,
CACHE_PREF_FM_MM<0x18, 0x2>;
}
+
+ let DecoderMethod = "DecodePrefeOpMM" in {
+ def PREFE_MM : MMRel, CacheOp<"prefe", mem_mm_9>,
+ CACHE_PREFE_FM_MM<0x18, 0x2>;
+ def CACHEE_MM : MMRel, CacheOp<"cachee", mem_mm_9>,
+ CACHE_PREFE_FM_MM<0x18, 0x3>;
+ }
def SSNOP_MM : MMRel, Barrier<"ssnop">, BARRIER_FM_MM<0x1>;
def EHB_MM : MMRel, Barrier<"ehb">, BARRIER_FM_MM<0x3>;
def PAUSE_MM : MMRel, Barrier<"pause">, BARRIER_FM_MM<0x5>;
@@ -870,7 +945,13 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
def TLBWR_MM : MMRel, TLB<"tlbwr">, COP0_TLB_FM_MM<0xcd>;
def SDBBP_MM : MMRel, SYS_FT<"sdbbp">, SDBBP_FM_MM;
- def RDHWR_MM : MMRel, ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM_MM;
+
+ def PREFX_MM : PrefetchIndexed<"prefx">, POOL32F_PREFX_FM_MM<0x15, 0x1A0>;
+}
+
+let DecoderNamespace = "MicroMips" in {
+ def RDHWR_MM : MMRel, R6MMR6Rel, ReadHardware<GPR32Opnd, HWRegsOpnd>,
+ RDHWR_FM_MM, ISA_MICROMIPS32_NOT_MIPS32R6;
}
let Predicates = [InMicroMips] in {
@@ -928,7 +1009,7 @@ class UncondBranchMMPseudo<string opstr> :
MipsAsmPseudoInst<(outs), (ins brtarget_mm:$offset),
!strconcat(opstr, "\t$offset")>;
- def B_MM_Pseudo : UncondBranchMMPseudo<"b">;
+def B_MM_Pseudo : UncondBranchMMPseudo<"b">, ISA_MICROMIPS;
def : MipsInstAlias<"wait", (WAIT_MM 0x0), 1>;
def : MipsInstAlias<"nop", (SLL_MM ZERO, ZERO, 0), 1>;
@@ -937,4 +1018,17 @@ class UncondBranchMMPseudo<string opstr> :
let Predicates = [InMicroMips] in {
def : MipsInstAlias<"ei", (EI_MM ZERO), 1>, ISA_MIPS32R2;
+def : MipsInstAlias<"di", (DI_MM ZERO), 1>, ISA_MIPS32R2;
+def : MipsInstAlias<"teq $rs, $rt",
+ (TEQ_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def : MipsInstAlias<"tge $rs, $rt",
+ (TGE_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def : MipsInstAlias<"tgeu $rs, $rt",
+ (TGEU_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def : MipsInstAlias<"tlt $rs, $rt",
+ (TLT_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def : MipsInstAlias<"tltu $rs, $rt",
+ (TLTU_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def : MipsInstAlias<"tne $rs, $rt",
+ (TNE_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
}
diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td
index dbb5f7b71d82..35352b6115c5 100644
--- a/lib/Target/Mips/Mips.td
+++ b/lib/Target/Mips/Mips.td
@@ -154,9 +154,14 @@ def FeatureMips16 : SubtargetFeature<"mips16", "InMips16Mode", "true",
def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", "Mips DSP ASE">;
def FeatureDSPR2 : SubtargetFeature<"dspr2", "HasDSPR2", "true",
"Mips DSP-R2 ASE", [FeatureDSP]>;
+def FeatureDSPR3
+ : SubtargetFeature<"dspr3", "HasDSPR3", "true", "Mips DSP-R3 ASE",
+ [ FeatureDSP, FeatureDSPR2 ]>;
def FeatureMSA : SubtargetFeature<"msa", "HasMSA", "true", "Mips MSA ASE">;
+def FeatureEVA : SubtargetFeature<"eva", "HasEVA", "true", "Mips EVA ASE">;
+
def FeatureMicroMips : SubtargetFeature<"micromips", "InMicroMipsMode", "true",
"microMips mode">;
@@ -164,10 +169,19 @@ def FeatureCnMips : SubtargetFeature<"cnmips", "HasCnMips",
"true", "Octeon cnMIPS Support",
[FeatureMips64r2]>;
+def FeatureUseTCCInDIV : SubtargetFeature<
+ "use-tcc-in-div",
+ "UseTCCInDIV", "false",
+ "Force the assembler to use trapping">;
+
//===----------------------------------------------------------------------===//
// Mips processors supported.
//===----------------------------------------------------------------------===//
+def ImplP5600 : SubtargetFeature<"p5600", "ProcImpl",
+ "MipsSubtarget::CPU::P5600",
+ "The P5600 Processor", [FeatureMips32r5]>;
+
class Proc<string Name, list<SubtargetFeature> Features>
: Processor<Name, MipsGenericItineraries, Features>;
@@ -187,12 +201,11 @@ def : Proc<"mips64r2", [FeatureMips64r2]>;
def : Proc<"mips64r3", [FeatureMips64r3]>;
def : Proc<"mips64r5", [FeatureMips64r5]>;
def : Proc<"mips64r6", [FeatureMips64r6]>;
-def : Proc<"mips16", [FeatureMips16]>;
def : Proc<"octeon", [FeatureMips64r2, FeatureCnMips]>;
+def : ProcessorModel<"p5600", MipsP5600Model, [ImplP5600]>;
def MipsAsmParser : AsmParser {
let ShouldEmitMatchRegisterName = 0;
- let MnemonicContainsDot = 1;
}
def MipsAsmParserVariant : AsmParserVariant {
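The new FeatureEVA and FeatureDSPR3 records above declare the attribute strings "eva" and "dspr3" that front ends pass down as subtarget features. A small illustrative sketch of how such a feature string is assembled, assuming the SubtargetFeatures helper from llvm/MC/SubtargetFeature.h; the program itself is not part of the patch:

#include "llvm/MC/SubtargetFeature.h"
#include <iostream>
using namespace llvm;

int main() {
  // Build the "+feature" string a frontend would hand to the Mips backend;
  // "eva" and "dspr3" are the SubtargetFeature names declared in Mips.td.
  SubtargetFeatures Features;
  Features.AddFeature("eva");
  Features.AddFeature("dspr3");
  std::cout << Features.getString() << "\n"; // expected: +eva,+dspr3
  return 0;
}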
diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp
index 46cc99c62393..26426c087164 100644
--- a/lib/Target/Mips/Mips16FrameLowering.cpp
+++ b/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -39,7 +39,11 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF,
const Mips16InstrInfo &TII =
*static_cast<const Mips16InstrInfo *>(STI.getInstrInfo());
MachineBasicBlock::iterator MBBI = MBB.begin();
- DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+ // Debug location must be unknown since the first debug location is used
+ // to determine the end of the prologue.
+ DebugLoc dl;
+
uint64_t StackSize = MFI->getStackSize();
// No need to allocate space on the stack.
@@ -107,7 +111,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
const std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const {
MachineFunction *MF = MBB.getParent();
- MachineBasicBlock *EntryBlock = MF->begin();
+ MachineBasicBlock *EntryBlock = &MF->front();
//
// Registers RA, S0,S1 are the callee saved registers and they
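The prologue hunk above (and the matching changes in Mips16ISelDAGToDAG.cpp and Mips16InstrInfo.cpp further down) stops copying the debug location of the first instruction and uses a default-constructed DebugLoc instead, since the first real source location is what marks the end of the prologue for debuggers. A minimal sketch of that idiom, assuming a generic opcode parameter and the usual BuildMI interface; it is not the committed Mips16 code:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;

// Emit a stack adjustment at the top of MBB with an unknown debug location,
// so the new instruction cannot be mistaken for the end of the prologue.
static void emitAdjustmentSketch(MachineBasicBlock &MBB,
                                 const TargetInstrInfo &TII,
                                 unsigned AdjustOpcode, int64_t StackSize) {
  MachineBasicBlock::iterator MBBI = MBB.begin();
  DebugLoc DL; // deliberately unknown; do not inherit MBBI->getDebugLoc()
  BuildMI(MBB, MBBI, DL, TII.get(AdjustOpcode)).addImm(-StackSize);
}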
diff --git a/lib/Target/Mips/Mips16HardFloat.cpp b/lib/Target/Mips/Mips16HardFloat.cpp
index 893fc7cdf473..b2bc7e74c706 100644
--- a/lib/Target/Mips/Mips16HardFloat.cpp
+++ b/lib/Target/Mips/Mips16HardFloat.cpp
@@ -40,26 +40,17 @@ namespace {
const MipsTargetMachine &TM;
};
- class InlineAsmHelper {
- LLVMContext &C;
- BasicBlock *BB;
- public:
- InlineAsmHelper(LLVMContext &C_, BasicBlock *BB_) :
- C(C_), BB(BB_) {
- }
-
- void Out(StringRef AsmString) {
- std::vector<llvm::Type *> AsmArgTypes;
- std::vector<llvm::Value*> AsmArgs;
-
- llvm::FunctionType *AsmFTy = llvm::FunctionType::get(Type::getVoidTy(C),
- AsmArgTypes, false);
- llvm::InlineAsm *IA = llvm::InlineAsm::get(AsmFTy, AsmString, "", true,
- /* IsAlignStack */ false,
- llvm::InlineAsm::AD_ATT);
- CallInst::Create(IA, AsmArgs, "", BB);
- }
- };
+ static void EmitInlineAsm(LLVMContext &C, BasicBlock *BB, StringRef AsmText) {
+ std::vector<llvm::Type *> AsmArgTypes;
+ std::vector<llvm::Value *> AsmArgs;
+
+ llvm::FunctionType *AsmFTy =
+ llvm::FunctionType::get(Type::getVoidTy(C), AsmArgTypes, false);
+ llvm::InlineAsm *IA =
+ llvm::InlineAsm::get(AsmFTy, AsmText, "", true,
+ /* IsAlignStack */ false, llvm::InlineAsm::AD_ATT);
+ CallInst::Create(IA, AsmArgs, "", BB);
+ }
char Mips16HardFloat::ID = 0;
}
@@ -182,7 +173,7 @@ static bool needsFPReturnHelper(Function &F) {
return whichFPReturnVariant(RetType) != NoFPRet;
}
-static bool needsFPReturnHelper(const FunctionType &FT) {
+static bool needsFPReturnHelper(FunctionType &FT) {
Type* RetType = FT.getReturnType();
return whichFPReturnVariant(RetType) != NoFPRet;
}
@@ -195,63 +186,72 @@ static bool needsFPHelperFromSig(Function &F) {
// We swap between FP and Integer registers to allow Mips16 and Mips32 to
// interoperate
//
-static void swapFPIntParams(FPParamVariant PV, Module *M, InlineAsmHelper &IAH,
- bool LE, bool ToFP) {
- //LLVMContext &Context = M->getContext();
- std::string MI = ToFP? "mtc1 ": "mfc1 ";
+static std::string swapFPIntParams(FPParamVariant PV, Module *M, bool LE,
+ bool ToFP) {
+ std::string MI = ToFP ? "mtc1 ": "mfc1 ";
+ std::string AsmText;
+
switch (PV) {
case FSig:
- IAH.Out(MI + "$$4,$$f12");
+ AsmText += MI + "$$4, $$f12\n";
break;
+
case FFSig:
- IAH.Out(MI +"$$4,$$f12");
- IAH.Out(MI + "$$5,$$f14");
+ AsmText += MI + "$$4, $$f12\n";
+ AsmText += MI + "$$5, $$f14\n";
break;
+
case FDSig:
- IAH.Out(MI + "$$4,$$f12");
+ AsmText += MI + "$$4, $$f12\n";
if (LE) {
- IAH.Out(MI + "$$6,$$f14");
- IAH.Out(MI + "$$7,$$f15");
+ AsmText += MI + "$$6, $$f14\n";
+ AsmText += MI + "$$7, $$f15\n";
} else {
- IAH.Out(MI + "$$7,$$f14");
- IAH.Out(MI + "$$6,$$f15");
+ AsmText += MI + "$$7, $$f14\n";
+ AsmText += MI + "$$6, $$f15\n";
}
break;
+
case DSig:
if (LE) {
- IAH.Out(MI + "$$4,$$f12");
- IAH.Out(MI + "$$5,$$f13");
+ AsmText += MI + "$$4, $$f12\n";
+ AsmText += MI + "$$5, $$f13\n";
} else {
- IAH.Out(MI + "$$5,$$f12");
- IAH.Out(MI + "$$4,$$f13");
+ AsmText += MI + "$$5, $$f12\n";
+ AsmText += MI + "$$4, $$f13\n";
}
break;
+
case DDSig:
if (LE) {
- IAH.Out(MI + "$$4,$$f12");
- IAH.Out(MI + "$$5,$$f13");
- IAH.Out(MI + "$$6,$$f14");
- IAH.Out(MI + "$$7,$$f15");
+ AsmText += MI + "$$4, $$f12\n";
+ AsmText += MI + "$$5, $$f13\n";
+ AsmText += MI + "$$6, $$f14\n";
+ AsmText += MI + "$$7, $$f15\n";
} else {
- IAH.Out(MI + "$$5,$$f12");
- IAH.Out(MI + "$$4,$$f13");
- IAH.Out(MI + "$$7,$$f14");
- IAH.Out(MI + "$$6,$$f15");
+ AsmText += MI + "$$5, $$f12\n";
+ AsmText += MI + "$$4, $$f13\n";
+ AsmText += MI + "$$7, $$f14\n";
+ AsmText += MI + "$$6, $$f15\n";
}
break;
+
case DFSig:
if (LE) {
- IAH.Out(MI + "$$4,$$f12");
- IAH.Out(MI + "$$5,$$f13");
+ AsmText += MI + "$$4, $$f12\n";
+ AsmText += MI + "$$5, $$f13\n";
} else {
- IAH.Out(MI + "$$5,$$f12");
- IAH.Out(MI + "$$4,$$f13");
+ AsmText += MI + "$$5, $$f12\n";
+ AsmText += MI + "$$4, $$f13\n";
}
- IAH.Out(MI + "$$6,$$f14");
+ AsmText += MI + "$$6, $$f14\n";
break;
+
case NoSig:
- return;
+ break;
}
+
+ return AsmText;
}
//
@@ -282,68 +282,77 @@ static void assureFPCallStub(Function &F, Module *M,
FStub->addFnAttr("nomips16");
FStub->setSection(SectionName);
BasicBlock *BB = BasicBlock::Create(Context, "entry", FStub);
- InlineAsmHelper IAH(Context, BB);
- IAH.Out(".set reorder");
FPReturnVariant RV = whichFPReturnVariant(FStub->getReturnType());
FPParamVariant PV = whichFPParamVariantNeeded(F);
- swapFPIntParams(PV, M, IAH, LE, true);
+
+ std::string AsmText;
+ AsmText += ".set reorder\n";
+ AsmText += swapFPIntParams(PV, M, LE, true);
if (RV != NoFPRet) {
- IAH.Out("move $$18, $$31");
- IAH.Out("jal " + Name);
+ AsmText += "move $$18, $$31\n";
+ AsmText += "jal " + Name + "\n";
} else {
- IAH.Out("lui $$25,%hi(" + Name + ")");
- IAH.Out("addiu $$25,$$25,%lo(" + Name + ")" );
+ AsmText += "lui $$25, %hi(" + Name + ")\n";
+ AsmText += "addiu $$25, $$25, %lo(" + Name + ")\n";
}
+
switch (RV) {
case FRet:
- IAH.Out("mfc1 $$2,$$f0");
+ AsmText += "mfc1 $$2, $$f0\n";
break;
+
case DRet:
if (LE) {
- IAH.Out("mfc1 $$2,$$f0");
- IAH.Out("mfc1 $$3,$$f1");
+ AsmText += "mfc1 $$2, $$f0\n";
+ AsmText += "mfc1 $$3, $$f1\n";
} else {
- IAH.Out("mfc1 $$3,$$f0");
- IAH.Out("mfc1 $$2,$$f1");
+ AsmText += "mfc1 $$3, $$f0\n";
+ AsmText += "mfc1 $$2, $$f1\n";
}
break;
+
case CFRet:
if (LE) {
- IAH.Out("mfc1 $$2,$$f0");
- IAH.Out("mfc1 $$3,$$f2");
+ AsmText += "mfc1 $$2, $$f0\n";
+ AsmText += "mfc1 $$3, $$f2\n";
} else {
- IAH.Out("mfc1 $$3,$$f0");
- IAH.Out("mfc1 $$3,$$f2");
+ AsmText += "mfc1 $$3, $$f0\n";
+ AsmText += "mfc1 $$3, $$f2\n";
}
break;
+
case CDRet:
if (LE) {
- IAH.Out("mfc1 $$4,$$f2");
- IAH.Out("mfc1 $$5,$$f3");
- IAH.Out("mfc1 $$2,$$f0");
- IAH.Out("mfc1 $$3,$$f1");
+ AsmText += "mfc1 $$4, $$f2\n";
+ AsmText += "mfc1 $$5, $$f3\n";
+ AsmText += "mfc1 $$2, $$f0\n";
+ AsmText += "mfc1 $$3, $$f1\n";
} else {
- IAH.Out("mfc1 $$5,$$f2");
- IAH.Out("mfc1 $$4,$$f3");
- IAH.Out("mfc1 $$3,$$f0");
- IAH.Out("mfc1 $$2,$$f1");
+ AsmText += "mfc1 $$5, $$f2\n";
+ AsmText += "mfc1 $$4, $$f3\n";
+ AsmText += "mfc1 $$3, $$f0\n";
+ AsmText += "mfc1 $$2, $$f1\n";
}
break;
+
case NoFPRet:
break;
}
+
if (RV != NoFPRet)
- IAH.Out("jr $$18");
+ AsmText += "jr $$18\n";
else
- IAH.Out("jr $$25");
+ AsmText += "jr $$25\n";
+ EmitInlineAsm(Context, BB, AsmText);
+
new UnreachableInst(Context, BB);
}
//
// Functions that are llvm intrinsics and don't need helpers.
//
-static const char *IntrinsicInline[] = {
+static const char *const IntrinsicInline[] = {
"fabs", "fabsf",
"llvm.ceil.f32", "llvm.ceil.f64",
"llvm.copysign.f32", "llvm.copysign.f64",
@@ -395,7 +404,7 @@ static bool fixupFPReturnAndCall(Function &F, Module *M,
Type *T = RVal->getType();
FPReturnVariant RV = whichFPReturnVariant(T);
if (RV == NoFPRet) continue;
- static const char* Helper[NoFPRet] = {
+ static const char *const Helper[NoFPRet] = {
"__mips16_ret_sf", "__mips16_ret_df", "__mips16_ret_sc",
"__mips16_ret_dc"
};
@@ -419,11 +428,11 @@ static bool fixupFPReturnAndCall(Function &F, Module *M,
CallInst::Create(F, Params, "", &Inst );
} else if (const CallInst *CI = dyn_cast<CallInst>(I)) {
const Value* V = CI->getCalledValue();
- const Type* T = nullptr;
+ Type* T = nullptr;
if (V) T = V->getType();
- const PointerType *PFT=nullptr;
+ PointerType *PFT = nullptr;
if (T) PFT = dyn_cast<PointerType>(T);
- const FunctionType *FT=nullptr;
+ FunctionType *FT = nullptr;
if (PFT) FT = dyn_cast<FunctionType>(PFT->getElementType());
Function *F_ = CI->getCalledFunction();
if (FT && needsFPReturnHelper(*FT) &&
@@ -469,20 +478,21 @@ static void createFPFnStub(Function *F, Module *M, FPParamVariant PV,
FStub->addFnAttr("nomips16");
FStub->setSection(SectionName);
BasicBlock *BB = BasicBlock::Create(Context, "entry", FStub);
- InlineAsmHelper IAH(Context, BB);
+
+ std::string AsmText;
if (PicMode) {
- IAH.Out(".set noreorder");
- IAH.Out(".cpload $$25");
- IAH.Out(".set reorder");
- IAH.Out(".reloc 0,R_MIPS_NONE," + Name);
- IAH.Out("la $$25," + LocalName);
- }
- else {
- IAH.Out("la $$25," + Name);
- }
- swapFPIntParams(PV, M, IAH, LE, false);
- IAH.Out("jr $$25");
- IAH.Out(LocalName + " = " + Name);
+ AsmText += ".set noreorder\n";
+ AsmText += ".cpload $$25\n";
+ AsmText += ".set reorder\n";
+ AsmText += ".reloc 0, R_MIPS_NONE, " + Name + "\n";
+ AsmText += "la $$25, " + LocalName + "\n";
+ } else
+ AsmText += "la $$25, " + Name + "\n";
+ AsmText += swapFPIntParams(PV, M, LE, false);
+ AsmText += "jr $$25\n";
+ AsmText += LocalName + " = " + Name + "\n";
+ EmitInlineAsm(Context, BB, AsmText);
+
new UnreachableInst(FStub->getContext(), BB);
}
@@ -535,7 +545,7 @@ bool Mips16HardFloat::runOnModule(Module &M) {
FPParamVariant V = whichFPParamVariantNeeded(*F);
if (V != NoSig) {
Modified = true;
- createFPFnStub(F, &M, V, TM);
+ createFPFnStub(&*F, &M, V, TM);
}
}
return Modified;
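The rewrite above folds the old per-line InlineAsmHelper into a single std::string that is handed to EmitInlineAsm once. A self-contained sketch of how the text for one stub comes together, using the little-endian FSig case with no FP return value; the function name and the "foo" symbol are placeholders, and this is plain C++ for illustration rather than the pass itself:

#include <iostream>
#include <string>

// Assemble the asm text for a little-endian "FSig" call stub the way the
// refactored Mips16HardFloat code does: string concatenation instead of
// per-line InlineAsmHelper::Out calls.
static std::string buildFSigCallStub(const std::string &Name) {
  std::string AsmText;
  AsmText += ".set reorder\n";
  AsmText += "mtc1 $$4, $$f12\n"; // move the f32 argument into $f12
  AsmText += "lui $$25, %hi(" + Name + ")\n";
  AsmText += "addiu $$25, $$25, %lo(" + Name + ")\n";
  AsmText += "jr $$25\n";
  return AsmText;
}

int main() {
  std::cout << buildFSigCallStub("foo");
  return 0;
}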
diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
index bce2c1eb4485..5a1c2c67cc70 100644
--- a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
+++ b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
@@ -73,7 +73,7 @@ void Mips16DAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
MachineBasicBlock::iterator I = MBB.begin();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
- DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+ DebugLoc DL;
unsigned V0, V1, V2, GlobalBaseReg = MipsFI->getGlobalBaseReg();
const TargetRegisterClass *RC = &Mips::CPU16RegsRegClass;
diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp
index 3522cbb1f36a..e7483253e61d 100644
--- a/lib/Target/Mips/Mips16ISelLowering.cpp
+++ b/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -530,8 +530,7 @@ emitSel16(unsigned Opc, MachineInstr *MI, MachineBasicBlock *BB) const {
// destination vreg to set, the condition code register to branch on, the
// true/false values to select between, and a branch opcode to use.
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator It = BB;
- ++It;
+ MachineFunction::iterator It = ++BB->getIterator();
// thisMBB:
// ...
@@ -592,8 +591,7 @@ Mips16TargetLowering::emitSelT16(unsigned Opc1, unsigned Opc2, MachineInstr *MI,
// destination vreg to set, the condition code register to branch on, the
// true/false values to select between, and a branch opcode to use.
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator It = BB;
- ++It;
+ MachineFunction::iterator It = ++BB->getIterator();
// thisMBB:
// ...
@@ -657,8 +655,7 @@ Mips16TargetLowering::emitSeliT16(unsigned Opc1, unsigned Opc2,
// destination vreg to set, the condition code register to branch on, the
// true/false values to select between, and a branch opcode to use.
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator It = BB;
- ++It;
+ MachineFunction::iterator It = ++BB->getIterator();
// thisMBB:
// ...
diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp
index a49572efdbf9..da8ada4e5391 100644
--- a/lib/Target/Mips/Mips16InstrInfo.cpp
+++ b/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -196,7 +196,7 @@ static void addSaveRestoreRegs(MachineInstrBuilder &MIB,
void Mips16InstrInfo::makeFrame(unsigned SP, int64_t FrameSize,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
- DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+ DebugLoc DL;
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo *MFI = MF.getFrameInfo();
const BitVector Reserved = RI.getReservedRegs(MF);
@@ -263,7 +263,7 @@ void Mips16InstrInfo::adjustStackPtrBig(unsigned SP, int64_t Amount,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
unsigned Reg1, unsigned Reg2) const {
- DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+ DebugLoc DL;
//
// li reg1, constant
// move reg2, sp
@@ -446,7 +446,7 @@ const MCInstrDesc &Mips16InstrInfo::AddiuSpImm(int64_t Imm) const {
void Mips16InstrInfo::BuildAddiuSpImm
(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, int64_t Imm) const {
- DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+ DebugLoc DL;
BuildMI(MBB, I, DL, AddiuSpImm(Imm)).addImm(Imm);
}
diff --git a/lib/Target/Mips/Mips16InstrInfo.td b/lib/Target/Mips/Mips16InstrInfo.td
index 10fff03b7240..dad6ea4c9e98 100644
--- a/lib/Target/Mips/Mips16InstrInfo.td
+++ b/lib/Target/Mips/Mips16InstrInfo.td
@@ -530,19 +530,19 @@ class MayStore {
// Purpose: Add Immediate Unsigned Word (2-Operand, Extended)
// To add a constant to a 32-bit integer.
//
-def AddiuRxImmX16: FEXT_RI16_ins<0b01001, "addiu", IIAlu>;
+def AddiuRxImmX16: FEXT_RI16_ins<0b01001, "addiu", IIM16Alu>;
-def AddiuRxRxImm16: F2RI16_ins<0b01001, "addiu", IIAlu>,
+def AddiuRxRxImm16: F2RI16_ins<0b01001, "addiu", IIM16Alu>,
ArithLogic16Defs<0> {
let AddedComplexity = 5;
}
-def AddiuRxRxImmX16: FEXT_2RI16_ins<0b01001, "addiu", IIAlu>,
+def AddiuRxRxImmX16: FEXT_2RI16_ins<0b01001, "addiu", IIM16Alu>,
ArithLogic16Defs<0> {
let isCodeGenOnly = 1;
}
def AddiuRxRyOffMemX16:
- FEXT_RRI_A16_mem_ins<0, "addiu", mem16_ea, IIAlu>;
+ FEXT_RRI_A16_mem_ins<0, "addiu", mem16_ea, IIM16Alu>;
//
@@ -550,7 +550,7 @@ def AddiuRxRyOffMemX16:
// Purpose: Add Immediate Unsigned Word (3-Operand, PC-Relative, Extended)
// To add a constant to the program counter.
//
-def AddiuRxPcImmX16: FEXT_RI16_PC_ins<0b00001, "addiu", IIAlu>;
+def AddiuRxPcImmX16: FEXT_RI16_PC_ins<0b00001, "addiu", IIM16Alu>;
//
// Format: ADDIU sp, immediate MIPS16e
@@ -558,14 +558,14 @@ def AddiuRxPcImmX16: FEXT_RI16_PC_ins<0b00001, "addiu", IIAlu>;
// To add a constant to the stack pointer.
//
def AddiuSpImm16
- : FI816_SP_ins<0b011, "addiu", IIAlu> {
+ : FI816_SP_ins<0b011, "addiu", IIM16Alu> {
let Defs = [SP];
let Uses = [SP];
let AddedComplexity = 5;
}
def AddiuSpImmX16
- : FEXT_I816_SP_ins<0b011, "addiu", IIAlu> {
+ : FEXT_I816_SP_ins<0b011, "addiu", IIM16Alu> {
let Defs = [SP];
let Uses = [SP];
}
@@ -576,14 +576,14 @@ def AddiuSpImmX16
// To add 32-bit integers.
//
-def AdduRxRyRz16: FRRR16_ins<01, "addu", IIAlu>, ArithLogic16Defs<1>;
+def AdduRxRyRz16: FRRR16_ins<01, "addu", IIM16Alu>, ArithLogic16Defs<1>;
//
// Format: AND rx, ry MIPS16e
// Purpose: AND
// To do a bitwise logical AND.
-def AndRxRxRy16: FRxRxRy16_ins<0b01100, "and", IIAlu>, ArithLogic16Defs<1>;
+def AndRxRxRy16: FRxRxRy16_ins<0b01100, "and", IIM16Alu>, ArithLogic16Defs<1>;
//
@@ -591,7 +591,7 @@ def AndRxRxRy16: FRxRxRy16_ins<0b01100, "and", IIAlu>, ArithLogic16Defs<1>;
// Purpose: Branch on Equal to Zero
// To test a GPR then do a PC-relative conditional branch.
//
-def BeqzRxImm16: FRI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16;
+def BeqzRxImm16: FRI16_B_ins<0b00100, "beqz", IIM16Alu>, cbranch16;
//
@@ -599,7 +599,7 @@ def BeqzRxImm16: FRI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16;
// Purpose: Branch on Equal to Zero (Extended)
// To test a GPR then do a PC-relative conditional branch.
//
-def BeqzRxImmX16: FEXT_RI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16;
+def BeqzRxImmX16: FEXT_RI16_B_ins<0b00100, "beqz", IIM16Alu>, cbranch16;
//
// Format: B offset MIPS16e
@@ -607,27 +607,27 @@ def BeqzRxImmX16: FEXT_RI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16;
// To do an unconditional PC-relative branch.
//
-def Bimm16: FI16_ins<0b00010, "b", IIAlu>, branch16;
+def Bimm16: FI16_ins<0b00010, "b", IIM16Alu>, branch16;
// Format: B offset MIPS16e
// Purpose: Unconditional Branch
// To do an unconditional PC-relative branch.
//
-def BimmX16: FEXT_I16_ins<0b00010, "b", IIAlu>, branch16;
+def BimmX16: FEXT_I16_ins<0b00010, "b", IIM16Alu>, branch16;
//
// Format: BNEZ rx, offset MIPS16e
// Purpose: Branch on Not Equal to Zero
// To test a GPR then do a PC-relative conditional branch.
//
-def BnezRxImm16: FRI16_B_ins<0b00101, "bnez", IIAlu>, cbranch16;
+def BnezRxImm16: FRI16_B_ins<0b00101, "bnez", IIM16Alu>, cbranch16;
//
// Format: BNEZ rx, offset MIPS16e
// Purpose: Branch on Not Equal to Zero (Extended)
// To test a GPR then do a PC-relative conditional branch.
//
-def BnezRxImmX16: FEXT_RI16_B_ins<0b00101, "bnez", IIAlu>, cbranch16;
+def BnezRxImmX16: FEXT_RI16_B_ins<0b00101, "bnez", IIM16Alu>, cbranch16;
//
@@ -641,11 +641,11 @@ def Break16: FRRBreakNull16_ins<"break 0", NoItinerary>;
// Purpose: Branch on T Equal to Zero (Extended)
// To test special register T then do a PC-relative conditional branch.
//
-def Bteqz16: FI816_ins<0b000, "bteqz", IIAlu>, cbranch16 {
+def Bteqz16: FI816_ins<0b000, "bteqz", IIM16Alu>, cbranch16 {
let Uses = [T8];
}
-def BteqzX16: FEXT_I816_ins<0b000, "bteqz", IIAlu>, cbranch16 {
+def BteqzX16: FEXT_I816_ins<0b000, "bteqz", IIM16Alu>, cbranch16 {
let Uses = [T8];
}
@@ -669,11 +669,11 @@ def BteqzT8SltiuX16: FEXT_T8I8I16_ins<"bteqz", "sltiu">,
// To test special register T then do a PC-relative conditional branch.
//
-def Btnez16: FI816_ins<0b001, "btnez", IIAlu>, cbranch16 {
+def Btnez16: FI816_ins<0b001, "btnez", IIM16Alu>, cbranch16 {
let Uses = [T8];
}
-def BtnezX16: FEXT_I816_ins<0b001, "btnez", IIAlu> ,cbranch16 {
+def BtnezX16: FEXT_I816_ins<0b001, "btnez", IIM16Alu> ,cbranch16 {
let Uses = [T8];
}
@@ -695,7 +695,7 @@ def BtnezT8SltiuX16: FEXT_T8I8I16_ins<"btnez", "sltiu">,
// Purpose: Compare
// To compare the contents of two GPRs.
//
-def CmpRxRy16: FRR16R_ins<0b01010, "cmp", IIAlu> {
+def CmpRxRy16: FRR16R_ins<0b01010, "cmp", IIM16Alu> {
let Defs = [T8];
}
@@ -704,7 +704,7 @@ def CmpRxRy16: FRR16R_ins<0b01010, "cmp", IIAlu> {
// Purpose: Compare Immediate
// To compare a constant with the contents of a GPR.
//
-def CmpiRxImm16: FRI16R_ins<0b01110, "cmpi", IIAlu> {
+def CmpiRxImm16: FRI16R_ins<0b01110, "cmpi", IIM16Alu> {
let Defs = [T8];
}
@@ -713,7 +713,7 @@ def CmpiRxImm16: FRI16R_ins<0b01110, "cmpi", IIAlu> {
// Purpose: Compare Immediate (Extended)
// To compare a constant with the contents of a GPR.
//
-def CmpiRxImmX16: FEXT_RI16R_ins<0b01110, "cmpi", IIAlu> {
+def CmpiRxImmX16: FEXT_RI16R_ins<0b01110, "cmpi", IIM16Alu> {
let Defs = [T8];
}
@@ -723,7 +723,7 @@ def CmpiRxImmX16: FEXT_RI16R_ins<0b01110, "cmpi", IIAlu> {
// Purpose: Divide Word
// To divide 32-bit signed integers.
//
-def DivRxRy16: FRR16_div_ins<0b11010, "div", IIAlu> {
+def DivRxRy16: FRR16_div_ins<0b11010, "div", IIM16Alu> {
let Defs = [HI0, LO0];
}
@@ -732,7 +732,7 @@ def DivRxRy16: FRR16_div_ins<0b11010, "div", IIAlu> {
// Purpose: Divide Unsigned Word
// To divide 32-bit unsigned integers.
//
-def DivuRxRy16: FRR16_div_ins<0b11011, "divu", IIAlu> {
+def DivuRxRy16: FRR16_div_ins<0b11011, "divu", IIM16Alu> {
let Defs = [HI0, LO0];
}
//
@@ -742,13 +742,13 @@ def DivuRxRy16: FRR16_div_ins<0b11011, "divu", IIAlu> {
// region and preserve the current ISA.
//
-def Jal16 : FJAL16_ins<0b0, "jal", IIAlu> {
+def Jal16 : FJAL16_ins<0b0, "jal", IIM16Alu> {
let hasDelaySlot = 0; // not true, but we add the nop for now
let isCall=1;
let Defs = [RA];
}
-def JalB16 : FJALB16_ins<0b0, "jal", IIAlu>, branch16 {
+def JalB16 : FJALB16_ins<0b0, "jal", IIM16Alu>, branch16 {
let hasDelaySlot = 0; // not true, but we add the nop for now
let isBranch=1;
let Defs = [RA];
@@ -761,7 +761,7 @@ def JalB16 : FJALB16_ins<0b0, "jal", IIAlu>, branch16 {
// address register.
//
-def JrRa16: FRR16_JALRC_RA_only_ins<0, 0, "jr", IIAlu> {
+def JrRa16: FRR16_JALRC_RA_only_ins<0, 0, "jr", IIM16Alu> {
let isBranch = 1;
let isIndirectBranch = 1;
let hasDelaySlot = 1;
@@ -769,14 +769,14 @@ def JrRa16: FRR16_JALRC_RA_only_ins<0, 0, "jr", IIAlu> {
let isBarrier=1;
}
-def JrcRa16: FRR16_JALRC_RA_only_ins<1, 1, "jrc", IIAlu> {
+def JrcRa16: FRR16_JALRC_RA_only_ins<1, 1, "jrc", IIM16Alu> {
let isBranch = 1;
let isIndirectBranch = 1;
let isTerminator=1;
let isBarrier=1;
}
-def JrcRx16: FRR16_JALRC_ins<1, 1, 0, "jrc", IIAlu> {
+def JrcRx16: FRR16_JALRC_ins<1, 1, 0, "jrc", IIM16Alu> {
let isBranch = 1;
let isIndirectBranch = 1;
let isTerminator=1;
@@ -825,16 +825,16 @@ def LhuRxRyOffMemX16:
// Purpose: Load Immediate
// To load a constant into a GPR.
//
-def LiRxImm16: FRI16_ins<0b01101, "li", IIAlu>;
+def LiRxImm16: FRI16_ins<0b01101, "li", IIM16Alu>;
//
// Format: LI rx, immediate MIPS16e
// Purpose: Load Immediate (Extended)
// To load a constant into a GPR.
//
-def LiRxImmX16: FEXT_RI16_ins<0b01101, "li", IIAlu>;
+def LiRxImmX16: FEXT_RI16_ins<0b01101, "li", IIM16Alu>;
-def LiRxImmAlignX16: FEXT_RI16_ins<0b01101, ".align 2\n\tli", IIAlu> {
+def LiRxImmAlignX16: FEXT_RI16_ins<0b01101, ".align 2\n\tli", IIM16Alu> {
let isCodeGenOnly = 1;
}
@@ -863,21 +863,21 @@ def LwRxPcTcpX16: FEXT_RI16_TCP_ins<0b10110, "lw", II_LW>, MayLoad;
// Purpose: Move
// To move the contents of a GPR to a GPR.
//
-def Move32R16: FI8_MOV32R16_ins<"move", IIAlu>;
+def Move32R16: FI8_MOV32R16_ins<"move", IIM16Alu>;
//
// Format: MOVE ry, r32 MIPS16e
//Purpose: Move
// To move the contents of a GPR to a GPR.
//
-def MoveR3216: FI8_MOVR3216_ins<"move", IIAlu>;
+def MoveR3216: FI8_MOVR3216_ins<"move", IIM16Alu>;
//
// Format: MFHI rx MIPS16e
// Purpose: Move From HI Register
// To copy the special purpose HI register to a GPR.
//
-def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIAlu> {
+def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIM16Alu> {
let Uses = [HI0];
let hasSideEffects = 0;
}
@@ -887,7 +887,7 @@ def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIAlu> {
// Purpose: Move From LO Register
// To copy the special purpose LO register to a GPR.
//
-def Mflo16: FRR16_M_ins<0b10010, "mflo", IIAlu> {
+def Mflo16: FRR16_M_ins<0b10010, "mflo", IIM16Alu> {
let Uses = [LO0];
let hasSideEffects = 0;
}
@@ -895,13 +895,13 @@ def Mflo16: FRR16_M_ins<0b10010, "mflo", IIAlu> {
//
// Pseudo Instruction for mult
//
-def MultRxRy16: FMULT16_ins<"mult", IIAlu> {
+def MultRxRy16: FMULT16_ins<"mult", IIM16Alu> {
let isCommutable = 1;
let hasSideEffects = 0;
let Defs = [HI0, LO0];
}
-def MultuRxRy16: FMULT16_ins<"multu", IIAlu> {
+def MultuRxRy16: FMULT16_ins<"multu", IIM16Alu> {
let isCommutable = 1;
let hasSideEffects = 0;
let Defs = [HI0, LO0];
@@ -912,7 +912,7 @@ def MultuRxRy16: FMULT16_ins<"multu", IIAlu> {
// Purpose: Multiply Word
// To multiply 32-bit signed integers.
//
-def MultRxRyRz16: FMULT16_LO_ins<"mult", IIAlu> {
+def MultRxRyRz16: FMULT16_LO_ins<"mult", IIM16Alu> {
let isCommutable = 1;
let hasSideEffects = 0;
let Defs = [HI0, LO0];
@@ -923,7 +923,7 @@ def MultRxRyRz16: FMULT16_LO_ins<"mult", IIAlu> {
// Purpose: Multiply Unsigned Word
// To multiply 32-bit unsigned integers.
//
-def MultuRxRyRz16: FMULT16_LO_ins<"multu", IIAlu> {
+def MultuRxRyRz16: FMULT16_LO_ins<"multu", IIM16Alu> {
let isCommutable = 1;
let hasSideEffects = 0;
let Defs = [HI0, LO0];
@@ -934,21 +934,21 @@ def MultuRxRyRz16: FMULT16_LO_ins<"multu", IIAlu> {
// Purpose: Negate
// To negate an integer value.
//
-def NegRxRy16: FUnaryRR16_ins<0b11101, "neg", IIAlu>;
+def NegRxRy16: FUnaryRR16_ins<0b11101, "neg", IIM16Alu>;
//
// Format: NOT rx, ry MIPS16e
// Purpose: Not
// To complement an integer value
//
-def NotRxRy16: FUnaryRR16_ins<0b01111, "not", IIAlu>;
+def NotRxRy16: FUnaryRR16_ins<0b01111, "not", IIM16Alu>;
//
// Format: OR rx, ry MIPS16e
// Purpose: Or
// To do a bitwise logical OR.
//
-def OrRxRxRy16: FRxRxRy16_ins<0b01101, "or", IIAlu>, ArithLogic16Defs<1>;
+def OrRxRxRy16: FRxRxRy16_ins<0b01101, "or", IIM16Alu>, ArithLogic16Defs<1>;
//
// Format: RESTORE {ra,}{s0/s1/s0-1,}{framesize}
@@ -1012,7 +1012,7 @@ def SbRxRyOffMemX16:
// Sign-extend least significant byte in register rx.
//
def SebRx16
- : FRR_SF16_ins<0b10001, 0b100, "seb", IIAlu>;
+ : FRR_SF16_ins<0b10001, 0b100, "seb", IIM16Alu>;
//
// Format: SEH rx MIPS16e
@@ -1020,7 +1020,7 @@ def SebRx16
// Sign-extend least significant word in register rx.
//
def SehRx16
- : FRR_SF16_ins<0b10001, 0b101, "seh", IIAlu>;
+ : FRR_SF16_ins<0b10001, 0b101, "seh", IIM16Alu>;
//
// The Sel(T) instructions are pseudos
@@ -1149,21 +1149,21 @@ def ShRxRyOffMemX16:
// Purpose: Shift Word Left Logical (Extended)
// To execute a left-shift of a word by a fixed number of bits-0 to 31 bits.
//
-def SllX16: FEXT_SHIFT16_ins<0b00, "sll", IIAlu>;
+def SllX16: FEXT_SHIFT16_ins<0b00, "sll", IIM16Alu>;
//
// Format: SLLV ry, rx MIPS16e
// Purpose: Shift Word Left Logical Variable
// To execute a left-shift of a word by a variable number of bits.
//
-def SllvRxRy16 : FRxRxRy16_ins<0b00100, "sllv", IIAlu>;
+def SllvRxRy16 : FRxRxRy16_ins<0b00100, "sllv", IIM16Alu>;
// Format: SLTI rx, immediate MIPS16e
// Purpose: Set on Less Than Immediate
// To record the result of a less-than comparison with a constant.
//
//
-def SltiRxImm16: FRI16R_ins<0b01010, "slti", IIAlu> {
+def SltiRxImm16: FRI16R_ins<0b01010, "slti", IIM16Alu> {
let Defs = [T8];
}
@@ -1173,7 +1173,7 @@ def SltiRxImm16: FRI16R_ins<0b01010, "slti", IIAlu> {
// To record the result of a less-than comparison with a constant.
//
//
-def SltiRxImmX16: FEXT_RI16R_ins<0b01010, "slti", IIAlu> {
+def SltiRxImmX16: FEXT_RI16R_ins<0b01010, "slti", IIM16Alu> {
let Defs = [T8];
}
@@ -1184,7 +1184,7 @@ def SltiCCRxImmX16: FEXT_CCRXI16_ins<"slti">;
// To record the result of a less-than comparison with a constant.
//
//
-def SltiuRxImm16: FRI16R_ins<0b01011, "sltiu", IIAlu> {
+def SltiuRxImm16: FRI16R_ins<0b01011, "sltiu", IIM16Alu> {
let Defs = [T8];
}
@@ -1194,7 +1194,7 @@ def SltiuRxImm16: FRI16R_ins<0b01011, "sltiu", IIAlu> {
// To record the result of a less-than comparison with a constant.
//
//
-def SltiuRxImmX16: FEXT_RI16R_ins<0b01011, "sltiu", IIAlu> {
+def SltiuRxImmX16: FEXT_RI16R_ins<0b01011, "sltiu", IIM16Alu> {
let Defs = [T8];
}
//
@@ -1209,7 +1209,7 @@ def SltiuCCRxImmX16: FEXT_CCRXI16_ins<"sltiu">;
// Purpose: Set on Less Than
// To record the result of a less-than comparison.
//
-def SltRxRy16: FRR16R_ins<0b00010, "slt", IIAlu>{
+def SltRxRy16: FRR16R_ins<0b00010, "slt", IIM16Alu>{
let Defs = [T8];
}
@@ -1219,7 +1219,7 @@ def SltCCRxRy16: FCCRR16_ins<"slt">;
// Purpose: Set on Less Than Unsigned
// To record the result of an unsigned less-than comparison.
//
-def SltuRxRy16: FRR16R_ins<0b00011, "sltu", IIAlu>{
+def SltuRxRy16: FRR16R_ins<0b00011, "sltu", IIM16Alu>{
let Defs = [T8];
}
@@ -1236,7 +1236,7 @@ def SltuCCRxRy16: FCCRR16_ins<"sltu">;
// To execute an arithmetic right-shift of a word by a variable
// number of bits.
//
-def SravRxRy16: FRxRxRy16_ins<0b00111, "srav", IIAlu>;
+def SravRxRy16: FRxRxRy16_ins<0b00111, "srav", IIM16Alu>;
//
@@ -1245,7 +1245,7 @@ def SravRxRy16: FRxRxRy16_ins<0b00111, "srav", IIAlu>;
// To execute an arithmetic right-shift of a word by a fixed
// number of bits-1 to 8 bits.
//
-def SraX16: FEXT_SHIFT16_ins<0b11, "sra", IIAlu>;
+def SraX16: FEXT_SHIFT16_ins<0b11, "sra", IIM16Alu>;
//
@@ -1254,7 +1254,7 @@ def SraX16: FEXT_SHIFT16_ins<0b11, "sra", IIAlu>;
// To execute a logical right-shift of a word by a variable
// number of bits.
//
-def SrlvRxRy16: FRxRxRy16_ins<0b00110, "srlv", IIAlu>;
+def SrlvRxRy16: FRxRxRy16_ins<0b00110, "srlv", IIM16Alu>;
//
@@ -1263,14 +1263,14 @@ def SrlvRxRy16: FRxRxRy16_ins<0b00110, "srlv", IIAlu>;
// To execute a logical right-shift of a word by a fixed
// number of bits-1 to 31 bits.
//
-def SrlX16: FEXT_SHIFT16_ins<0b10, "srl", IIAlu>;
+def SrlX16: FEXT_SHIFT16_ins<0b10, "srl", IIM16Alu>;
//
// Format: SUBU rz, rx, ry MIPS16e
// Purpose: Subtract Unsigned Word
// To subtract 32-bit integers
//
-def SubuRxRyRz16: FRRR16_ins<0b11, "subu", IIAlu>, ArithLogic16Defs<0>;
+def SubuRxRyRz16: FRRR16_ins<0b11, "subu", IIM16Alu>, ArithLogic16Defs<0>;
//
// Format: SW ry, offset(rx) MIPS16e
@@ -1294,7 +1294,7 @@ def SwRxSpImmX16: FEXT_RI16_SP_Store_explicit_ins
// Purpose: Xor
// To do a bitwise logical XOR.
//
-def XorRxRxRy16: FRxRxRy16_ins<0b01110, "xor", IIAlu>, ArithLogic16Defs<1>;
+def XorRxRxRy16: FRxRxRy16_ins<0b01110, "xor", IIM16Alu>, ArithLogic16Defs<1>;
class Mips16Pat<dag pattern, dag result> : Pat<pattern, result> {
let Predicates = [InMips16Mode];
@@ -1380,7 +1380,7 @@ def: Mips16Pat<(brind CPU16Regs:$rs), (JrcRx16 CPU16Regs:$rs)> {
let isCall=1, hasDelaySlot=0 in
def JumpLinkReg16:
FRR16_JALRC<0, 0, 0, (outs), (ins CPU16Regs:$rs),
- "jalrc \t$rs", [(MipsJmpLink CPU16Regs:$rs)], IIBranch> {
+ "jalrc \t$rs", [(MipsJmpLink CPU16Regs:$rs)], II_JALRC> {
let Defs = [RA];
}
diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td
index d6ab8a6e5411..82d2c8ee9905 100644
--- a/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -186,54 +186,56 @@ class CMP_CONDN_DESC_BASE<string CondStr, string Typestr,
multiclass CMP_CC_M <FIELD_CMP_FORMAT Format, string Typestr,
RegisterOperand FGROpnd>{
- def CMP_F_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_AF>,
- CMP_CONDN_DESC_BASE<"af", Typestr, FGROpnd>,
- ISA_MIPS32R6, HARDFLOAT;
- def CMP_UN_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UN>,
- CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd, setuo>,
- ISA_MIPS32R6, HARDFLOAT;
- def CMP_EQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_EQ>,
- CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd, setoeq>,
- ISA_MIPS32R6, HARDFLOAT;
- def CMP_UEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UEQ>,
- CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd, setueq>,
+ let AdditionalPredicates = [NotInMicroMips] in {
+ def CMP_F_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_AF>,
+ CMP_CONDN_DESC_BASE<"af", Typestr, FGROpnd>,
ISA_MIPS32R6, HARDFLOAT;
- def CMP_LT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LT>,
- CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd, setolt>,
- ISA_MIPS32R6, HARDFLOAT;
- def CMP_ULT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULT>,
- CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd, setult>,
- ISA_MIPS32R6, HARDFLOAT;
- def CMP_LE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LE>,
- CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd, setole>,
- ISA_MIPS32R6, HARDFLOAT;
- def CMP_ULE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULE>,
- CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd, setule>,
- ISA_MIPS32R6, HARDFLOAT;
- def CMP_SAF_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SAF>,
- CMP_CONDN_DESC_BASE<"saf", Typestr, FGROpnd>,
- ISA_MIPS32R6, HARDFLOAT;
- def CMP_SUN_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SUN>,
- CMP_CONDN_DESC_BASE<"sun", Typestr, FGROpnd>,
- ISA_MIPS32R6, HARDFLOAT;
- def CMP_SEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SEQ>,
- CMP_CONDN_DESC_BASE<"seq", Typestr, FGROpnd>,
- ISA_MIPS32R6, HARDFLOAT;
- def CMP_SUEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SUEQ>,
- CMP_CONDN_DESC_BASE<"sueq", Typestr, FGROpnd>,
+ def CMP_UN_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UN>,
+ CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd, setuo>,
ISA_MIPS32R6, HARDFLOAT;
- def CMP_SLT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SLT>,
- CMP_CONDN_DESC_BASE<"slt", Typestr, FGROpnd>,
- ISA_MIPS32R6, HARDFLOAT;
- def CMP_SULT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SULT>,
- CMP_CONDN_DESC_BASE<"sult", Typestr, FGROpnd>,
+ def CMP_EQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_EQ>,
+ CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd, setoeq>,
ISA_MIPS32R6, HARDFLOAT;
- def CMP_SLE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SLE>,
- CMP_CONDN_DESC_BASE<"sle", Typestr, FGROpnd>,
- ISA_MIPS32R6, HARDFLOAT;
- def CMP_SULE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SULE>,
- CMP_CONDN_DESC_BASE<"sule", Typestr, FGROpnd>,
+ def CMP_UEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UEQ>,
+ CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd, setueq>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_LT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LT>,
+ CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd, setolt>,
ISA_MIPS32R6, HARDFLOAT;
+ def CMP_ULT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULT>,
+ CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd, setult>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_LE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LE>,
+ CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd, setole>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_ULE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULE>,
+ CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd, setule>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_SAF_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SAF>,
+ CMP_CONDN_DESC_BASE<"saf", Typestr, FGROpnd>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_SUN_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SUN>,
+ CMP_CONDN_DESC_BASE<"sun", Typestr, FGROpnd>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_SEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SEQ>,
+ CMP_CONDN_DESC_BASE<"seq", Typestr, FGROpnd>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_SUEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SUEQ>,
+ CMP_CONDN_DESC_BASE<"sueq", Typestr, FGROpnd>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_SLT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SLT>,
+ CMP_CONDN_DESC_BASE<"slt", Typestr, FGROpnd>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_SULT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SULT>,
+ CMP_CONDN_DESC_BASE<"sult", Typestr, FGROpnd>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_SLE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SLE>,
+ CMP_CONDN_DESC_BASE<"sle", Typestr, FGROpnd>,
+ ISA_MIPS32R6, HARDFLOAT;
+ def CMP_SULE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SULE>,
+ CMP_CONDN_DESC_BASE<"sule", Typestr, FGROpnd>,
+ ISA_MIPS32R6, HARDFLOAT;
+ }
}
//===----------------------------------------------------------------------===//
@@ -557,7 +559,7 @@ class CACHE_HINT_DESC<string instr_asm, Operand MemOpnd,
dag InOperandList = (ins MemOpnd:$addr, uimm5:$hint);
string AsmString = !strconcat(instr_asm, "\t$hint, $addr");
list<dag> Pattern = [];
- string DecoderMethod = "DecodeCacheOpR6";
+ string DecoderMethod = "DecodeCacheeOp_CacheOpR6";
}
class CACHE_DESC : CACHE_HINT_DESC<"cache", mem_simm9, GPR32Opnd>;
@@ -595,7 +597,7 @@ class LSA_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
list<dag> Pattern = [];
}
-class LSA_R6_DESC : LSA_R6_DESC_BASE<"lsa", GPR32Opnd, uimm2>;
+class LSA_R6_DESC : LSA_R6_DESC_BASE<"lsa", GPR32Opnd, uimm2_plus1>;
class LL_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
dag OutOperandList = (outs GPROpnd:$rt);
@@ -685,8 +687,10 @@ def BNEZC : BNEZC_ENC, BNEZC_DESC, ISA_MIPS32R6;
def BNVC : BNVC_ENC, BNVC_DESC, ISA_MIPS32R6;
def BOVC : BOVC_ENC, BOVC_DESC, ISA_MIPS32R6;
def CACHE_R6 : R6MMR6Rel, CACHE_ENC, CACHE_DESC, ISA_MIPS32R6;
-def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6, HARDFLOAT;
-def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+}
def CLO_R6 : R6MMR6Rel, CLO_R6_ENC, CLO_R6_DESC, ISA_MIPS32R6;
def CLZ_R6 : R6MMR6Rel, CLZ_R6_ENC, CLZ_R6_DESC, ISA_MIPS32R6;
defm S : CMP_CC_M<FIELD_CMP_FORMAT_S, "s", FGR32Opnd>;
@@ -702,39 +706,51 @@ def LSA_R6 : R6MMR6Rel, LSA_R6_ENC, LSA_R6_DESC, ISA_MIPS32R6;
def LWC2_R6 : LWC2_R6_ENC, LWC2_R6_DESC, ISA_MIPS32R6;
def LWPC : R6MMR6Rel, LWPC_ENC, LWPC_DESC, ISA_MIPS32R6;
def LWUPC : LWUPC_ENC, LWUPC_DESC, ISA_MIPS32R6;
-def MADDF_S : MADDF_S_ENC, MADDF_S_DESC, ISA_MIPS32R6, HARDFLOAT;
-def MADDF_D : MADDF_D_ENC, MADDF_D_DESC, ISA_MIPS32R6, HARDFLOAT;
-def MAXA_D : MAXA_D_ENC, MAXA_D_DESC, ISA_MIPS32R6, HARDFLOAT;
-def MAXA_S : MAXA_S_ENC, MAXA_S_DESC, ISA_MIPS32R6, HARDFLOAT;
-def MAX_D : MAX_D_ENC, MAX_D_DESC, ISA_MIPS32R6, HARDFLOAT;
-def MAX_S : MAX_S_ENC, MAX_S_DESC, ISA_MIPS32R6, HARDFLOAT;
-def MINA_D : MINA_D_ENC, MINA_D_DESC, ISA_MIPS32R6, HARDFLOAT;
-def MINA_S : MINA_S_ENC, MINA_S_DESC, ISA_MIPS32R6, HARDFLOAT;
-def MIN_D : MIN_D_ENC, MIN_D_DESC, ISA_MIPS32R6, HARDFLOAT;
-def MIN_S : MIN_S_ENC, MIN_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def MADDF_S : MADDF_S_ENC, MADDF_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def MADDF_D : MADDF_D_ENC, MADDF_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def MAXA_D : MAXA_D_ENC, MAXA_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def MAXA_S : MAXA_S_ENC, MAXA_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def MAX_D : MAX_D_ENC, MAX_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def MAX_S : MAX_S_ENC, MAX_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def MINA_D : MINA_D_ENC, MINA_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def MINA_S : MINA_S_ENC, MINA_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def MIN_D : MIN_D_ENC, MIN_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def MIN_S : MIN_S_ENC, MIN_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+}
def MOD : R6MMR6Rel, MOD_ENC, MOD_DESC, ISA_MIPS32R6;
def MODU : R6MMR6Rel, MODU_ENC, MODU_DESC, ISA_MIPS32R6;
-def MSUBF_S : MSUBF_S_ENC, MSUBF_S_DESC, ISA_MIPS32R6, HARDFLOAT;
-def MSUBF_D : MSUBF_D_ENC, MSUBF_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def MSUBF_S : MSUBF_S_ENC, MSUBF_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def MSUBF_D : MSUBF_D_ENC, MSUBF_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+}
def MUH : R6MMR6Rel, MUH_ENC, MUH_DESC, ISA_MIPS32R6;
def MUHU : R6MMR6Rel, MUHU_ENC, MUHU_DESC, ISA_MIPS32R6;
def MUL_R6 : R6MMR6Rel, MUL_R6_ENC, MUL_R6_DESC, ISA_MIPS32R6;
def MULU : R6MMR6Rel, MULU_ENC, MULU_DESC, ISA_MIPS32R6;
def NAL; // BAL with rd=0
def PREF_R6 : R6MMR6Rel, PREF_ENC, PREF_DESC, ISA_MIPS32R6;
-def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6, HARDFLOAT;
-def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+}
def SC_R6 : SC_R6_ENC, SC_R6_DESC, ISA_MIPS32R6;
+let AdditionalPredicates = [NotInMicroMips] in {
def SDBBP_R6 : SDBBP_R6_ENC, SDBBP_R6_DESC, ISA_MIPS32R6;
+}
def SDC2_R6 : SDC2_R6_ENC, SDC2_R6_DESC, ISA_MIPS32R6;
def SELEQZ : R6MMR6Rel, SELEQZ_ENC, SELEQZ_DESC, ISA_MIPS32R6, GPR_32;
-def SELEQZ_D : SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6, HARDFLOAT;
-def SELEQZ_S : SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def SELEQZ_D : SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def SELEQZ_S : SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+}
def SELNEZ : R6MMR6Rel, SELNEZ_ENC, SELNEZ_DESC, ISA_MIPS32R6, GPR_32;
-def SELNEZ_D : SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6, HARDFLOAT;
-def SELNEZ_S : SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6, HARDFLOAT;
-def SEL_D : SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6, HARDFLOAT;
-def SEL_S : SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def SELNEZ_D : SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def SELNEZ_S : SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def SEL_D : SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+ def SEL_S : SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+}
def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6;
//===----------------------------------------------------------------------===//
@@ -743,7 +759,9 @@ def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6;
//
//===----------------------------------------------------------------------===//
+let AdditionalPredicates = [NotInMicroMips] in {
def : MipsInstAlias<"sdbbp", (SDBBP_R6 0)>, ISA_MIPS32R6;
+}
def : MipsInstAlias<"jr $rs", (JALR ZERO, GPR32Opnd:$rs), 1>, ISA_MIPS32R6;
//===----------------------------------------------------------------------===//
@@ -752,84 +770,78 @@ def : MipsInstAlias<"jr $rs", (JALR ZERO, GPR32Opnd:$rs), 1>, ISA_MIPS32R6;
//
//===----------------------------------------------------------------------===//
-// f32 comparisons supported via another comparison
-def : MipsPat<(setone f32:$lhs, f32:$rhs),
- (NOR (CMP_UEQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6;
-def : MipsPat<(seto f32:$lhs, f32:$rhs),
- (NOR (CMP_UN_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6;
-def : MipsPat<(setune f32:$lhs, f32:$rhs),
- (NOR (CMP_EQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6;
-def : MipsPat<(seteq f32:$lhs, f32:$rhs), (CMP_EQ_S f32:$lhs, f32:$rhs)>,
- ISA_MIPS32R6;
-def : MipsPat<(setgt f32:$lhs, f32:$rhs), (CMP_LE_S f32:$rhs, f32:$lhs)>,
- ISA_MIPS32R6;
-def : MipsPat<(setge f32:$lhs, f32:$rhs), (CMP_LT_S f32:$rhs, f32:$lhs)>,
- ISA_MIPS32R6;
-def : MipsPat<(setlt f32:$lhs, f32:$rhs), (CMP_LT_S f32:$lhs, f32:$rhs)>,
- ISA_MIPS32R6;
-def : MipsPat<(setle f32:$lhs, f32:$rhs), (CMP_LE_S f32:$lhs, f32:$rhs)>,
- ISA_MIPS32R6;
-def : MipsPat<(setne f32:$lhs, f32:$rhs),
- (NOR (CMP_EQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6;
-
-// f64 comparisons supported via another comparison
-def : MipsPat<(setone f64:$lhs, f64:$rhs),
- (NOR (CMP_UEQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6;
-def : MipsPat<(seto f64:$lhs, f64:$rhs),
- (NOR (CMP_UN_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6;
-def : MipsPat<(setune f64:$lhs, f64:$rhs),
- (NOR (CMP_EQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6;
-def : MipsPat<(seteq f64:$lhs, f64:$rhs), (CMP_EQ_D f64:$lhs, f64:$rhs)>,
- ISA_MIPS32R6;
-def : MipsPat<(setgt f64:$lhs, f64:$rhs), (CMP_LE_D f64:$rhs, f64:$lhs)>,
- ISA_MIPS32R6;
-def : MipsPat<(setge f64:$lhs, f64:$rhs), (CMP_LT_D f64:$rhs, f64:$lhs)>,
- ISA_MIPS32R6;
-def : MipsPat<(setlt f64:$lhs, f64:$rhs), (CMP_LT_D f64:$lhs, f64:$rhs)>,
- ISA_MIPS32R6;
-def : MipsPat<(setle f64:$lhs, f64:$rhs), (CMP_LE_D f64:$lhs, f64:$rhs)>,
- ISA_MIPS32R6;
-def : MipsPat<(setne f64:$lhs, f64:$rhs),
- (NOR (CMP_EQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6;
+// Comparisons supported via another comparison
+multiclass Cmp_Pats<ValueType VT, Instruction NOROp, Register ZEROReg> {
+def : MipsPat<(setone VT:$lhs, VT:$rhs),
+ (NOROp (!cast<Instruction>("CMP_UEQ_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>;
+def : MipsPat<(seto VT:$lhs, VT:$rhs),
+ (NOROp (!cast<Instruction>("CMP_UN_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>;
+def : MipsPat<(setune VT:$lhs, VT:$rhs),
+ (NOROp (!cast<Instruction>("CMP_EQ_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>;
+def : MipsPat<(seteq VT:$lhs, VT:$rhs),
+ (!cast<Instruction>("CMP_EQ_"#NAME) VT:$lhs, VT:$rhs)>;
+def : MipsPat<(setgt VT:$lhs, VT:$rhs),
+ (!cast<Instruction>("CMP_LE_"#NAME) VT:$rhs, VT:$lhs)>;
+def : MipsPat<(setge VT:$lhs, VT:$rhs),
+ (!cast<Instruction>("CMP_LT_"#NAME) VT:$rhs, VT:$lhs)>;
+def : MipsPat<(setlt VT:$lhs, VT:$rhs),
+ (!cast<Instruction>("CMP_LT_"#NAME) VT:$lhs, VT:$rhs)>;
+def : MipsPat<(setle VT:$lhs, VT:$rhs),
+ (!cast<Instruction>("CMP_LE_"#NAME) VT:$lhs, VT:$rhs)>;
+def : MipsPat<(setne VT:$lhs, VT:$rhs),
+ (NOROp (!cast<Instruction>("CMP_EQ_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>;
+}
+
+defm S : Cmp_Pats<f32, NOR, ZERO>, ISA_MIPS32R6;
+defm D : Cmp_Pats<f64, NOR, ZERO>, ISA_MIPS32R6;
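+// For example, the f32 instantiation above produces the same patterns that
+// were previously written out by hand: (setone f32:$lhs, f32:$rhs) selects to
+// (NOR (CMP_UEQ_S f32:$lhs, f32:$rhs), ZERO), and (setgt f32:$lhs, f32:$rhs)
+// selects to (CMP_LE_S f32:$rhs, f32:$lhs) with the operands swapped.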
// i32 selects
+multiclass SelectInt_Pats<ValueType RC, Instruction OROp, Instruction XORiOp,
+ Instruction SLTiOp, Instruction SLTiuOp,
+ Instruction SELEQZOp, Instruction SELNEZOp,
+ SDPatternOperator imm_type, ValueType Opg> {
+// reg, immz
+def : MipsPat<(select (Opg (seteq RC:$cond, immz)), RC:$t, RC:$f),
+ (OROp (SELEQZOp RC:$t, RC:$cond), (SELNEZOp RC:$f, RC:$cond))>;
+def : MipsPat<(select (Opg (setne RC:$cond, immz)), RC:$t, RC:$f),
+ (OROp (SELNEZOp RC:$t, RC:$cond), (SELEQZOp RC:$f, RC:$cond))>;
+
+// reg, immZExt16[_64]
+def : MipsPat<(select (Opg (seteq RC:$cond, imm_type:$imm)), RC:$t, RC:$f),
+ (OROp (SELEQZOp RC:$t, (XORiOp RC:$cond, imm_type:$imm)),
+ (SELNEZOp RC:$f, (XORiOp RC:$cond, imm_type:$imm)))>;
+def : MipsPat<(select (Opg (setne RC:$cond, imm_type:$imm)), RC:$t, RC:$f),
+ (OROp (SELNEZOp RC:$t, (XORiOp RC:$cond, imm_type:$imm)),
+ (SELEQZOp RC:$f, (XORiOp RC:$cond, imm_type:$imm)))>;
+
+// reg, immSExt16Plus1
+def : MipsPat<(select (Opg (setgt RC:$cond, immSExt16Plus1:$imm)), RC:$t, RC:$f),
+ (OROp (SELEQZOp RC:$t, (SLTiOp RC:$cond, (Plus1 imm:$imm))),
+ (SELNEZOp RC:$f, (SLTiOp RC:$cond, (Plus1 imm:$imm))))>;
+def : MipsPat<(select (Opg (setugt RC:$cond, immSExt16Plus1:$imm)), RC:$t, RC:$f),
+ (OROp (SELEQZOp RC:$t, (SLTiuOp RC:$cond, (Plus1 imm:$imm))),
+ (SELNEZOp RC:$f, (SLTiuOp RC:$cond, (Plus1 imm:$imm))))>;
+
+def : MipsPat<(select (Opg (seteq RC:$cond, immz)), RC:$t, immz),
+ (SELEQZOp RC:$t, RC:$cond)>;
+def : MipsPat<(select (Opg (setne RC:$cond, immz)), RC:$t, immz),
+ (SELNEZOp RC:$t, RC:$cond)>;
+def : MipsPat<(select (Opg (seteq RC:$cond, immz)), immz, RC:$f),
+ (SELNEZOp RC:$f, RC:$cond)>;
+def : MipsPat<(select (Opg (setne RC:$cond, immz)), immz, RC:$f),
+ (SELEQZOp RC:$f, RC:$cond)>;
+}
+
+defm : SelectInt_Pats<i32, OR, XORi, SLTi, SLTiu, SELEQZ, SELNEZ,
+ immZExt16, i32>, ISA_MIPS32R6;
+
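+// The i32 instantiation of SelectInt_Pats above covers the former hand-written
+// patterns; e.g. (select (i32 (seteq i32:$cond, immZExt16:$imm)), i32:$t,
+// i32:$f) still selects to an OR of (SELEQZ i32:$t, (XORi i32:$cond, $imm))
+// and (SELNEZ i32:$f, (XORi i32:$cond, $imm)).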
def : MipsPat<(select i32:$cond, i32:$t, i32:$f),
- (OR (SELNEZ i32:$t, i32:$cond), (SELEQZ i32:$f, i32:$cond))>,
- ISA_MIPS32R6;
-def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i32:$t, i32:$f),
- (OR (SELEQZ i32:$t, i32:$cond), (SELNEZ i32:$f, i32:$cond))>,
- ISA_MIPS32R6;
-def : MipsPat<(select (i32 (setne i32:$cond, immz)), i32:$t, i32:$f),
- (OR (SELNEZ i32:$t, i32:$cond), (SELEQZ i32:$f, i32:$cond))>,
- ISA_MIPS32R6;
-def : MipsPat<(select (i32 (seteq i32:$cond, immZExt16:$imm)), i32:$t, i32:$f),
- (OR (SELEQZ i32:$t, (XORi i32:$cond, immZExt16:$imm)),
- (SELNEZ i32:$f, (XORi i32:$cond, immZExt16:$imm)))>,
+ (OR (SELNEZ i32:$t, i32:$cond),
+ (SELEQZ i32:$f, i32:$cond))>,
ISA_MIPS32R6;
-def : MipsPat<(select (i32 (setne i32:$cond, immZExt16:$imm)), i32:$t, i32:$f),
- (OR (SELNEZ i32:$t, (XORi i32:$cond, immZExt16:$imm)),
- (SELEQZ i32:$f, (XORi i32:$cond, immZExt16:$imm)))>,
- ISA_MIPS32R6;
-def : MipsPat<(select (i32 (setgt i32:$cond, immSExt16Plus1:$imm)), i32:$t,
- i32:$f),
- (OR (SELEQZ i32:$t, (SLTi i32:$cond, (Plus1 imm:$imm))),
- (SELNEZ i32:$f, (SLTi i32:$cond, (Plus1 imm:$imm))))>,
- ISA_MIPS32R6;
-def : MipsPat<(select (i32 (setugt i32:$cond, immSExt16Plus1:$imm)),
- i32:$t, i32:$f),
- (OR (SELEQZ i32:$t, (SLTiu i32:$cond, (Plus1 imm:$imm))),
- (SELNEZ i32:$f, (SLTiu i32:$cond, (Plus1 imm:$imm))))>,
- ISA_MIPS32R6;
-
def : MipsPat<(select i32:$cond, i32:$t, immz),
- (SELNEZ i32:$t, i32:$cond)>, ISA_MIPS32R6;
-def : MipsPat<(select (i32 (setne i32:$cond, immz)), i32:$t, immz),
- (SELNEZ i32:$t, i32:$cond)>, ISA_MIPS32R6;
-def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i32:$t, immz),
- (SELEQZ i32:$t, i32:$cond)>, ISA_MIPS32R6;
+ (SELNEZ i32:$t, i32:$cond)>,
+ ISA_MIPS32R6;
def : MipsPat<(select i32:$cond, immz, i32:$f),
- (SELEQZ i32:$f, i32:$cond)>, ISA_MIPS32R6;
-def : MipsPat<(select (i32 (setne i32:$cond, immz)), immz, i32:$f),
- (SELEQZ i32:$f, i32:$cond)>, ISA_MIPS32R6;
-def : MipsPat<(select (i32 (seteq i32:$cond, immz)), immz, i32:$f),
- (SELNEZ i32:$f, i32:$cond)>, ISA_MIPS32R6;
+ (SELEQZ i32:$f, i32:$cond)>,
+ ISA_MIPS32R6;
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index f917ecad4a53..cbdcdd788bec 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -16,10 +16,6 @@
//===----------------------------------------------------------------------===//
// Unsigned Operand
-def uimm5_64 : Operand<i64> {
- let PrintMethod = "printUnsignedImm";
-}
-
def uimm16_64 : Operand<i64> {
let PrintMethod = "printUnsignedImm";
}
@@ -276,12 +272,20 @@ def LEA_ADDiu64 : EffectiveAddress<"daddiu", GPR64Opnd>, LW_FM<0x19>;
let isCodeGenOnly = 1 in
def RDHWR64 : ReadHardware<GPR64Opnd, HWRegsOpnd>, RDHWR_FM;
-def DEXT : ExtBase<"dext", GPR64Opnd, uimm6, MipsExt>, EXT_FM<3>;
-def DEXTU : ExtBase<"dextu", GPR64Opnd, uimm6>, EXT_FM<2>;
-def DEXTM : ExtBase<"dextm", GPR64Opnd, uimm5>, EXT_FM<1>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ // TODO: Add 'pos + size' constraint check to dext* instructions
+ // DEXT: 0 < pos + size <= 63
+ // DEXTM, DEXTU: 32 < pos + size <= 64
+ def DEXT : ExtBase<"dext", GPR64Opnd, uimm5, uimm5_plus1, MipsExt>,
+ EXT_FM<3>;
+ def DEXTM : ExtBase<"dextm", GPR64Opnd, uimm5, uimm5_plus33, MipsExt>,
+ EXT_FM<1>;
+ def DEXTU : ExtBase<"dextu", GPR64Opnd, uimm5_plus32, uimm5_plus1,
+ MipsExt>, EXT_FM<2>;
+}
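+// For instance, assuming uimm5_plus32 spans 32-63 and uimm5_plus1 spans 1-32,
+// "dextu $2, $3, 63, 32" encodes pos + size = 95; the constraint check in the
+// TODO above would need to reject it, since DEXTU requires pos + size <= 64.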
def DINS : InsBase<"dins", GPR64Opnd, uimm6, MipsIns>, EXT_FM<7>;
-def DINSU : InsBase<"dinsu", GPR64Opnd, uimm6>, EXT_FM<6>;
+def DINSU : InsBase<"dinsu", GPR64Opnd, uimm5_plus32>, EXT_FM<6>;
def DINSM : InsBase<"dinsm", GPR64Opnd, uimm5>, EXT_FM<5>;
let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
@@ -341,11 +345,11 @@ class SetCC64_I<string opstr, PatFrag cond_op>:
}
class CBranchBitNum<string opstr, DAGOperand opnd, PatFrag cond_op,
- RegisterOperand RO, bits<64> shift = 1> :
- InstSE<(outs), (ins RO:$rs, uimm5_64:$p, opnd:$offset),
+ RegisterOperand RO, Operand ImmOp, bits<64> shift = 1> :
+ InstSE<(outs), (ins RO:$rs, ImmOp:$p, opnd:$offset),
!strconcat(opstr, "\t$rs, $p, $offset"),
[(brcond (i32 (cond_op (and RO:$rs, (shl shift, immZExt5_64:$p)), 0)),
- bb:$offset)], IIBranch, FrmI, opstr> {
+ bb:$offset)], II_BBIT, FrmI, opstr> {
let isBranch = 1;
let isTerminator = 1;
let hasDelaySlot = 1;
@@ -363,14 +367,17 @@ def BADDu : ArithLogicR<"baddu", GPR64Opnd, 1, II_BADDU>,
ADD_FM<0x1c, 0x28>;
// Branch on Bit Clear /+32
-def BBIT0 : CBranchBitNum<"bbit0", brtarget, seteq, GPR64Opnd>, BBIT_FM<0x32>;
-def BBIT032: CBranchBitNum<"bbit032", brtarget, seteq, GPR64Opnd, 0x100000000>,
+def BBIT0 : CBranchBitNum<"bbit0", brtarget, seteq, GPR64Opnd,
+ uimm5_64_report_uimm6>, BBIT_FM<0x32>;
+def BBIT032: CBranchBitNum<"bbit032", brtarget, seteq, GPR64Opnd, uimm5_64,
+ 0x100000000>,
BBIT_FM<0x36>;
// Branch on Bit Set /+32
-def BBIT1 : CBranchBitNum<"bbit1", brtarget, setne, GPR64Opnd>, BBIT_FM<0x3a>;
-def BBIT132: CBranchBitNum<"bbit132", brtarget, setne, GPR64Opnd, 0x100000000>,
- BBIT_FM<0x3e>;
+def BBIT1 : CBranchBitNum<"bbit1", brtarget, setne, GPR64Opnd,
+ uimm5_64_report_uimm6>, BBIT_FM<0x3a>;
+def BBIT132: CBranchBitNum<"bbit132", brtarget, setne, GPR64Opnd, uimm5_64,
+ 0x100000000>, BBIT_FM<0x3e>;
// Multiply Doubleword to GPR
let Defs = [HI0, LO0, P0, P1, P2] in
@@ -544,10 +551,25 @@ def : MipsPat<(brcond (i32 (setne (and i64:$lhs, PowerOf2HI:$mask), 0)), bb:$dst
(BBIT132 i64:$lhs, (Log2HI PowerOf2HI:$mask), bb:$dst)>;
}
+// Atomic load patterns.
+def : MipsPat<(atomic_load_8 addr:$a), (LB64 addr:$a)>;
+def : MipsPat<(atomic_load_16 addr:$a), (LH64 addr:$a)>;
+def : MipsPat<(atomic_load_32 addr:$a), (LW64 addr:$a)>;
+def : MipsPat<(atomic_load_64 addr:$a), (LD addr:$a)>;
+
+// Atomic store patterns.
+def : MipsPat<(atomic_store_8 addr:$a, GPR64:$v), (SB64 GPR64:$v, addr:$a)>;
+def : MipsPat<(atomic_store_16 addr:$a, GPR64:$v), (SH64 GPR64:$v, addr:$a)>;
+def : MipsPat<(atomic_store_32 addr:$a, GPR64:$v), (SW64 GPR64:$v, addr:$a)>;
+def : MipsPat<(atomic_store_64 addr:$a, GPR64:$v), (SD GPR64:$v, addr:$a)>;
+
//===----------------------------------------------------------------------===//
// Instruction aliases
//===----------------------------------------------------------------------===//
def : MipsInstAlias<"move $dst, $src",
+ (OR64 GPR64Opnd:$dst, GPR64Opnd:$src, ZERO_64), 1>,
+ GPR_64;
+def : MipsInstAlias<"move $dst, $src",
(DADDu GPR64Opnd:$dst, GPR64Opnd:$src, ZERO_64), 1>,
GPR_64;
def : MipsInstAlias<"daddu $rs, $rt, $imm",
@@ -617,6 +639,38 @@ def : MipsInstAlias<"syncw", (SYNC 0x4), 0>;
def : MipsInstAlias<"syncws", (SYNC 0x5), 0>;
}
+// cnMIPS Aliases.
+
+// bbit* with $p 32-63 is converted to bbit*32 with $p 0-31
+def : MipsInstAlias<"bbit0 $rs, $p, $offset",
+ (BBIT032 GPR64Opnd:$rs, uimm5_plus32_normalize_64:$p,
+ brtarget:$offset), 0>,
+ ASE_CNMIPS;
+def : MipsInstAlias<"bbit1 $rs, $p, $offset",
+ (BBIT132 GPR64Opnd:$rs, uimm5_plus32_normalize_64:$p,
+ brtarget:$offset), 0>,
+ ASE_CNMIPS;
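+// For example, "bbit0 $4, 37, foo" is accepted here and emitted as
+// (BBIT032 $4, 5, foo); the bit position is normalized by subtracting 32.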
+
+// exts with $pos 32-63 is converted to exts32 with $pos 0-31
+def : MipsInstAlias<"exts $rt, $rs, $pos, $lenm1",
+ (EXTS32 GPR64Opnd:$rt, GPR64Opnd:$rs,
+ uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>,
+ ASE_CNMIPS;
+def : MipsInstAlias<"exts $rt, $pos, $lenm1",
+ (EXTS32 GPR64Opnd:$rt, GPR64Opnd:$rt,
+ uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>,
+ ASE_CNMIPS;
+
+// cins with $pos 32-63 is converted to cins32 with $pos 0-31
+def : MipsInstAlias<"cins $rt, $rs, $pos, $lenm1",
+ (CINS32 GPR64Opnd:$rt, GPR64Opnd:$rs,
+ uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>,
+ ASE_CNMIPS;
+def : MipsInstAlias<"cins $rt, $pos, $lenm1",
+ (CINS32 GPR64Opnd:$rt, GPR64Opnd:$rt,
+ uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>,
+ ASE_CNMIPS;
+
//===----------------------------------------------------------------------===//
// Assembler Pseudo Instructions
//===----------------------------------------------------------------------===//
@@ -625,3 +679,8 @@ class LoadImmediate64<string instr_asm, Operand Od, RegisterOperand RO> :
MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm64),
!strconcat(instr_asm, "\t$rt, $imm64")> ;
def LoadImm64 : LoadImmediate64<"dli", imm64, GPR64Opnd>;
+
+def LoadAddrReg64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rt), (ins mem:$addr),
+ "dla\t$rt, $addr">;
+def LoadAddrImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rt), (ins imm64:$imm64),
+ "dla\t$rt, $imm64">;
diff --git a/lib/Target/Mips/Mips64r6InstrInfo.td b/lib/Target/Mips/Mips64r6InstrInfo.td
index 6b546e864bd3..6f34dbe28d30 100644
--- a/lib/Target/Mips/Mips64r6InstrInfo.td
+++ b/lib/Target/Mips/Mips64r6InstrInfo.td
@@ -62,7 +62,7 @@ class DCLO_R6_DESC : CLO_R6_DESC_BASE<"dclo", GPR64Opnd>;
class DCLZ_R6_DESC : CLZ_R6_DESC_BASE<"dclz", GPR64Opnd>;
class DDIV_DESC : DIVMOD_DESC_BASE<"ddiv", GPR64Opnd, sdiv>;
class DDIVU_DESC : DIVMOD_DESC_BASE<"ddivu", GPR64Opnd, udiv>;
-class DLSA_R6_DESC : LSA_R6_DESC_BASE<"dlsa", GPR64Opnd, uimm2>;
+class DLSA_R6_DESC : LSA_R6_DESC_BASE<"dlsa", GPR64Opnd, uimm2_plus1>;
class DMOD_DESC : DIVMOD_DESC_BASE<"dmod", GPR64Opnd, srem>;
class DMODU_DESC : DIVMOD_DESC_BASE<"dmodu", GPR64Opnd, urem>;
class DMUH_DESC : MUL_R6_DESC_BASE<"dmuh", GPR64Opnd, mulhs>;
@@ -81,10 +81,12 @@ class SELNEZ64_DESC : SELEQNE_Z_DESC_BASE<"selnez", GPR64Opnd>;
//
//===----------------------------------------------------------------------===//
-def DAHI : DAHI_ENC, DAHI_DESC, ISA_MIPS64R6;
-def DALIGN : DALIGN_ENC, DALIGN_DESC, ISA_MIPS64R6;
-def DATI : DATI_ENC, DATI_DESC, ISA_MIPS64R6;
-def DAUI : DAUI_ENC, DAUI_DESC, ISA_MIPS64R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def DATI : DATI_ENC, DATI_DESC, ISA_MIPS64R6;
+ def DAHI : DAHI_ENC, DAHI_DESC, ISA_MIPS64R6;
+ def DAUI : DAUI_ENC, DAUI_DESC, ISA_MIPS64R6;
+ def DALIGN : DALIGN_ENC, DALIGN_DESC, ISA_MIPS64R6;
+}
def DBITSWAP : DBITSWAP_ENC, DBITSWAP_DESC, ISA_MIPS64R6;
def DCLO_R6 : DCLO_R6_ENC, DCLO_R6_DESC, ISA_MIPS64R6;
def DCLZ_R6 : DCLZ_R6_ENC, DCLZ_R6_DESC, ISA_MIPS64R6;
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index fdba064b5c5e..957529376b37 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -169,12 +169,12 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
if (MCPE.isMachineConstantPoolEntry())
EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
else
- EmitGlobalConstant(MCPE.Val.ConstVal);
+ EmitGlobalConstant(MF->getDataLayout(), MCPE.Val.ConstVal);
return;
}
- MachineBasicBlock::const_instr_iterator I = MI;
+ MachineBasicBlock::const_instr_iterator I = MI->getIterator();
MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
do {
@@ -202,7 +202,7 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
llvm_unreachable("Pseudo opcode found in EmitInstruction()");
MCInst TmpInst0;
- MCInstLowering.Lower(I, TmpInst0);
+ MCInstLowering.Lower(&*I, TmpInst0);
EmitToStreamer(*OutStreamer, TmpInst0);
} while ((++I != E) && I->isInsideBundle()); // Delay slot check
}
@@ -405,7 +405,7 @@ bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock*
// If this is a landing pad, it isn't a fall through. If it has no preds,
// then nothing falls through to it.
- if (MBB->isLandingPad() || MBB->pred_empty())
+ if (MBB->isEHPad() || MBB->pred_empty())
return false;
// If there isn't exactly one predecessor, it can't be a fall through.
@@ -559,7 +559,6 @@ bool MipsAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
raw_ostream &O) {
- const DataLayout *DL = TM.getDataLayout();
const MachineOperand &MO = MI->getOperand(opNum);
bool closeP = false;
@@ -608,7 +607,7 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
}
case MachineOperand::MO_ConstantPoolIndex:
- O << DL->getPrivateGlobalPrefix() << "CPI"
+ O << getDataLayout().getPrivateGlobalPrefix() << "CPI"
<< getFunctionNumber() << "_" << MO.getIndex();
if (MO.getOffset())
O << "+" << MO.getOffset();
@@ -1009,7 +1008,7 @@ void MipsAsmPrinter::EmitFPCallStub(
//
// Mov $18, $31
- EmitInstrRegRegReg(*STI, Mips::ADDu, Mips::S2, Mips::RA, Mips::ZERO);
+ EmitInstrRegRegReg(*STI, Mips::OR, Mips::S2, Mips::RA, Mips::ZERO);
EmitSwapFPIntParams(*STI, Signature->ParamSig, LE, true);
diff --git a/lib/Target/Mips/MipsCCState.cpp b/lib/Target/Mips/MipsCCState.cpp
index b8081295ca64..d82063e3d2a9 100644
--- a/lib/Target/Mips/MipsCCState.cpp
+++ b/lib/Target/Mips/MipsCCState.cpp
@@ -29,22 +29,16 @@ static bool isF128SoftLibCall(const char *CallSym) {
"powl", "rintl", "sinl", "sqrtl",
"truncl"};
- const char *const *End = LibCalls + array_lengthof(LibCalls);
-
// Check that LibCalls is sorted alphabetically.
- MipsTargetLowering::LTStr Comp;
-
-#ifndef NDEBUG
- for (const char *const *I = LibCalls; I < End - 1; ++I)
- assert(Comp(*I, *(I + 1)));
-#endif
-
- return std::binary_search(LibCalls, End, CallSym, Comp);
+ auto Comp = [](const char *S1, const char *S2) { return strcmp(S1, S2) < 0; };
+ assert(std::is_sorted(std::begin(LibCalls), std::end(LibCalls), Comp));
+ return std::binary_search(std::begin(LibCalls), std::end(LibCalls),
+ CallSym, Comp);
}
/// This function returns true if Ty is fp128, {f128} or i128 which was
/// originally a fp128.
-static bool originalTypeIsF128(const Type *Ty, const SDNode *CallNode) {
+static bool originalTypeIsF128(Type *Ty, const SDNode *CallNode) {
if (Ty->isFP128Ty())
return true;
diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td
index 93e1908083cb..0b4b7785af67 100644
--- a/lib/Target/Mips/MipsCallingConv.td
+++ b/lib/Target/Mips/MipsCallingConv.td
@@ -427,3 +427,28 @@ def CSR_Mips16RetHelper :
CalleeSavedRegs<(add V0, V1, FP,
(sequence "A%u", 3, 0), (sequence "S%u", 7, 0),
(sequence "D%u", 15, 10))>;
+
+def CSR_Interrupt_32R6 : CalleeSavedRegs<(add (sequence "A%u", 3, 0),
+ (sequence "S%u", 7, 0),
+ (sequence "V%u", 1, 0),
+ (sequence "T%u", 9, 0),
+ RA, FP, GP, AT)>;
+
+def CSR_Interrupt_32 : CalleeSavedRegs<(add (sequence "A%u", 3, 0),
+ (sequence "S%u", 7, 0),
+ (sequence "V%u", 1, 0),
+ (sequence "T%u", 9, 0),
+ RA, FP, GP, AT, LO0, HI0)>;
+
+def CSR_Interrupt_64R6 : CalleeSavedRegs<(add (sequence "A%u_64", 3, 0),
+ (sequence "V%u_64", 1, 0),
+ (sequence "S%u_64", 7, 0),
+ (sequence "T%u_64", 9, 0),
+ RA_64, FP_64, GP_64, AT_64)>;
+
+def CSR_Interrupt_64 : CalleeSavedRegs<(add (sequence "A%u_64", 3, 0),
+ (sequence "S%u_64", 7, 0),
+ (sequence "T%u_64", 9, 0),
+ (sequence "V%u_64", 1, 0),
+ RA_64, FP_64, GP_64, AT_64,
+ LO0_64, HI0_64)>;
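+// The pre-R6 interrupt sets also preserve the HI0/LO0 accumulators; MIPS
+// Release 6 removed those registers, so the R6 sets omit them.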
diff --git a/lib/Target/Mips/MipsConstantIslandPass.cpp b/lib/Target/Mips/MipsConstantIslandPass.cpp
index 96553d28fc57..ea8c5871fa0e 100644
--- a/lib/Target/Mips/MipsConstantIslandPass.cpp
+++ b/lib/Target/Mips/MipsConstantIslandPass.cpp
@@ -560,7 +560,7 @@ MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
// identity mapping of CPI's to CPE's.
const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants();
- const DataLayout &TD = *MF->getTarget().getDataLayout();
+ const DataLayout &TD = MF->getDataLayout();
for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
assert(Size >= 4 && "Too small constant pool entry");
@@ -598,12 +598,12 @@ MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
/// into the block immediately after it.
static bool BBHasFallthrough(MachineBasicBlock *MBB) {
// Get the next machine basic block in the function.
- MachineFunction::iterator MBBI = MBB;
+ MachineFunction::iterator MBBI = MBB->getIterator();
// Can't fall off end of function.
if (std::next(MBBI) == MBB->getParent()->end())
return false;
- MachineBasicBlock *NextBB = std::next(MBBI);
+ MachineBasicBlock *NextBB = &*std::next(MBBI);
for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
E = MBB->succ_end(); I != E; ++I)
if (*I == NextBB)
@@ -656,11 +656,11 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
// alignment assumptions, as we don't know for sure the size of any
// instructions in the inline assembly.
for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I)
- computeBlockSize(I);
+ computeBlockSize(&*I);
// Compute block offsets.
- adjustBBOffsetsAfter(MF->begin());
+ adjustBBOffsetsAfter(&MF->front());
// Now go back through the instructions and build up our data structures.
for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
@@ -879,7 +879,7 @@ MachineBasicBlock *MipsConstantIslands::splitBlockBeforeInstr
// Create a new MBB for the code after the OrigBB.
MachineBasicBlock *NewBB =
MF->CreateMachineBasicBlock(OrigBB->getBasicBlock());
- MachineFunction::iterator MBBI = OrigBB; ++MBBI;
+ MachineFunction::iterator MBBI = ++OrigBB->getIterator();
MF->insert(MBBI, NewBB);
// Splice the instructions starting with MI over to NewBB.
@@ -967,8 +967,8 @@ bool MipsConstantIslands::isWaterInRange(unsigned UserOffset,
unsigned CPELogAlign = getCPELogAlign(U.CPEMI);
unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign);
unsigned NextBlockOffset, NextBlockAlignment;
- MachineFunction::const_iterator NextBlock = Water;
- if (++NextBlock == MF->end()) {
+ MachineFunction::const_iterator NextBlock = ++Water->getIterator();
+ if (NextBlock == MF->end()) {
NextBlockOffset = BBInfo[Water->getNumber()].postOffset();
NextBlockAlignment = 0;
} else {
@@ -1261,7 +1261,7 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex,
if (isOffsetInRange(UserOffset, CPEOffset, U)) {
DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber()
<< format(", expected CPE offset %#x\n", CPEOffset));
- NewMBB = std::next(MachineFunction::iterator(UserMBB));
+ NewMBB = &*++UserMBB->getIterator();
// Add an unconditional branch from UserMBB to fallthrough block. Record
// it for branch lengthening; this new branch will not get out of range,
// but if the preceding conditional branch is out of range, the targets
@@ -1371,8 +1371,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
NewWaterList.insert(NewIsland);
// The new CPE goes before the following block (NewMBB).
- NewMBB = std::next(MachineFunction::iterator(WaterBB));
-
+ NewMBB = &*++WaterBB->getIterator();
} else {
// No water found.
// We first see if a longer form of the instruction could have reached
@@ -1389,7 +1388,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
// next iteration for constant pools, but in this context, we don't want
// it. Check for this so it will be removed from the WaterList.
// Also remove any entry from NewWaterList.
- MachineBasicBlock *WaterBB = std::prev(MachineFunction::iterator(NewMBB));
+ MachineBasicBlock *WaterBB = &*--NewMBB->getIterator();
IP = std::find(WaterList.begin(), WaterList.end(), WaterBB);
if (IP != WaterList.end())
NewWaterList.erase(WaterBB);
@@ -1406,7 +1405,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
WaterList.erase(IP);
// Okay, we know we can put an island before NewMBB now, do it!
- MF->insert(NewMBB, NewIsland);
+ MF->insert(NewMBB->getIterator(), NewIsland);
// Update internal data structures to account for the newly inserted MBB.
updateForInsertedWaterBlock(NewIsland);
@@ -1431,9 +1430,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
// Increase the size of the island block to account for the new entry.
BBInfo[NewIsland->getNumber()].Size += Size;
- adjustBBOffsetsAfter(std::prev(MachineFunction::iterator(NewIsland)));
-
-
+ adjustBBOffsetsAfter(&*--NewIsland->getIterator());
// Finally, change the CPI in the instruction operand to be ID.
for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i)
@@ -1645,7 +1642,7 @@ MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) {
MBB->back().eraseFromParent();
// BBInfo[SplitBB].Offset is wrong temporarily, fixed below
}
- MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB));
+ MachineBasicBlock *NextBB = &*++MBB->getIterator();
DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber()
<< " also invert condition and change dest. to BB#"
diff --git a/lib/Target/Mips/MipsDSPInstrFormats.td b/lib/Target/Mips/MipsDSPInstrFormats.td
index b5d52ced9d3d..f959bd4d8db3 100644
--- a/lib/Target/Mips/MipsDSPInstrFormats.td
+++ b/lib/Target/Mips/MipsDSPInstrFormats.td
@@ -7,10 +7,30 @@
//
//===----------------------------------------------------------------------===//
+class DspMMRel;
+
+def Dsp2MicroMips : InstrMapping {
+ let FilterClass = "DspMMRel";
+  // Instructions with the same BaseOpcode form a row.
+  let RowFields = ["BaseOpcode"];
+  // Instructions with the same Arch value form a column.
+  let ColFields = ["Arch"];
+  // The key column is the standard DSP encoding of each instruction.
+  let KeyCol = ["dsp"];
+  // The value columns give the DSP and microMIPS DSP variants.
+ let ValueCols = [["dsp"], ["mmdsp"]];
+}
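+// A DSP instruction opts into this mapping by inheriting from DspMMRel and
+// providing BaseOpcode and Arch (see DSPInst below); TableGen then pairs it
+// with the microMIPS DSP definition that shares the same BaseOpcode.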
+
def HasDSP : Predicate<"Subtarget->hasDSP()">,
AssemblerPredicate<"FeatureDSP">;
def HasDSPR2 : Predicate<"Subtarget->hasDSPR2()">,
AssemblerPredicate<"FeatureDSPR2">;
+def HasDSPR3 : Predicate<"Subtarget->hasDSPR3()">,
+ AssemblerPredicate<"FeatureDSPR3">;
+
+class ISA_DSPR2 {
+ list<Predicate> InsnPredicates = [HasDSPR2];
+}
// Fields.
class Field6<bits<6> val> {
@@ -20,14 +40,22 @@ class Field6<bits<6> val> {
def SPECIAL3_OPCODE : Field6<0b011111>;
def REGIMM_OPCODE : Field6<0b000001>;
-class DSPInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> {
- let Predicates = [HasDSP];
+class DSPInst<string opstr = "">
+ : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, PredicateControl {
+ let InsnPredicates = [HasDSP];
+ string BaseOpcode = opstr;
+ string Arch = "dsp";
}
class PseudoDSP<dag outs, dag ins, list<dag> pattern,
- InstrItinClass itin = IIPseudo>:
- MipsPseudo<outs, ins, pattern, itin> {
- let Predicates = [HasDSP];
+ InstrItinClass itin = IIPseudo>
+ : MipsPseudo<outs, ins, pattern, itin>, PredicateControl {
+ let InsnPredicates = [HasDSP];
+}
+
+class DSPInstAlias<string Asm, dag Result, bit Emit = 0b1>
+ : InstAlias<Asm, Result, Emit>, PredicateControl {
+ let InsnPredicates = [HasDSP];
}
// ADDU.QB sub-class format.
diff --git a/lib/Target/Mips/MipsDSPInstrInfo.td b/lib/Target/Mips/MipsDSPInstrInfo.td
index d26838404451..da6f174e2a19 100644
--- a/lib/Target/Mips/MipsDSPInstrInfo.td
+++ b/lib/Target/Mips/MipsDSPInstrInfo.td
@@ -12,9 +12,11 @@
//===----------------------------------------------------------------------===//
// ImmLeaf
+def immZExt1 : ImmLeaf<i32, [{return isUInt<1>(Imm);}]>;
def immZExt2 : ImmLeaf<i32, [{return isUInt<2>(Imm);}]>;
def immZExt3 : ImmLeaf<i32, [{return isUInt<3>(Imm);}]>;
def immZExt4 : ImmLeaf<i32, [{return isUInt<4>(Imm);}]>;
+def immZExt7 : ImmLeaf<i32, [{return isUInt<7>(Imm);}]>;
def immZExt8 : ImmLeaf<i32, [{return isUInt<8>(Imm);}]>;
def immZExt10 : ImmLeaf<i32, [{return isUInt<10>(Imm);}]>;
def immSExt6 : ImmLeaf<i32, [{return isInt<6>(Imm);}]>;
@@ -263,6 +265,7 @@ class ADDU_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))];
InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
}
class RADDU_W_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -273,6 +276,7 @@ class RADDU_W_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string AsmString = !strconcat(instr_asm, "\t$rd, $rs");
list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs))];
InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
}
class CMP_EQ_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -293,6 +297,7 @@ class CMP_EQ_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))];
InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
}
class PRECR_SRA_PH_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -304,6 +309,7 @@ class PRECR_SRA_PH_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
list<dag> Pattern = [(set ROT:$rt, (OpNode ROS:$src, ROS:$rs, immZExt5:$sa))];
InstrItinClass Itinerary = itin;
string Constraints = "$src = $rt";
+ string BaseOpcode = instr_asm;
}
class ABSQ_S_PH_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -314,6 +320,7 @@ class ABSQ_S_PH_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string AsmString = !strconcat(instr_asm, "\t$rd, $rt");
list<dag> Pattern = [(set ROD:$rd, (OpNode ROT:$rt))];
InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
}
class REPL_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -323,6 +330,7 @@ class REPL_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string AsmString = !strconcat(instr_asm, "\t$rd, $imm");
list<dag> Pattern = [(set RO:$rd, (OpNode immPat:$imm))];
InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
}
class SHLL_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -332,17 +340,19 @@ class SHLL_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs_sa");
list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs_sa))];
InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
}
class SHLL_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
SDPatternOperator ImmPat, InstrItinClass itin,
- RegisterOperand RO> {
+ RegisterOperand RO, Operand ImmOpnd> {
dag OutOperandList = (outs RO:$rd);
- dag InOperandList = (ins RO:$rt, uimm16:$rs_sa);
+ dag InOperandList = (ins RO:$rt, ImmOpnd:$rs_sa);
string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs_sa");
list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, ImmPat:$rs_sa))];
InstrItinClass Itinerary = itin;
bit hasSideEffects = 1;
+ string BaseOpcode = instr_asm;
}
class LX_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -353,6 +363,7 @@ class LX_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode iPTR:$base, iPTR:$index))];
InstrItinClass Itinerary = itin;
bit mayLoad = 1;
+ string BaseOpcode = instr_asm;
}
class ADDUH_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -363,17 +374,19 @@ class ADDUH_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))];
InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
}
class APPEND_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
- SDPatternOperator ImmOp, InstrItinClass itin> {
+ Operand ImmOp, SDPatternOperator Imm, InstrItinClass itin> {
dag OutOperandList = (outs GPR32Opnd:$rt);
- dag InOperandList = (ins GPR32Opnd:$rs, uimm5:$sa, GPR32Opnd:$src);
+ dag InOperandList = (ins GPR32Opnd:$rs, ImmOp:$sa, GPR32Opnd:$src);
string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa");
list<dag> Pattern = [(set GPR32Opnd:$rt,
- (OpNode GPR32Opnd:$src, GPR32Opnd:$rs, ImmOp:$sa))];
+ (OpNode GPR32Opnd:$src, GPR32Opnd:$rs, Imm:$sa))];
InstrItinClass Itinerary = itin;
string Constraints = "$src = $rt";
+ string BaseOpcode = instr_asm;
}
class EXTR_W_TY1_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -382,6 +395,7 @@ class EXTR_W_TY1_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
dag InOperandList = (ins ACC64DSPOpnd:$ac, GPR32Opnd:$shift_rs);
string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs");
InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
}
class EXTR_W_TY1_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -390,15 +404,17 @@ class EXTR_W_TY1_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
dag InOperandList = (ins ACC64DSPOpnd:$ac, uimm16:$shift_rs);
string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs");
InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
}
class SHILO_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
dag OutOperandList = (outs ACC64DSPOpnd:$ac);
- dag InOperandList = (ins simm16:$shift, ACC64DSPOpnd:$acin);
+ dag InOperandList = (ins simm6:$shift, ACC64DSPOpnd:$acin);
string AsmString = !strconcat(instr_asm, "\t$ac, $shift");
list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
(OpNode immSExt6:$shift, ACC64DSPOpnd:$acin))];
string Constraints = "$acin = $ac";
+ string BaseOpcode = instr_asm;
}
class SHILO_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
@@ -408,6 +424,7 @@ class SHILO_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
(OpNode GPR32Opnd:$rs, ACC64DSPOpnd:$acin))];
string Constraints = "$acin = $ac";
+ string BaseOpcode = instr_asm;
}
class MTHLIP_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
@@ -417,6 +434,7 @@ class MTHLIP_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
(OpNode GPR32Opnd:$rs, ACC64DSPOpnd:$acin))];
string Constraints = "$acin = $ac";
+ string BaseOpcode = instr_asm;
}
class RDDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -426,15 +444,17 @@ class RDDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string AsmString = !strconcat(instr_asm, "\t$rd, $mask");
list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode immZExt10:$mask))];
InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
}
class WRDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
InstrItinClass itin> {
dag OutOperandList = (outs);
- dag InOperandList = (ins GPR32Opnd:$rs, uimm16:$mask);
+ dag InOperandList = (ins GPR32Opnd:$rs, uimm10:$mask);
string AsmString = !strconcat(instr_asm, "\t$rs, $mask");
list<dag> Pattern = [(OpNode GPR32Opnd:$rs, immZExt10:$mask)];
InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
}
class DPA_W_PH_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
@@ -444,6 +464,7 @@ class DPA_W_PH_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
(OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin))];
string Constraints = "$acin = $ac";
+ string BaseOpcode = instr_asm;
}
class MULT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -454,6 +475,7 @@ class MULT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
list<dag> Pattern = [(set ACC64DSPOpnd:$ac, (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt))];
InstrItinClass Itinerary = itin;
bit isCommutable = 1;
+ string BaseOpcode = instr_asm;
}
class MADD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -465,6 +487,7 @@ class MADD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
(OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin))];
InstrItinClass Itinerary = itin;
string Constraints = "$acin = $ac";
+ string BaseOpcode = instr_asm;
}
class MFHI_DESC_BASE<string instr_asm, RegisterOperand RO, SDNode OpNode,
@@ -474,6 +497,7 @@ class MFHI_DESC_BASE<string instr_asm, RegisterOperand RO, SDNode OpNode,
string AsmString = !strconcat(instr_asm, "\t$rd, $ac");
list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode RO:$ac))];
InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
}
class MTHI_DESC_BASE<string instr_asm, RegisterOperand RO, InstrItinClass itin> {
@@ -481,6 +505,7 @@ class MTHI_DESC_BASE<string instr_asm, RegisterOperand RO, InstrItinClass itin>
dag InOperandList = (ins GPR32Opnd:$rs);
string AsmString = !strconcat(instr_asm, "\t$rs, $ac");
InstrItinClass Itinerary = itin;
+ string BaseOpcode = instr_asm;
}
class BPOSGE32_PSEUDO_DESC_BASE<SDPatternOperator OpNode, InstrItinClass itin> :
@@ -506,6 +531,7 @@ class INSV_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
list<dag> Pattern = [(set GPR32Opnd:$rt, (OpNode GPR32Opnd:$src, GPR32Opnd:$rs))];
InstrItinClass Itinerary = itin;
string Constraints = "$src = $rt";
+ string BaseOpcode = instr_asm;
}
//===----------------------------------------------------------------------===//
@@ -639,7 +665,7 @@ class PRECEU_PH_QBRA_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbra",
// Shift
class SHLL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shll.qb", null_frag, immZExt3,
- NoItinerary, DSPROpnd>,
+ NoItinerary, DSPROpnd, uimm3>,
Defs<[DSPOutFlag22]>;
class SHLLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shllv.qb", int_mips_shll_qb,
@@ -647,13 +673,13 @@ class SHLLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shllv.qb", int_mips_shll_qb,
Defs<[DSPOutFlag22]>;
class SHRL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shrl.qb", null_frag, immZExt3,
- NoItinerary, DSPROpnd>;
+ NoItinerary, DSPROpnd, uimm3>;
class SHRLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.qb", int_mips_shrl_qb,
NoItinerary, DSPROpnd>;
class SHLL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll.ph", null_frag, immZExt4,
- NoItinerary, DSPROpnd>,
+ NoItinerary, DSPROpnd, uimm4>,
Defs<[DSPOutFlag22]>;
class SHLLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv.ph", int_mips_shll_ph,
@@ -661,7 +687,8 @@ class SHLLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv.ph", int_mips_shll_ph,
Defs<[DSPOutFlag22]>;
class SHLL_S_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.ph", int_mips_shll_s_ph,
- immZExt4, NoItinerary, DSPROpnd>,
+ immZExt4, NoItinerary, DSPROpnd,
+ uimm4>,
Defs<[DSPOutFlag22]>;
class SHLLV_S_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.ph", int_mips_shll_s_ph,
@@ -669,19 +696,21 @@ class SHLLV_S_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.ph", int_mips_shll_s_ph,
Defs<[DSPOutFlag22]>;
class SHRA_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra.ph", null_frag, immZExt4,
- NoItinerary, DSPROpnd>;
+ NoItinerary, DSPROpnd, uimm4>;
class SHRAV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav.ph", int_mips_shra_ph,
NoItinerary, DSPROpnd>;
class SHRA_R_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.ph", int_mips_shra_r_ph,
- immZExt4, NoItinerary, DSPROpnd>;
+ immZExt4, NoItinerary, DSPROpnd,
+ uimm4>;
class SHRAV_R_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.ph", int_mips_shra_r_ph,
NoItinerary, DSPROpnd>;
class SHLL_S_W_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.w", int_mips_shll_s_w,
- immZExt5, NoItinerary, GPR32Opnd>,
+ immZExt5, NoItinerary, GPR32Opnd,
+ uimm5>,
Defs<[DSPOutFlag22]>;
class SHLLV_S_W_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.w", int_mips_shll_s_w,
@@ -689,7 +718,8 @@ class SHLLV_S_W_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.w", int_mips_shll_s_w,
Defs<[DSPOutFlag22]>;
class SHRA_R_W_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.w", int_mips_shra_r_w,
- immZExt5, NoItinerary, GPR32Opnd>;
+ immZExt5, NoItinerary, GPR32Opnd,
+ uimm5>;
class SHRAV_R_W_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.w", int_mips_shra_r_w,
NoItinerary, GPR32Opnd>;
@@ -1039,32 +1069,33 @@ class PRECR_SRA_R_PH_W_DESC : PRECR_SRA_PH_W_DESC_BASE<"precr_sra_r.ph.w",
// Shift
class SHRA_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra.qb", null_frag, immZExt3,
- NoItinerary, DSPROpnd>;
+ NoItinerary, DSPROpnd, uimm3>;
class SHRAV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav.qb", int_mips_shra_qb,
NoItinerary, DSPROpnd>;
class SHRA_R_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.qb", int_mips_shra_r_qb,
- immZExt3, NoItinerary, DSPROpnd>;
+ immZExt3, NoItinerary, DSPROpnd,
+ uimm3>;
class SHRAV_R_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.qb", int_mips_shra_r_qb,
NoItinerary, DSPROpnd>;
class SHRL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shrl.ph", null_frag, immZExt4,
- NoItinerary, DSPROpnd>;
+ NoItinerary, DSPROpnd, uimm4>;
class SHRLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.ph", int_mips_shrl_ph,
NoItinerary, DSPROpnd>;
// Misc
-class APPEND_DESC : APPEND_DESC_BASE<"append", int_mips_append, immZExt5,
+class APPEND_DESC : APPEND_DESC_BASE<"append", int_mips_append, uimm5, immZExt5,
NoItinerary>;
-class BALIGN_DESC : APPEND_DESC_BASE<"balign", int_mips_balign, immZExt2,
+class BALIGN_DESC : APPEND_DESC_BASE<"balign", int_mips_balign, uimm2, immZExt2,
NoItinerary>;
-class PREPEND_DESC : APPEND_DESC_BASE<"prepend", int_mips_prepend, immZExt5,
- NoItinerary>;
+class PREPEND_DESC : APPEND_DESC_BASE<"prepend", int_mips_prepend, uimm5,
+ immZExt5, NoItinerary>;
// Pseudos.
def BPOSGE32_PSEUDO : BPOSGE32_PSEUDO_DESC_BASE<int_mips_bposge32,
@@ -1072,80 +1103,80 @@ def BPOSGE32_PSEUDO : BPOSGE32_PSEUDO_DESC_BASE<int_mips_bposge32,
// Instruction defs.
// MIPS DSP Rev 1
-def ADDU_QB : ADDU_QB_ENC, ADDU_QB_DESC;
-def ADDU_S_QB : ADDU_S_QB_ENC, ADDU_S_QB_DESC;
-def SUBU_QB : SUBU_QB_ENC, SUBU_QB_DESC;
-def SUBU_S_QB : SUBU_S_QB_ENC, SUBU_S_QB_DESC;
-def ADDQ_PH : ADDQ_PH_ENC, ADDQ_PH_DESC;
-def ADDQ_S_PH : ADDQ_S_PH_ENC, ADDQ_S_PH_DESC;
-def SUBQ_PH : SUBQ_PH_ENC, SUBQ_PH_DESC;
-def SUBQ_S_PH : SUBQ_S_PH_ENC, SUBQ_S_PH_DESC;
-def ADDQ_S_W : ADDQ_S_W_ENC, ADDQ_S_W_DESC;
-def SUBQ_S_W : SUBQ_S_W_ENC, SUBQ_S_W_DESC;
-def ADDSC : ADDSC_ENC, ADDSC_DESC;
-def ADDWC : ADDWC_ENC, ADDWC_DESC;
+def ADDU_QB : DspMMRel, ADDU_QB_ENC, ADDU_QB_DESC;
+def ADDU_S_QB : DspMMRel, ADDU_S_QB_ENC, ADDU_S_QB_DESC;
+def SUBU_QB : DspMMRel, SUBU_QB_ENC, SUBU_QB_DESC;
+def SUBU_S_QB : DspMMRel, SUBU_S_QB_ENC, SUBU_S_QB_DESC;
+def ADDQ_PH : DspMMRel, ADDQ_PH_ENC, ADDQ_PH_DESC;
+def ADDQ_S_PH : DspMMRel, ADDQ_S_PH_ENC, ADDQ_S_PH_DESC;
+def SUBQ_PH : DspMMRel, SUBQ_PH_ENC, SUBQ_PH_DESC;
+def SUBQ_S_PH : DspMMRel, SUBQ_S_PH_ENC, SUBQ_S_PH_DESC;
+def ADDQ_S_W : DspMMRel, ADDQ_S_W_ENC, ADDQ_S_W_DESC;
+def SUBQ_S_W : DspMMRel, SUBQ_S_W_ENC, SUBQ_S_W_DESC;
+def ADDSC : DspMMRel, ADDSC_ENC, ADDSC_DESC;
+def ADDWC : DspMMRel, ADDWC_ENC, ADDWC_DESC;
def MODSUB : MODSUB_ENC, MODSUB_DESC;
-def RADDU_W_QB : RADDU_W_QB_ENC, RADDU_W_QB_DESC;
-def ABSQ_S_PH : ABSQ_S_PH_ENC, ABSQ_S_PH_DESC;
-def ABSQ_S_W : ABSQ_S_W_ENC, ABSQ_S_W_DESC;
-def PRECRQ_QB_PH : PRECRQ_QB_PH_ENC, PRECRQ_QB_PH_DESC;
-def PRECRQ_PH_W : PRECRQ_PH_W_ENC, PRECRQ_PH_W_DESC;
-def PRECRQ_RS_PH_W : PRECRQ_RS_PH_W_ENC, PRECRQ_RS_PH_W_DESC;
-def PRECRQU_S_QB_PH : PRECRQU_S_QB_PH_ENC, PRECRQU_S_QB_PH_DESC;
-def PRECEQ_W_PHL : PRECEQ_W_PHL_ENC, PRECEQ_W_PHL_DESC;
-def PRECEQ_W_PHR : PRECEQ_W_PHR_ENC, PRECEQ_W_PHR_DESC;
-def PRECEQU_PH_QBL : PRECEQU_PH_QBL_ENC, PRECEQU_PH_QBL_DESC;
-def PRECEQU_PH_QBR : PRECEQU_PH_QBR_ENC, PRECEQU_PH_QBR_DESC;
-def PRECEQU_PH_QBLA : PRECEQU_PH_QBLA_ENC, PRECEQU_PH_QBLA_DESC;
-def PRECEQU_PH_QBRA : PRECEQU_PH_QBRA_ENC, PRECEQU_PH_QBRA_DESC;
-def PRECEU_PH_QBL : PRECEU_PH_QBL_ENC, PRECEU_PH_QBL_DESC;
-def PRECEU_PH_QBR : PRECEU_PH_QBR_ENC, PRECEU_PH_QBR_DESC;
-def PRECEU_PH_QBLA : PRECEU_PH_QBLA_ENC, PRECEU_PH_QBLA_DESC;
-def PRECEU_PH_QBRA : PRECEU_PH_QBRA_ENC, PRECEU_PH_QBRA_DESC;
-def SHLL_QB : SHLL_QB_ENC, SHLL_QB_DESC;
-def SHLLV_QB : SHLLV_QB_ENC, SHLLV_QB_DESC;
-def SHRL_QB : SHRL_QB_ENC, SHRL_QB_DESC;
-def SHRLV_QB : SHRLV_QB_ENC, SHRLV_QB_DESC;
-def SHLL_PH : SHLL_PH_ENC, SHLL_PH_DESC;
-def SHLLV_PH : SHLLV_PH_ENC, SHLLV_PH_DESC;
-def SHLL_S_PH : SHLL_S_PH_ENC, SHLL_S_PH_DESC;
-def SHLLV_S_PH : SHLLV_S_PH_ENC, SHLLV_S_PH_DESC;
-def SHRA_PH : SHRA_PH_ENC, SHRA_PH_DESC;
-def SHRAV_PH : SHRAV_PH_ENC, SHRAV_PH_DESC;
-def SHRA_R_PH : SHRA_R_PH_ENC, SHRA_R_PH_DESC;
-def SHRAV_R_PH : SHRAV_R_PH_ENC, SHRAV_R_PH_DESC;
-def SHLL_S_W : SHLL_S_W_ENC, SHLL_S_W_DESC;
-def SHLLV_S_W : SHLLV_S_W_ENC, SHLLV_S_W_DESC;
-def SHRA_R_W : SHRA_R_W_ENC, SHRA_R_W_DESC;
-def SHRAV_R_W : SHRAV_R_W_ENC, SHRAV_R_W_DESC;
-def MULEU_S_PH_QBL : MULEU_S_PH_QBL_ENC, MULEU_S_PH_QBL_DESC;
-def MULEU_S_PH_QBR : MULEU_S_PH_QBR_ENC, MULEU_S_PH_QBR_DESC;
-def MULEQ_S_W_PHL : MULEQ_S_W_PHL_ENC, MULEQ_S_W_PHL_DESC;
-def MULEQ_S_W_PHR : MULEQ_S_W_PHR_ENC, MULEQ_S_W_PHR_DESC;
-def MULQ_RS_PH : MULQ_RS_PH_ENC, MULQ_RS_PH_DESC;
+def RADDU_W_QB : DspMMRel, RADDU_W_QB_ENC, RADDU_W_QB_DESC;
+def ABSQ_S_PH : DspMMRel, ABSQ_S_PH_ENC, ABSQ_S_PH_DESC;
+def ABSQ_S_W : DspMMRel, ABSQ_S_W_ENC, ABSQ_S_W_DESC;
+def PRECRQ_QB_PH : DspMMRel, PRECRQ_QB_PH_ENC, PRECRQ_QB_PH_DESC;
+def PRECRQ_PH_W : DspMMRel, PRECRQ_PH_W_ENC, PRECRQ_PH_W_DESC;
+def PRECRQ_RS_PH_W : DspMMRel, PRECRQ_RS_PH_W_ENC, PRECRQ_RS_PH_W_DESC;
+def PRECRQU_S_QB_PH : DspMMRel, PRECRQU_S_QB_PH_ENC, PRECRQU_S_QB_PH_DESC;
+def PRECEQ_W_PHL : DspMMRel, PRECEQ_W_PHL_ENC, PRECEQ_W_PHL_DESC;
+def PRECEQ_W_PHR : DspMMRel, PRECEQ_W_PHR_ENC, PRECEQ_W_PHR_DESC;
+def PRECEQU_PH_QBL : DspMMRel, PRECEQU_PH_QBL_ENC, PRECEQU_PH_QBL_DESC;
+def PRECEQU_PH_QBR : DspMMRel, PRECEQU_PH_QBR_ENC, PRECEQU_PH_QBR_DESC;
+def PRECEQU_PH_QBLA : DspMMRel, PRECEQU_PH_QBLA_ENC, PRECEQU_PH_QBLA_DESC;
+def PRECEQU_PH_QBRA : DspMMRel, PRECEQU_PH_QBRA_ENC, PRECEQU_PH_QBRA_DESC;
+def PRECEU_PH_QBL : DspMMRel, PRECEU_PH_QBL_ENC, PRECEU_PH_QBL_DESC;
+def PRECEU_PH_QBR : DspMMRel, PRECEU_PH_QBR_ENC, PRECEU_PH_QBR_DESC;
+def PRECEU_PH_QBLA : DspMMRel, PRECEU_PH_QBLA_ENC, PRECEU_PH_QBLA_DESC;
+def PRECEU_PH_QBRA : DspMMRel, PRECEU_PH_QBRA_ENC, PRECEU_PH_QBRA_DESC;
+def SHLL_QB : DspMMRel, SHLL_QB_ENC, SHLL_QB_DESC;
+def SHLLV_QB : DspMMRel, SHLLV_QB_ENC, SHLLV_QB_DESC;
+def SHRL_QB : DspMMRel, SHRL_QB_ENC, SHRL_QB_DESC;
+def SHRLV_QB : DspMMRel, SHRLV_QB_ENC, SHRLV_QB_DESC;
+def SHLL_PH : DspMMRel, SHLL_PH_ENC, SHLL_PH_DESC;
+def SHLLV_PH : DspMMRel, SHLLV_PH_ENC, SHLLV_PH_DESC;
+def SHLL_S_PH : DspMMRel, SHLL_S_PH_ENC, SHLL_S_PH_DESC;
+def SHLLV_S_PH : DspMMRel, SHLLV_S_PH_ENC, SHLLV_S_PH_DESC;
+def SHRA_PH : DspMMRel, SHRA_PH_ENC, SHRA_PH_DESC;
+def SHRAV_PH : DspMMRel, SHRAV_PH_ENC, SHRAV_PH_DESC;
+def SHRA_R_PH : DspMMRel, SHRA_R_PH_ENC, SHRA_R_PH_DESC;
+def SHRAV_R_PH : DspMMRel, SHRAV_R_PH_ENC, SHRAV_R_PH_DESC;
+def SHLL_S_W : DspMMRel, SHLL_S_W_ENC, SHLL_S_W_DESC;
+def SHLLV_S_W : DspMMRel, SHLLV_S_W_ENC, SHLLV_S_W_DESC;
+def SHRA_R_W : DspMMRel, SHRA_R_W_ENC, SHRA_R_W_DESC;
+def SHRAV_R_W : DspMMRel, SHRAV_R_W_ENC, SHRAV_R_W_DESC;
+def MULEU_S_PH_QBL : DspMMRel, MULEU_S_PH_QBL_ENC, MULEU_S_PH_QBL_DESC;
+def MULEU_S_PH_QBR : DspMMRel, MULEU_S_PH_QBR_ENC, MULEU_S_PH_QBR_DESC;
+def MULEQ_S_W_PHL : DspMMRel, MULEQ_S_W_PHL_ENC, MULEQ_S_W_PHL_DESC;
+def MULEQ_S_W_PHR : DspMMRel, MULEQ_S_W_PHR_ENC, MULEQ_S_W_PHR_DESC;
+def MULQ_RS_PH : DspMMRel, MULQ_RS_PH_ENC, MULQ_RS_PH_DESC;
def MULSAQ_S_W_PH : MULSAQ_S_W_PH_ENC, MULSAQ_S_W_PH_DESC;
-def MAQ_S_W_PHL : MAQ_S_W_PHL_ENC, MAQ_S_W_PHL_DESC;
-def MAQ_S_W_PHR : MAQ_S_W_PHR_ENC, MAQ_S_W_PHR_DESC;
-def MAQ_SA_W_PHL : MAQ_SA_W_PHL_ENC, MAQ_SA_W_PHL_DESC;
-def MAQ_SA_W_PHR : MAQ_SA_W_PHR_ENC, MAQ_SA_W_PHR_DESC;
-def MFHI_DSP : MFHI_ENC, MFHI_DESC;
-def MFLO_DSP : MFLO_ENC, MFLO_DESC;
-def MTHI_DSP : MTHI_ENC, MTHI_DESC;
-def MTLO_DSP : MTLO_ENC, MTLO_DESC;
-def DPAU_H_QBL : DPAU_H_QBL_ENC, DPAU_H_QBL_DESC;
-def DPAU_H_QBR : DPAU_H_QBR_ENC, DPAU_H_QBR_DESC;
-def DPSU_H_QBL : DPSU_H_QBL_ENC, DPSU_H_QBL_DESC;
-def DPSU_H_QBR : DPSU_H_QBR_ENC, DPSU_H_QBR_DESC;
-def DPAQ_S_W_PH : DPAQ_S_W_PH_ENC, DPAQ_S_W_PH_DESC;
-def DPSQ_S_W_PH : DPSQ_S_W_PH_ENC, DPSQ_S_W_PH_DESC;
-def DPAQ_SA_L_W : DPAQ_SA_L_W_ENC, DPAQ_SA_L_W_DESC;
-def DPSQ_SA_L_W : DPSQ_SA_L_W_ENC, DPSQ_SA_L_W_DESC;
-def MULT_DSP : MULT_DSP_ENC, MULT_DSP_DESC;
-def MULTU_DSP : MULTU_DSP_ENC, MULTU_DSP_DESC;
-def MADD_DSP : MADD_DSP_ENC, MADD_DSP_DESC;
-def MADDU_DSP : MADDU_DSP_ENC, MADDU_DSP_DESC;
-def MSUB_DSP : MSUB_DSP_ENC, MSUB_DSP_DESC;
-def MSUBU_DSP : MSUBU_DSP_ENC, MSUBU_DSP_DESC;
+def MAQ_S_W_PHL : DspMMRel, MAQ_S_W_PHL_ENC, MAQ_S_W_PHL_DESC;
+def MAQ_S_W_PHR : DspMMRel, MAQ_S_W_PHR_ENC, MAQ_S_W_PHR_DESC;
+def MAQ_SA_W_PHL : DspMMRel, MAQ_SA_W_PHL_ENC, MAQ_SA_W_PHL_DESC;
+def MAQ_SA_W_PHR : DspMMRel, MAQ_SA_W_PHR_ENC, MAQ_SA_W_PHR_DESC;
+def MFHI_DSP : DspMMRel, MFHI_ENC, MFHI_DESC;
+def MFLO_DSP : DspMMRel, MFLO_ENC, MFLO_DESC;
+def MTHI_DSP : DspMMRel, MTHI_ENC, MTHI_DESC;
+def MTLO_DSP : DspMMRel, MTLO_ENC, MTLO_DESC;
+def DPAU_H_QBL : DspMMRel, DPAU_H_QBL_ENC, DPAU_H_QBL_DESC;
+def DPAU_H_QBR : DspMMRel, DPAU_H_QBR_ENC, DPAU_H_QBR_DESC;
+def DPSU_H_QBL : DspMMRel, DPSU_H_QBL_ENC, DPSU_H_QBL_DESC;
+def DPSU_H_QBR : DspMMRel, DPSU_H_QBR_ENC, DPSU_H_QBR_DESC;
+def DPAQ_S_W_PH : DspMMRel, DPAQ_S_W_PH_ENC, DPAQ_S_W_PH_DESC;
+def DPSQ_S_W_PH : DspMMRel, DPSQ_S_W_PH_ENC, DPSQ_S_W_PH_DESC;
+def DPAQ_SA_L_W : DspMMRel, DPAQ_SA_L_W_ENC, DPAQ_SA_L_W_DESC;
+def DPSQ_SA_L_W : DspMMRel, DPSQ_SA_L_W_ENC, DPSQ_SA_L_W_DESC;
+def MULT_DSP : DspMMRel, MULT_DSP_ENC, MULT_DSP_DESC;
+def MULTU_DSP : DspMMRel, MULTU_DSP_ENC, MULTU_DSP_DESC;
+def MADD_DSP : DspMMRel, MADD_DSP_ENC, MADD_DSP_DESC;
+def MADDU_DSP : DspMMRel, MADDU_DSP_ENC, MADDU_DSP_DESC;
+def MSUB_DSP : DspMMRel, MSUB_DSP_ENC, MSUB_DSP_DESC;
+def MSUBU_DSP : DspMMRel, MSUBU_DSP_ENC, MSUBU_DSP_DESC;
def CMPU_EQ_QB : CMPU_EQ_QB_ENC, CMPU_EQ_QB_DESC;
def CMPU_LT_QB : CMPU_LT_QB_ENC, CMPU_LT_QB_DESC;
def CMPU_LE_QB : CMPU_LE_QB_ENC, CMPU_LE_QB_DESC;
@@ -1156,87 +1187,85 @@ def CMP_EQ_PH : CMP_EQ_PH_ENC, CMP_EQ_PH_DESC;
def CMP_LT_PH : CMP_LT_PH_ENC, CMP_LT_PH_DESC;
def CMP_LE_PH : CMP_LE_PH_ENC, CMP_LE_PH_DESC;
def BITREV : BITREV_ENC, BITREV_DESC;
-def PACKRL_PH : PACKRL_PH_ENC, PACKRL_PH_DESC;
-def REPL_QB : REPL_QB_ENC, REPL_QB_DESC;
-def REPL_PH : REPL_PH_ENC, REPL_PH_DESC;
-def REPLV_QB : REPLV_QB_ENC, REPLV_QB_DESC;
-def REPLV_PH : REPLV_PH_ENC, REPLV_PH_DESC;
-def PICK_QB : PICK_QB_ENC, PICK_QB_DESC;
-def PICK_PH : PICK_PH_ENC, PICK_PH_DESC;
-def LWX : LWX_ENC, LWX_DESC;
-def LHX : LHX_ENC, LHX_DESC;
-def LBUX : LBUX_ENC, LBUX_DESC;
+def PACKRL_PH : DspMMRel, PACKRL_PH_ENC, PACKRL_PH_DESC;
+def REPL_QB : DspMMRel, REPL_QB_ENC, REPL_QB_DESC;
+def REPL_PH : DspMMRel, REPL_PH_ENC, REPL_PH_DESC;
+def REPLV_QB : DspMMRel, REPLV_QB_ENC, REPLV_QB_DESC;
+def REPLV_PH : DspMMRel, REPLV_PH_ENC, REPLV_PH_DESC;
+def PICK_QB : DspMMRel, PICK_QB_ENC, PICK_QB_DESC;
+def PICK_PH : DspMMRel, PICK_PH_ENC, PICK_PH_DESC;
+def LWX : DspMMRel, LWX_ENC, LWX_DESC;
+def LHX : DspMMRel, LHX_ENC, LHX_DESC;
+def LBUX : DspMMRel, LBUX_ENC, LBUX_DESC;
def BPOSGE32 : BPOSGE32_ENC, BPOSGE32_DESC;
-def INSV : INSV_ENC, INSV_DESC;
-def EXTP : EXTP_ENC, EXTP_DESC;
-def EXTPV : EXTPV_ENC, EXTPV_DESC;
-def EXTPDP : EXTPDP_ENC, EXTPDP_DESC;
-def EXTPDPV : EXTPDPV_ENC, EXTPDPV_DESC;
-def EXTR_W : EXTR_W_ENC, EXTR_W_DESC;
-def EXTRV_W : EXTRV_W_ENC, EXTRV_W_DESC;
-def EXTR_R_W : EXTR_R_W_ENC, EXTR_R_W_DESC;
-def EXTRV_R_W : EXTRV_R_W_ENC, EXTRV_R_W_DESC;
-def EXTR_RS_W : EXTR_RS_W_ENC, EXTR_RS_W_DESC;
-def EXTRV_RS_W : EXTRV_RS_W_ENC, EXTRV_RS_W_DESC;
-def EXTR_S_H : EXTR_S_H_ENC, EXTR_S_H_DESC;
-def EXTRV_S_H : EXTRV_S_H_ENC, EXTRV_S_H_DESC;
-def SHILO : SHILO_ENC, SHILO_DESC;
-def SHILOV : SHILOV_ENC, SHILOV_DESC;
-def MTHLIP : MTHLIP_ENC, MTHLIP_DESC;
-def RDDSP : RDDSP_ENC, RDDSP_DESC;
-def WRDSP : WRDSP_ENC, WRDSP_DESC;
+def INSV : DspMMRel, INSV_ENC, INSV_DESC;
+def EXTP : DspMMRel, EXTP_ENC, EXTP_DESC;
+def EXTPV : DspMMRel, EXTPV_ENC, EXTPV_DESC;
+def EXTPDP : DspMMRel, EXTPDP_ENC, EXTPDP_DESC;
+def EXTPDPV : DspMMRel, EXTPDPV_ENC, EXTPDPV_DESC;
+def EXTR_W : DspMMRel, EXTR_W_ENC, EXTR_W_DESC;
+def EXTRV_W : DspMMRel, EXTRV_W_ENC, EXTRV_W_DESC;
+def EXTR_R_W : DspMMRel, EXTR_R_W_ENC, EXTR_R_W_DESC;
+def EXTRV_R_W : DspMMRel, EXTRV_R_W_ENC, EXTRV_R_W_DESC;
+def EXTR_RS_W : DspMMRel, EXTR_RS_W_ENC, EXTR_RS_W_DESC;
+def EXTRV_RS_W : DspMMRel, EXTRV_RS_W_ENC, EXTRV_RS_W_DESC;
+def EXTR_S_H : DspMMRel, EXTR_S_H_ENC, EXTR_S_H_DESC;
+def EXTRV_S_H : DspMMRel, EXTRV_S_H_ENC, EXTRV_S_H_DESC;
+def SHILO : DspMMRel, SHILO_ENC, SHILO_DESC;
+def SHILOV : DspMMRel, SHILOV_ENC, SHILOV_DESC;
+def MTHLIP : DspMMRel, MTHLIP_ENC, MTHLIP_DESC;
+def RDDSP : DspMMRel, RDDSP_ENC, RDDSP_DESC;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def WRDSP : WRDSP_ENC, WRDSP_DESC;
+}
// MIPS DSP Rev 2
-let Predicates = [HasDSPR2] in {
-
-def ADDU_PH : ADDU_PH_ENC, ADDU_PH_DESC;
-def ADDU_S_PH : ADDU_S_PH_ENC, ADDU_S_PH_DESC;
-def SUBU_PH : SUBU_PH_ENC, SUBU_PH_DESC;
-def SUBU_S_PH : SUBU_S_PH_ENC, SUBU_S_PH_DESC;
-def CMPGDU_EQ_QB : CMPGDU_EQ_QB_ENC, CMPGDU_EQ_QB_DESC;
-def CMPGDU_LT_QB : CMPGDU_LT_QB_ENC, CMPGDU_LT_QB_DESC;
-def CMPGDU_LE_QB : CMPGDU_LE_QB_ENC, CMPGDU_LE_QB_DESC;
-def ABSQ_S_QB : ABSQ_S_QB_ENC, ABSQ_S_QB_DESC;
-def ADDUH_QB : ADDUH_QB_ENC, ADDUH_QB_DESC;
-def ADDUH_R_QB : ADDUH_R_QB_ENC, ADDUH_R_QB_DESC;
-def SUBUH_QB : SUBUH_QB_ENC, SUBUH_QB_DESC;
-def SUBUH_R_QB : SUBUH_R_QB_ENC, SUBUH_R_QB_DESC;
-def ADDQH_PH : ADDQH_PH_ENC, ADDQH_PH_DESC;
-def ADDQH_R_PH : ADDQH_R_PH_ENC, ADDQH_R_PH_DESC;
-def SUBQH_PH : SUBQH_PH_ENC, SUBQH_PH_DESC;
-def SUBQH_R_PH : SUBQH_R_PH_ENC, SUBQH_R_PH_DESC;
-def ADDQH_W : ADDQH_W_ENC, ADDQH_W_DESC;
-def ADDQH_R_W : ADDQH_R_W_ENC, ADDQH_R_W_DESC;
-def SUBQH_W : SUBQH_W_ENC, SUBQH_W_DESC;
-def SUBQH_R_W : SUBQH_R_W_ENC, SUBQH_R_W_DESC;
-def MUL_PH : MUL_PH_ENC, MUL_PH_DESC;
-def MUL_S_PH : MUL_S_PH_ENC, MUL_S_PH_DESC;
-def MULQ_S_W : MULQ_S_W_ENC, MULQ_S_W_DESC;
-def MULQ_RS_W : MULQ_RS_W_ENC, MULQ_RS_W_DESC;
-def MULQ_S_PH : MULQ_S_PH_ENC, MULQ_S_PH_DESC;
-def DPA_W_PH : DPA_W_PH_ENC, DPA_W_PH_DESC;
-def DPS_W_PH : DPS_W_PH_ENC, DPS_W_PH_DESC;
-def DPAQX_S_W_PH : DPAQX_S_W_PH_ENC, DPAQX_S_W_PH_DESC;
-def DPAQX_SA_W_PH : DPAQX_SA_W_PH_ENC, DPAQX_SA_W_PH_DESC;
-def DPAX_W_PH : DPAX_W_PH_ENC, DPAX_W_PH_DESC;
-def DPSX_W_PH : DPSX_W_PH_ENC, DPSX_W_PH_DESC;
-def DPSQX_S_W_PH : DPSQX_S_W_PH_ENC, DPSQX_S_W_PH_DESC;
-def DPSQX_SA_W_PH : DPSQX_SA_W_PH_ENC, DPSQX_SA_W_PH_DESC;
-def MULSA_W_PH : MULSA_W_PH_ENC, MULSA_W_PH_DESC;
-def PRECR_QB_PH : PRECR_QB_PH_ENC, PRECR_QB_PH_DESC;
-def PRECR_SRA_PH_W : PRECR_SRA_PH_W_ENC, PRECR_SRA_PH_W_DESC;
-def PRECR_SRA_R_PH_W : PRECR_SRA_R_PH_W_ENC, PRECR_SRA_R_PH_W_DESC;
-def SHRA_QB : SHRA_QB_ENC, SHRA_QB_DESC;
-def SHRAV_QB : SHRAV_QB_ENC, SHRAV_QB_DESC;
-def SHRA_R_QB : SHRA_R_QB_ENC, SHRA_R_QB_DESC;
-def SHRAV_R_QB : SHRAV_R_QB_ENC, SHRAV_R_QB_DESC;
-def SHRL_PH : SHRL_PH_ENC, SHRL_PH_DESC;
-def SHRLV_PH : SHRLV_PH_ENC, SHRLV_PH_DESC;
-def APPEND : APPEND_ENC, APPEND_DESC;
-def BALIGN : BALIGN_ENC, BALIGN_DESC;
-def PREPEND : PREPEND_ENC, PREPEND_DESC;
-
-}
+def ADDU_PH : DspMMRel, ADDU_PH_ENC, ADDU_PH_DESC, ISA_DSPR2;
+def ADDU_S_PH : DspMMRel, ADDU_S_PH_ENC, ADDU_S_PH_DESC, ISA_DSPR2;
+def SUBU_PH : DspMMRel, SUBU_PH_ENC, SUBU_PH_DESC, ISA_DSPR2;
+def SUBU_S_PH : DspMMRel, SUBU_S_PH_ENC, SUBU_S_PH_DESC, ISA_DSPR2;
+def CMPGDU_EQ_QB : CMPGDU_EQ_QB_ENC, CMPGDU_EQ_QB_DESC, ISA_DSPR2;
+def CMPGDU_LT_QB : CMPGDU_LT_QB_ENC, CMPGDU_LT_QB_DESC, ISA_DSPR2;
+def CMPGDU_LE_QB : CMPGDU_LE_QB_ENC, CMPGDU_LE_QB_DESC, ISA_DSPR2;
+def ABSQ_S_QB : DspMMRel, ABSQ_S_QB_ENC, ABSQ_S_QB_DESC, ISA_DSPR2;
+def ADDUH_QB : DspMMRel, ADDUH_QB_ENC, ADDUH_QB_DESC, ISA_DSPR2;
+def ADDUH_R_QB : DspMMRel, ADDUH_R_QB_ENC, ADDUH_R_QB_DESC, ISA_DSPR2;
+def SUBUH_QB : DspMMRel, SUBUH_QB_ENC, SUBUH_QB_DESC, ISA_DSPR2;
+def SUBUH_R_QB : DspMMRel, SUBUH_R_QB_ENC, SUBUH_R_QB_DESC, ISA_DSPR2;
+def ADDQH_PH : DspMMRel, ADDQH_PH_ENC, ADDQH_PH_DESC, ISA_DSPR2;
+def ADDQH_R_PH : DspMMRel, ADDQH_R_PH_ENC, ADDQH_R_PH_DESC, ISA_DSPR2;
+def SUBQH_PH : DspMMRel, SUBQH_PH_ENC, SUBQH_PH_DESC, ISA_DSPR2;
+def SUBQH_R_PH : DspMMRel, SUBQH_R_PH_ENC, SUBQH_R_PH_DESC, ISA_DSPR2;
+def ADDQH_W : DspMMRel, ADDQH_W_ENC, ADDQH_W_DESC, ISA_DSPR2;
+def ADDQH_R_W : DspMMRel, ADDQH_R_W_ENC, ADDQH_R_W_DESC, ISA_DSPR2;
+def SUBQH_W : DspMMRel, SUBQH_W_ENC, SUBQH_W_DESC, ISA_DSPR2;
+def SUBQH_R_W : DspMMRel, SUBQH_R_W_ENC, SUBQH_R_W_DESC, ISA_DSPR2;
+def MUL_PH : DspMMRel, MUL_PH_ENC, MUL_PH_DESC, ISA_DSPR2;
+def MUL_S_PH : DspMMRel, MUL_S_PH_ENC, MUL_S_PH_DESC, ISA_DSPR2;
+def MULQ_S_W : DspMMRel, MULQ_S_W_ENC, MULQ_S_W_DESC, ISA_DSPR2;
+def MULQ_RS_W : DspMMRel, MULQ_RS_W_ENC, MULQ_RS_W_DESC, ISA_DSPR2;
+def MULQ_S_PH : DspMMRel, MULQ_S_PH_ENC, MULQ_S_PH_DESC, ISA_DSPR2;
+def DPA_W_PH : DspMMRel, DPA_W_PH_ENC, DPA_W_PH_DESC, ISA_DSPR2;
+def DPS_W_PH : DspMMRel, DPS_W_PH_ENC, DPS_W_PH_DESC, ISA_DSPR2;
+def DPAQX_S_W_PH : DspMMRel, DPAQX_S_W_PH_ENC, DPAQX_S_W_PH_DESC, ISA_DSPR2;
+def DPAQX_SA_W_PH : DspMMRel, DPAQX_SA_W_PH_ENC, DPAQX_SA_W_PH_DESC, ISA_DSPR2;
+def DPAX_W_PH : DspMMRel, DPAX_W_PH_ENC, DPAX_W_PH_DESC, ISA_DSPR2;
+def DPSX_W_PH : DspMMRel, DPSX_W_PH_ENC, DPSX_W_PH_DESC, ISA_DSPR2;
+def DPSQX_S_W_PH : DspMMRel, DPSQX_S_W_PH_ENC, DPSQX_S_W_PH_DESC, ISA_DSPR2;
+def DPSQX_SA_W_PH : DspMMRel, DPSQX_SA_W_PH_ENC, DPSQX_SA_W_PH_DESC, ISA_DSPR2;
+def MULSA_W_PH : MULSA_W_PH_ENC, MULSA_W_PH_DESC, ISA_DSPR2;
+def PRECR_QB_PH : DspMMRel, PRECR_QB_PH_ENC, PRECR_QB_PH_DESC, ISA_DSPR2;
+def PRECR_SRA_PH_W : DspMMRel, PRECR_SRA_PH_W_ENC, PRECR_SRA_PH_W_DESC,
+                     ISA_DSPR2;
+def PRECR_SRA_R_PH_W : DspMMRel, PRECR_SRA_R_PH_W_ENC, PRECR_SRA_R_PH_W_DESC,
+                       ISA_DSPR2;
+def SHRA_QB : DspMMRel, SHRA_QB_ENC, SHRA_QB_DESC, ISA_DSPR2;
+def SHRAV_QB : DspMMRel, SHRAV_QB_ENC, SHRAV_QB_DESC, ISA_DSPR2;
+def SHRA_R_QB : DspMMRel, SHRA_R_QB_ENC, SHRA_R_QB_DESC, ISA_DSPR2;
+def SHRAV_R_QB : DspMMRel, SHRAV_R_QB_ENC, SHRAV_R_QB_DESC, ISA_DSPR2;
+def SHRL_PH : DspMMRel, SHRL_PH_ENC, SHRL_PH_DESC, ISA_DSPR2;
+def SHRLV_PH : DspMMRel, SHRLV_PH_ENC, SHRLV_PH_DESC, ISA_DSPR2;
+def APPEND : APPEND_ENC, APPEND_DESC, ISA_DSPR2;
+def BALIGN : BALIGN_ENC, BALIGN_DESC, ISA_DSPR2;
+def PREPEND : DspMMRel, PREPEND_ENC, PREPEND_DESC, ISA_DSPR2;
// Pseudos.
let isPseudo = 1, isCodeGenOnly = 1 in {
@@ -1415,3 +1444,8 @@ let AddedComplexity = 20 in {
def : IndexedLoadPat<sextloadi16, LHX>;
def : IndexedLoadPat<load, LWX>;
}
+
+// Instruction alias.
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : DSPInstAlias<"wrdsp $rt", (WRDSP GPR32Opnd:$rt, 0x1F), 1>;
+}
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index 4faeb3321621..8313d909df2a 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -355,9 +355,8 @@ void RegDefsUses::addLiveOut(const MachineBasicBlock &MBB,
for (MachineBasicBlock::const_succ_iterator SI = MBB.succ_begin(),
SE = MBB.succ_end(); SI != SE; ++SI)
if (*SI != &SuccBB)
- for (MachineBasicBlock::livein_iterator LI = (*SI)->livein_begin(),
- LE = (*SI)->livein_end(); LI != LE; ++LI)
- Uses.set(*LI);
+ for (const auto &LI : (*SI)->liveins())
+ Uses.set(LI.PhysReg);
}
bool RegDefsUses::update(const MachineInstr &MI, unsigned Begin, unsigned End) {
@@ -431,7 +430,7 @@ bool LoadFromStackOrConst::hasHazard_(const MachineInstr &MI) {
(*MI.memoperands_begin())->getPseudoValue()) {
if (isa<FixedStackPseudoSourceValue>(PSV))
return false;
- return !PSV->isConstant(nullptr) && PSV != PseudoSourceValue::getStack();
+ return !PSV->isConstant(nullptr) && !PSV->isStack();
}
return true;
@@ -598,7 +597,7 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
// Get instruction with delay slot.
MachineBasicBlock::instr_iterator DSI(I);
- if (InMicroMipsMode && TII->GetInstSizeInBytes(std::next(DSI)) == 2 &&
+ if (InMicroMipsMode && TII->GetInstSizeInBytes(&*std::next(DSI)) == 2 &&
DSI->isCall()) {
// If the instruction in the delay slot is 16b, change the opcode to the
// corresponding instruction with a short delay slot.
@@ -713,8 +712,9 @@ bool Filler::searchBackward(MachineBasicBlock &MBB, Iter Slot) const {
if (DisableBackwardSearch)
return false;
- RegDefsUses RegDU(*MBB.getParent()->getSubtarget().getRegisterInfo());
- MemDefsUses MemDU(*TM.getDataLayout(), MBB.getParent()->getFrameInfo());
+ auto *Fn = MBB.getParent();
+ RegDefsUses RegDU(*Fn->getSubtarget().getRegisterInfo());
+ MemDefsUses MemDU(Fn->getDataLayout(), Fn->getFrameInfo());
ReverseIter Filler;
RegDU.init(*Slot);
@@ -763,6 +763,7 @@ bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const {
BB2BrMap BrMap;
std::unique_ptr<InspectMemInstr> IM;
Iter Filler;
+ auto *Fn = MBB.getParent();
// Iterate over SuccBB's predecessor list.
for (MachineBasicBlock::pred_iterator PI = SuccBB->pred_begin(),
@@ -772,15 +773,15 @@ bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const {
// Do not allow moving instructions which have unallocatable register operands
// across basic block boundaries.
- RegDU.setUnallocatableRegs(*MBB.getParent());
+ RegDU.setUnallocatableRegs(*Fn);
// Only allow moving loads from stack or constants if any of the SuccBB's
// predecessors have multiple successors.
if (HasMultipleSuccs) {
IM.reset(new LoadFromStackOrConst());
} else {
- const MachineFrameInfo *MFI = MBB.getParent()->getFrameInfo();
- IM.reset(new MemDefsUses(*TM.getDataLayout(), MFI));
+ const MachineFrameInfo *MFI = Fn->getFrameInfo();
+ IM.reset(new MemDefsUses(Fn->getDataLayout(), MFI));
}
if (!searchRange(MBB, SuccBB->begin(), SuccBB->end(), RegDU, *IM, Slot,
@@ -800,12 +801,13 @@ MachineBasicBlock *Filler::selectSuccBB(MachineBasicBlock &B) const {
// Select the successor with the largest edge weight.
auto &Prob = getAnalysis<MachineBranchProbabilityInfo>();
- MachineBasicBlock *S = *std::max_element(B.succ_begin(), B.succ_end(),
- [&](const MachineBasicBlock *Dst0,
- const MachineBasicBlock *Dst1) {
- return Prob.getEdgeWeight(&B, Dst0) < Prob.getEdgeWeight(&B, Dst1);
- });
- return S->isLandingPad() ? nullptr : S;
+ MachineBasicBlock *S = *std::max_element(
+ B.succ_begin(), B.succ_end(),
+ [&](const MachineBasicBlock *Dst0, const MachineBasicBlock *Dst1) {
+ return Prob.getEdgeProbability(&B, Dst0) <
+ Prob.getEdgeProbability(&B, Dst1);
+ });
+ return S->isEHPad() ? nullptr : S;
}
std::pair<MipsInstrInfo::BranchType, MachineInstr *>
diff --git a/lib/Target/Mips/MipsEVAInstrFormats.td b/lib/Target/Mips/MipsEVAInstrFormats.td
new file mode 100644
index 000000000000..11e191ad6d82
--- /dev/null
+++ b/lib/Target/Mips/MipsEVAInstrFormats.td
@@ -0,0 +1,84 @@
+//===- MipsEVAInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips EVA ASE instruction formats.
+//
+//===----------------------------------------------------------------------===//
+
+class MipsEVAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>,
+ PredicateControl, StdArch {
+ let DecoderNamespace = "Mips";
+ let EncodingPredicates = [HasStdEnc];
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Field Values
+//
+//===----------------------------------------------------------------------===//
+
+// Memory Load/Store EVA
+def OPCODE6_LBE : OPCODE6<0b101100>;
+def OPCODE6_LBuE : OPCODE6<0b101000>;
+def OPCODE6_LHE : OPCODE6<0b101101>;
+def OPCODE6_LHuE : OPCODE6<0b101001>;
+def OPCODE6_LWE : OPCODE6<0b101111>;
+
+def OPCODE6_SBE : OPCODE6<0b011100>;
+def OPCODE6_SHE : OPCODE6<0b011101>;
+def OPCODE6_SWE : OPCODE6<0b011111>;
+
+// load/store left/right EVA
+def OPCODE6_LWLE : OPCODE6<0b011001>;
+def OPCODE6_LWRE : OPCODE6<0b011010>;
+def OPCODE6_SWLE : OPCODE6<0b100001>;
+def OPCODE6_SWRE : OPCODE6<0b100010>;
+
+// Load-linked EVA, Store-conditional EVA
+def OPCODE6_LLE : OPCODE6<0b101110>;
+def OPCODE6_SCE : OPCODE6<0b011110>;
+
+def OPCODE6_TLBINV : OPCODE6<0b000011>;
+def OPCODE6_TLBINVF : OPCODE6<0b000100>;
+
+def OPCODE6_CACHEE : OPCODE6<0b011011>;
+def OPCODE6_PREFE : OPCODE6<0b100011>;
+
+def OPGROUP_COP0 : OPGROUP<0b010000>;
+
+//===----------------------------------------------------------------------===//
+//
+// Encoding Formats
+//
+//===----------------------------------------------------------------------===//
+
+class SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6 Operation> : MipsEVAInst {
+ bits<21> addr;
+ bits<5> hint;
+ bits<5> base = addr{20-16};
+ bits<9> offset = addr{8-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+ let Inst{25-21} = base;
+ let Inst{20-16} = hint;
+ let Inst{15-7} = offset;
+ let Inst{6} = 0;
+ let Inst{5-0} = Operation.Value;
+}
+
+class TLB_FM<OPCODE6 Operation> : MipsEVAInst {
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_COP0.Value;
+ let Inst{25} = 1; // CO
+ let Inst{24-6} = 0;
+ let Inst{5-0} = Operation.Value;
+}
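As a rough sketch of the SPECIAL3_EVA_LOAD_STORE_FM layout above (illustrative C++, not part of the patch; OPGROUP_SPECIAL3 is defined elsewhere in the backend and is assumed here to be the standard SPECIAL3 major opcode 0b011111):

#include <cstdint>

// Mirrors Inst{31-26}=OPGROUP_SPECIAL3, {25-21}=base, {20-16}=hint,
// {15-7}=offset, {6}=0, {5-0}=Operation.Value.
uint32_t encodeEVALoadStore(uint32_t Base, uint32_t Hint, uint32_t Offset9,
                            uint32_t Funct6) {
  const uint32_t OpGroupSpecial3 = 0b011111;
  uint32_t Inst = 0;
  Inst |= (OpGroupSpecial3 & 0x3F) << 26;
  Inst |= (Base & 0x1F) << 21;
  Inst |= (Hint & 0x1F) << 16;
  Inst |= (Offset9 & 0x1FF) << 7;
  Inst |= (Funct6 & 0x3F);
  return Inst;
}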
diff --git a/lib/Target/Mips/MipsEVAInstrInfo.td b/lib/Target/Mips/MipsEVAInstrInfo.td
new file mode 100644
index 000000000000..36c9694cbadd
--- /dev/null
+++ b/lib/Target/Mips/MipsEVAInstrInfo.td
@@ -0,0 +1,192 @@
+//===- MipsEVAInstrInfo.td - EVA ASE instructions --------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips EVA ASE instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction encodings
+//
+//===----------------------------------------------------------------------===//
+
+// Memory Load/Store EVA encodings
+class LBE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LBE>;
+class LBuE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LBuE>;
+class LHE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LHE>;
+class LHuE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LHuE>;
+class LWE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LWE>;
+
+class SBE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SBE>;
+class SHE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SHE>;
+class SWE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SWE>;
+
+// load/store left/right EVA encodings
+class LWLE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LWLE>;
+class LWRE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LWRE>;
+class SWLE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SWLE>;
+class SWRE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SWRE>;
+
+// Load-linked EVA, Store-conditional EVA encodings
+class LLE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LLE>;
+class SCE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SCE>;
+
+class TLBINV_ENC : TLB_FM<OPCODE6_TLBINV>;
+class TLBINVF_ENC : TLB_FM<OPCODE6_TLBINVF>;
+
+class CACHEE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_CACHEE>;
+class PREFE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_PREFE>;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction descriptions
+//
+//===----------------------------------------------------------------------===//
+
+// Memory Load/Store EVA descriptions
+class LOAD_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+ dag OutOperandList = (outs GPROpnd:$rt);
+ dag InOperandList = (ins mem_simm9:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ list<dag> Pattern = [];
+ string DecoderMethod = "DecodeMemEVA";
+ bit canFoldAsLoad = 1;
+ bit mayLoad = 1;
+}
+
+class LBE_DESC : LOAD_EVA_DESC_BASE<"lbe", GPR32Opnd>;
+class LBuE_DESC : LOAD_EVA_DESC_BASE<"lbue", GPR32Opnd>;
+class LHE_DESC : LOAD_EVA_DESC_BASE<"lhe", GPR32Opnd>;
+class LHuE_DESC : LOAD_EVA_DESC_BASE<"lhue", GPR32Opnd>;
+class LWE_DESC : LOAD_EVA_DESC_BASE<"lwe", GPR32Opnd>;
+
+class STORE_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ SDPatternOperator OpNode = null_frag> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ list<dag> Pattern = [];
+ string DecoderMethod = "DecodeMemEVA";
+ bit mayStore = 1;
+}
+
+class SBE_DESC : STORE_EVA_DESC_BASE<"sbe", GPR32Opnd>;
+class SHE_DESC : STORE_EVA_DESC_BASE<"she", GPR32Opnd>;
+class SWE_DESC : STORE_EVA_DESC_BASE<"swe", GPR32Opnd>;
+
+// Load/Store Left/Right EVA descriptions
+class LOAD_LEFT_RIGHT_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+ dag OutOperandList = (outs GPROpnd:$rt);
+ dag InOperandList = (ins mem_simm9:$addr, GPROpnd:$src);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ list<dag> Pattern = [];
+ string DecoderMethod = "DecodeMemEVA";
+ string Constraints = "$src = $rt";
+ bit canFoldAsLoad = 1;
+}
+
+class LWLE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"lwle", GPR32Opnd>;
+class LWRE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"lwre", GPR32Opnd>;
+
+class STORE_LEFT_RIGHT_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ list<dag> Pattern = [];
+ string DecoderMethod = "DecodeMemEVA";
+}
+
+class SWLE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"swle", GPR32Opnd>;
+class SWRE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"swre", GPR32Opnd>;
+
+// Load-linked EVA, Store-conditional EVA descriptions
+class LLE_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+ dag OutOperandList = (outs GPROpnd:$rt);
+ dag InOperandList = (ins mem_simm9:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ list<dag> Pattern = [];
+ bit mayLoad = 1;
+ string DecoderMethod = "DecodeMemEVA";
+}
+
+class LLE_DESC : LLE_DESC_BASE<"lle", GPR32Opnd>;
+
+class SCE_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+ dag OutOperandList = (outs GPROpnd:$dst);
+ dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ list<dag> Pattern = [];
+ bit mayStore = 1;
+ string Constraints = "$rt = $dst";
+ string DecoderMethod = "DecodeMemEVA";
+}
+
+class SCE_DESC : SCE_DESC_BASE<"sce", GPR32Opnd>;
+
+class TLB_DESC_BASE<string instr_asm> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins);
+ string AsmString = instr_asm;
+ list<dag> Pattern = [];
+}
+
+class TLBINV_DESC : TLB_DESC_BASE<"tlbinv">;
+class TLBINVF_DESC : TLB_DESC_BASE<"tlbinvf">;
+
+class CACHEE_DESC_BASE<string instr_asm, Operand MemOpnd> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins MemOpnd:$addr, uimm5:$hint);
+ string AsmString = !strconcat(instr_asm, "\t$hint, $addr");
+ list<dag> Pattern = [];
+ string DecoderMethod = "DecodeCacheeOp_CacheOpR6";
+}
+
+class CACHEE_DESC : CACHEE_DESC_BASE<"cachee", mem>;
+class PREFE_DESC : CACHEE_DESC_BASE<"prefe", mem>;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction definitions
+//
+//===----------------------------------------------------------------------===//
+
+/// Load and Store EVA Instructions
+def LBE : LBE_ENC, LBE_DESC, INSN_EVA;
+def LBuE : LBuE_ENC, LBuE_DESC, INSN_EVA;
+def LHE : LHE_ENC, LHE_DESC, INSN_EVA;
+def LHuE : LHuE_ENC, LHuE_DESC, INSN_EVA;
+let AdditionalPredicates = [NotInMicroMips] in {
+def LWE : LWE_ENC, LWE_DESC, INSN_EVA;
+}
+def SBE : SBE_ENC, SBE_DESC, INSN_EVA;
+def SHE : SHE_ENC, SHE_DESC, INSN_EVA;
+let AdditionalPredicates = [NotInMicroMips] in {
+def SWE : SWE_ENC, SWE_DESC, INSN_EVA;
+}
+
+/// load/store left/right EVA
+let AdditionalPredicates = [NotInMicroMips] in {
+def LWLE : LWLE_ENC, LWLE_DESC, INSN_EVA_NOT_32R6_64R6;
+def LWRE : LWRE_ENC, LWRE_DESC, INSN_EVA_NOT_32R6_64R6;
+def SWLE : SWLE_ENC, SWLE_DESC, INSN_EVA_NOT_32R6_64R6;
+def SWRE : SWRE_ENC, SWRE_DESC, INSN_EVA_NOT_32R6_64R6;
+}
+
+/// Load-linked EVA, Store-conditional EVA
+let AdditionalPredicates = [NotInMicroMips] in {
+def LLE : LLE_ENC, LLE_DESC, INSN_EVA;
+def SCE : SCE_ENC, SCE_DESC, INSN_EVA;
+}
+
+def TLBINV : TLBINV_ENC, TLBINV_DESC, INSN_EVA;
+def TLBINVF : TLBINVF_ENC, TLBINVF_DESC, INSN_EVA;
+
+def CACHEE : CACHEE_ENC, CACHEE_DESC, INSN_EVA;
+def PREFE : PREFE_ENC, PREFE_DESC, INSN_EVA;
diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp
index 5152a072b3a2..e9eaf810637a 100644
--- a/lib/Target/Mips/MipsFastISel.cpp
+++ b/lib/Target/Mips/MipsFastISel.cpp
@@ -192,10 +192,10 @@ public:
TII(*Subtarget->getInstrInfo()), TLI(*Subtarget->getTargetLowering()) {
MFI = funcInfo.MF->getInfo<MipsFunctionInfo>();
Context = &funcInfo.Fn->getContext();
+ bool ISASupported = !Subtarget->hasMips32r6() && Subtarget->hasMips32();
TargetSupported =
- ((TM.getRelocationModel() == Reloc::PIC_) &&
- ((Subtarget->hasMips32r2() || Subtarget->hasMips32()) &&
- (static_cast<const MipsTargetMachine &>(TM).getABI().IsO32())));
+ ISASupported && (TM.getRelocationModel() == Reloc::PIC_) &&
+ (static_cast<const MipsTargetMachine &>(TM).getABI().IsO32());
UnsupportedFPMode = Subtarget->isFP64bit();
}
@@ -236,32 +236,36 @@ unsigned MipsFastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
std::swap(LHS, RHS);
unsigned Opc;
- if (ISDOpc == ISD::AND) {
+ switch (ISDOpc) {
+ case ISD::AND:
Opc = Mips::AND;
- } else if (ISDOpc == ISD::OR) {
+ break;
+ case ISD::OR:
Opc = Mips::OR;
- } else if (ISDOpc == ISD::XOR) {
+ break;
+ case ISD::XOR:
Opc = Mips::XOR;
- } else
+ break;
+ default:
llvm_unreachable("unexpected opcode");
+ }
unsigned LHSReg = getRegForValue(LHS);
- unsigned ResultReg = createResultReg(&Mips::GPR32RegClass);
- if (!ResultReg)
- return 0;
-
- unsigned RHSReg;
if (!LHSReg)
return 0;
+ unsigned RHSReg;
if (const auto *C = dyn_cast<ConstantInt>(RHS))
RHSReg = materializeInt(C, MVT::i32);
else
RHSReg = getRegForValue(RHS);
-
if (!RHSReg)
return 0;
+ unsigned ResultReg = createResultReg(&Mips::GPR32RegClass);
+ if (!ResultReg)
+ return 0;
+
emitInst(Opc, ResultReg).addReg(LHSReg).addReg(RHSReg);
return ResultReg;
}
@@ -747,7 +751,7 @@ bool MipsFastISel::emitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
unsigned Offset = Addr.getOffset();
MachineFrameInfo &MFI = *MF->getFrameInfo();
MachineMemOperand *MMO = MF->getMachineMemOperand(
- MachinePointerInfo::getFixedStack(FI), MachineMemOperand::MOLoad,
+ MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad,
MFI.getObjectSize(FI), Align);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addFrameIndex(FI)
@@ -798,7 +802,7 @@ bool MipsFastISel::emitStore(MVT VT, unsigned SrcReg, Address &Addr,
unsigned Offset = Addr.getOffset();
MachineFrameInfo &MFI = *MF->getFrameInfo();
MachineMemOperand *MMO = MF->getMachineMemOperand(
- MachinePointerInfo::getFixedStack(FI), MachineMemOperand::MOLoad,
+ MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad,
MFI.getObjectSize(FI), Align);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
.addReg(SrcReg)
@@ -912,8 +916,7 @@ bool MipsFastISel::selectBranch(const Instruction *I) {
BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::BGTZ))
.addReg(CondReg)
.addMBB(TBB);
- fastEmitBranch(FBB, DbgLoc);
- FuncInfo.MBB->addSuccessor(TBB);
+ finishCondBranch(BI->getParent(), TBB, FBB);
return true;
}
return false;
@@ -1057,22 +1060,16 @@ bool MipsFastISel::selectFPToInt(const Instruction *I, bool IsSigned) {
// entirely within FPRs.
unsigned DestReg = createResultReg(&Mips::GPR32RegClass);
unsigned TempReg = createResultReg(&Mips::FGR32RegClass);
- unsigned Opc;
-
- if (SrcVT == MVT::f32)
- Opc = Mips::TRUNC_W_S;
- else
- Opc = Mips::TRUNC_W_D32;
+ unsigned Opc = (SrcVT == MVT::f32) ? Mips::TRUNC_W_S : Mips::TRUNC_W_D32;
// Generate the convert.
emitInst(Opc, TempReg).addReg(SrcReg);
-
emitInst(Mips::MFC1, DestReg).addReg(TempReg);
updateValueMap(I, DestReg);
return true;
}
-//
+
bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI,
SmallVectorImpl<MVT> &OutVTs,
unsigned &NumBytes) {
@@ -1196,7 +1193,7 @@ bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI,
unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
- MachinePointerInfo::getStack(Addr.getOffset()),
+ MachinePointerInfo::getStack(*FuncInfo.MF, Addr.getOffset()),
MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
(void)(MMO);
// if (!emitStore(ArgVT, ArgReg, Addr, MMO))
@@ -1607,19 +1604,23 @@ bool MipsFastISel::emitIntSExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
bool MipsFastISel::emitIntZExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
unsigned DestReg) {
+ int64_t Imm;
+
switch (SrcVT.SimpleTy) {
default:
return false;
case MVT::i1:
- emitInst(Mips::ANDi, DestReg).addReg(SrcReg).addImm(1);
+ Imm = 1;
break;
case MVT::i8:
- emitInst(Mips::ANDi, DestReg).addReg(SrcReg).addImm(0xff);
+ Imm = 0xff;
break;
case MVT::i16:
- emitInst(Mips::ANDi, DestReg).addReg(SrcReg).addImm(0xffff);
+ Imm = 0xffff;
break;
}
+
+ emitInst(Mips::ANDi, DestReg).addReg(SrcReg).addImm(Imm);
return true;
}
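The factored-out ANDi above zero-extends by masking; a minimal sketch of the equivalent computation (the 16-bit ANDi immediate covers all three masks):

#include <cstdint>

uint32_t zeroExtendByMask(uint32_t Src, unsigned SrcBits) {
  // SrcBits is 1, 8 or 16, matching the MVT::i1/i8/i16 cases above.
  uint32_t Imm = (SrcBits == 1) ? 0x1u : (SrcBits == 8) ? 0xffu : 0xffffu;
  return Src & Imm;
}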
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index fab2fdfef8cf..6756c1702f76 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -117,6 +117,7 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
case MipsISD::GPRel: return "MipsISD::GPRel";
case MipsISD::ThreadPointer: return "MipsISD::ThreadPointer";
case MipsISD::Ret: return "MipsISD::Ret";
+ case MipsISD::ERet: return "MipsISD::ERet";
case MipsISD::EH_RETURN: return "MipsISD::EH_RETURN";
case MipsISD::FPBrcond: return "MipsISD::FPBrcond";
case MipsISD::FPCmp: return "MipsISD::FPCmp";
@@ -390,10 +391,10 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand);
- setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand);
- setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);
- setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
+ if (!Subtarget.isGP64bit()) {
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand);
+ setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
+ }
setInsertFencesForAtomic(true);
@@ -437,9 +438,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
setStackPointerRegisterToSaveRestore(ABI.IsN64() ? Mips::SP_64 : Mips::SP);
- setExceptionPointerRegister(ABI.IsN64() ? Mips::A0_64 : Mips::A0);
- setExceptionSelectorRegister(ABI.IsN64() ? Mips::A1_64 : Mips::A1);
-
MaxStoresPerMemcpy = 16;
isMicroMips = Subtarget.inMicroMipsMode();
@@ -836,6 +834,14 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
return SDValue();
}
+bool MipsTargetLowering::isCheapToSpeculateCttz() const {
+ return Subtarget.hasMips32();
+}
+
+bool MipsTargetLowering::isCheapToSpeculateCtlz() const {
+ return Subtarget.hasMips32();
+}
+
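Presumably these hooks return true from MIPS32 onward because the ISA gains a native clz, from which ctlz is direct and cttz can be derived cheaply; a minimal sketch of that derivation (an assumption about the rationale, not stated in the patch):

#include <cstdint>

unsigned cttz32(uint32_t X) {
  if (X == 0)
    return 32;
  // Isolate the lowest set bit, then convert its leading-zero count into a
  // trailing-zero count.
  return 31 - __builtin_clz(X & -X);
}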
void
MipsTargetLowering::LowerOperationWrapper(SDNode *N,
SmallVectorImpl<SDValue> &Results,
@@ -1092,8 +1098,7 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineFunction::iterator It = BB;
- ++It;
+ MachineFunction::iterator It = ++BB->getIterator();
MF->insert(It, loopMBB);
MF->insert(It, exitMBB);
@@ -1204,8 +1209,7 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineFunction::iterator It = BB;
- ++It;
+ MachineFunction::iterator It = ++BB->getIterator();
MF->insert(It, loopMBB);
MF->insert(It, sinkMBB);
MF->insert(It, exitMBB);
@@ -1330,15 +1334,20 @@ MachineBasicBlock * MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
DebugLoc DL = MI->getDebugLoc();
unsigned LL, SC, ZERO, BNE, BEQ;
- if (Size == 4) {
- LL = isMicroMips ? Mips::LL_MM : Mips::LL;
- SC = isMicroMips ? Mips::SC_MM : Mips::SC;
+ if (Size == 4) {
+ if (isMicroMips) {
+ LL = Mips::LL_MM;
+ SC = Mips::SC_MM;
+ } else {
+ LL = Subtarget.hasMips32r6() ? Mips::LL_R6 : Mips::LL;
+ SC = Subtarget.hasMips32r6() ? Mips::SC_R6 : Mips::SC;
+ }
ZERO = Mips::ZERO;
BNE = Mips::BNE;
BEQ = Mips::BEQ;
} else {
- LL = Mips::LLD;
- SC = Mips::SCD;
+ LL = Subtarget.hasMips64r6() ? Mips::LLD_R6 : Mips::LLD;
+ SC = Subtarget.hasMips64r6() ? Mips::SCD_R6 : Mips::SCD;
ZERO = Mips::ZERO_64;
BNE = Mips::BNE64;
BEQ = Mips::BEQ64;
@@ -1356,8 +1365,7 @@ MachineBasicBlock * MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineFunction::iterator It = BB;
- ++It;
+ MachineFunction::iterator It = ++BB->getIterator();
MF->insert(It, loop1MBB);
MF->insert(It, loop2MBB);
MF->insert(It, exitMBB);
@@ -1440,8 +1448,7 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineFunction::iterator It = BB;
- ++It;
+ MachineFunction::iterator It = ++BB->getIterator();
MF->insert(It, loop1MBB);
MF->insert(It, loop2MBB);
MF->insert(It, sinkMBB);
@@ -1586,9 +1593,10 @@ SDValue MipsTargetLowering::lowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
SDValue Addr = DAG.getNode(ISD::ADD, DL, PTy, Index, Table);
EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), EntrySize * 8);
- Addr = DAG.getExtLoad(ISD::SEXTLOAD, DL, PTy, Chain, Addr,
- MachinePointerInfo::getJumpTable(), MemVT, false, false,
- false, 0);
+ Addr =
+ DAG.getExtLoad(ISD::SEXTLOAD, DL, PTy, Chain, Addr,
+ MachinePointerInfo::getJumpTable(DAG.getMachineFunction()),
+ MemVT, false, false, false, 0);
Chain = Addr.getValue(1);
if ((getTargetMachine().getRelocationModel() == Reloc::PIC_) || ABI.IsN64()) {
@@ -1690,14 +1698,15 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
if (LargeGOT)
- return getAddrGlobalLargeGOT(N, SDLoc(N), Ty, DAG, MipsII::MO_GOT_HI16,
- MipsII::MO_GOT_LO16, DAG.getEntryNode(),
- MachinePointerInfo::getGOT());
+ return getAddrGlobalLargeGOT(
+ N, SDLoc(N), Ty, DAG, MipsII::MO_GOT_HI16, MipsII::MO_GOT_LO16,
+ DAG.getEntryNode(),
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
- return getAddrGlobal(N, SDLoc(N), Ty, DAG,
- (ABI.IsN32() || ABI.IsN64()) ? MipsII::MO_GOT_DISP
- : MipsII::MO_GOT16,
- DAG.getEntryNode(), MachinePointerInfo::getGOT());
+ return getAddrGlobal(
+ N, SDLoc(N), Ty, DAG,
+ (ABI.IsN32() || ABI.IsN64()) ? MipsII::MO_GOT_DISP : MipsII::MO_GOT16,
+ DAG.getEntryNode(), MachinePointerInfo::getGOT(DAG.getMachineFunction()));
}
SDValue MipsTargetLowering::lowerBlockAddress(SDValue Op,
@@ -1719,6 +1728,9 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
// Local Exec TLS Model.
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+ if (DAG.getTarget().Options.EmulatedTLS)
+ return LowerToTLSEmulatedModel(GA, DAG);
+
SDLoc DL(GA);
const GlobalValue *GV = GA->getGlobal();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
@@ -1813,7 +1825,8 @@ lowerConstantPool(SDValue Op, SelectionDAG &DAG) const
static_cast<const MipsTargetObjectFile *>(
getTargetMachine().getObjFileLowering());
- if (TLOF->IsConstantInSmallSection(N->getConstVal(), getTargetMachine()))
+ if (TLOF->IsConstantInSmallSection(DAG.getDataLayout(), N->getConstVal(),
+ getTargetMachine()))
// %gp_rel relocation
return getAddrGPRel(N, SDLoc(N), Ty, DAG);
@@ -2946,8 +2959,12 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
MipsCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1);
- Function::const_arg_iterator FuncArg =
- DAG.getMachineFunction().getFunction()->arg_begin();
+ const Function *Func = DAG.getMachineFunction().getFunction();
+ Function::const_arg_iterator FuncArg = Func->arg_begin();
+
+ if (Func->hasFnAttribute("interrupt") && !Func->arg_empty())
+ report_fatal_error(
+ "Functions with the interrupt attribute cannot have arguments!");
CCInfo.AnalyzeFormalArguments(Ins, CC_Mips_FixedArg);
MipsFI->setFormalArgInfo(CCInfo.getNextStackOffset(),
@@ -3019,7 +3036,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
// We ought to be able to use LocVT directly but O32 sets it to i32
// when allocating floating point values to integer registers.
// This shouldn't influence how we load the value into registers unless
- // we are targetting softfloat.
+ // we are targeting softfloat.
if (VA.getValVT().isFloatingPoint() && !Subtarget.useSoftFloat())
LocVT = VA.getValVT();
}
@@ -3033,9 +3050,10 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
// Create load nodes to retrieve arguments from the stack
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
- SDValue ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN,
- MachinePointerInfo::getFixedStack(FI),
- false, false, false, 0);
+ SDValue ArgValue = DAG.getLoad(
+ LocVT, DL, Chain, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
+ false, false, false, 0);
OutChains.push_back(ArgValue.getValue(1));
ArgValue = UnpackFromArgumentSlot(ArgValue, VA, Ins[i].ArgVT, DL, DAG);
@@ -3098,8 +3116,20 @@ MipsTargetLowering::shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const
}
SDValue
-MipsTargetLowering::LowerReturn(SDValue Chain,
- CallingConv::ID CallConv, bool IsVarArg,
+MipsTargetLowering::LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
+ SDLoc DL, SelectionDAG &DAG) const {
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+ MipsFI->setISR();
+
+ return DAG.getNode(MipsISD::ERet, DL, MVT::Other, RetOps);
+}
+
+SDValue
+MipsTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
SDLoc DL, SelectionDAG &DAG) const {
@@ -3192,7 +3222,11 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
if (Flag.getNode())
RetOps.push_back(Flag);
- // Return on Mips is always a "jr $ra"
+ // ISRs must use "eret".
+ if (DAG.getMachineFunction().getFunction()->hasFnAttribute("interrupt"))
+ return LowerInterruptReturn(RetOps, DL, DAG);
+
+ // Standard return on Mips is a "jr $ra"
return DAG.getNode(MipsISD::Ret, DL, MVT::Other, RetOps);
}
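For context, a handler that exercises this path could look as follows at the source level; this is a hypothetical illustration, assuming a frontend that lowers __attribute__((interrupt)) to the IR-level "interrupt" function attribute (the address and symbol names are made up):

// An ISR may take no arguments and is returned from with eret
// (MipsISD::ERet) rather than the usual "jr $ra".
static volatile unsigned *const DeviceStatus =
    reinterpret_cast<volatile unsigned *>(0xBF000000); // hypothetical MMIO address

__attribute__((interrupt)) void isr_handler(void) {
  *DeviceStatus = 0; // acknowledge the device
}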
@@ -3300,7 +3334,7 @@ static std::pair<bool, bool> parsePhysicalReg(StringRef C, StringRef &Prefix,
// Search for the first numeric character.
StringRef::const_iterator I, B = C.begin() + 1, E = C.end() - 1;
- I = std::find_if(B, E, std::ptr_fun(isdigit));
+ I = std::find_if(B, E, isdigit);
Prefix = StringRef(B, I - B);
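std::ptr_fun was deprecated in C++11 and removed in C++17, so the wrapper is unnecessary; a hedged sketch of the same search written with a lambda, which also avoids relying on a particular isdigit overload:

#include <algorithm>
#include <cctype>
#include <string>

// Find the first digit in a constraint string such as "{$f12}" (illustrative).
std::string::const_iterator findFirstDigit(const std::string &C) {
  return std::find_if(C.begin(), C.end(), [](unsigned char Ch) {
    return std::isdigit(Ch) != 0;
  });
}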
@@ -3669,7 +3703,7 @@ void MipsTargetLowering::passByValArg(
unsigned NumRegs = LastReg - FirstReg;
if (NumRegs) {
- const ArrayRef<MCPhysReg> ArgRegs = ABI.GetByValArgRegs();
+ ArrayRef<MCPhysReg> ArgRegs = ABI.GetByValArgRegs();
bool LeftoverBytes = (NumRegs * RegSizeInBytes > ByValSizeInBytes);
unsigned I = 0;
@@ -3755,7 +3789,7 @@ void MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
SDValue Chain, SDLoc DL,
SelectionDAG &DAG,
CCState &State) const {
- const ArrayRef<MCPhysReg> ArgRegs = ABI.GetVarArgRegs();
+ ArrayRef<MCPhysReg> ArgRegs = ABI.GetVarArgRegs();
unsigned Idx = State.getFirstUnallocated(ArgRegs);
unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes();
MVT RegTy = MVT::getIntegerVT(RegSizeInBytes * 8);
@@ -3812,7 +3846,7 @@ void MipsTargetLowering::HandleByVal(CCState *State, unsigned &Size,
if (State->getCallingConv() != CallingConv::Fast) {
unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes();
- const ArrayRef<MCPhysReg> IntArgRegs = ABI.GetByValArgRegs();
+ ArrayRef<MCPhysReg> IntArgRegs = ABI.GetByValArgRegs();
// FIXME: The O32 case actually describes no shadow registers.
const MCPhysReg *ShadowRegs =
ABI.IsO32() ? IntArgRegs.data() : Mips64DPRegs;
@@ -3860,8 +3894,7 @@ MipsTargetLowering::emitPseudoSELECT(MachineInstr *MI, MachineBasicBlock *BB,
// destination vreg to set, the condition code register to branch on, the
// true/false values to select between, and a branch opcode to use.
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator It = BB;
- ++It;
+ MachineFunction::iterator It = ++BB->getIterator();
// thisMBB:
// ...
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index b3d861d34da7..b33e125b81b7 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -67,6 +67,10 @@ namespace llvm {
// Return
Ret,
+ // Interrupt, exception, error trap Return
+ ERet,
+
+ // Software Exception Return.
EH_RETURN,
// Node used to extract integer from accumulator.
@@ -231,6 +235,9 @@ namespace llvm {
return MVT::i32;
}
+ bool isCheapToSpeculateCttz() const override;
+ bool isCheapToSpeculateCtlz() const override;
+
void LowerOperationWrapper(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
@@ -258,17 +265,25 @@ namespace llvm {
EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *MBB) const override;
- struct LTStr {
- bool operator()(const char *S1, const char *S2) const {
- return strcmp(S1, S2) < 0;
- }
- };
-
void HandleByVal(CCState *, unsigned &, unsigned) const override;
unsigned getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const override;
+ /// If a physical register, this returns the register that receives the
+ /// exception address on entry to an EH pad.
+ unsigned
+ getExceptionPointerRegister(const Constant *PersonalityFn) const override {
+ return ABI.IsN64() ? Mips::A0_64 : Mips::A0;
+ }
+
+ /// If a physical register, this returns the register that receives the
+ /// exception typeid on entry to a landing pad.
+ unsigned
+ getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
+ return ABI.IsN64() ? Mips::A1_64 : Mips::A1;
+ }
+
/// Returns true if a cast between SrcAS and DestAS is a noop.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
// Mips doesn't have any special address spaces so we just reserve
@@ -290,9 +305,10 @@ namespace llvm {
unsigned GOTFlag = IsN32OrN64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT;
SDValue GOT = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
getTargetNode(N, Ty, DAG, GOTFlag));
- SDValue Load = DAG.getLoad(Ty, DL, DAG.getEntryNode(), GOT,
- MachinePointerInfo::getGOT(), false, false,
- false, 0);
+ SDValue Load =
+ DAG.getLoad(Ty, DL, DAG.getEntryNode(), GOT,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()),
+ false, false, false, 0);
unsigned LoFlag = IsN32OrN64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO;
SDValue Lo = DAG.getNode(MipsISD::Lo, DL, Ty,
getTargetNode(N, Ty, DAG, LoFlag));
@@ -487,6 +503,9 @@ namespace llvm {
const SmallVectorImpl<SDValue> &OutVals,
SDLoc dl, SelectionDAG &DAG) const override;
+ SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps, SDLoc DL,
+ SelectionDAG &DAG) const;
+
bool shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const override;
// Inline asm support
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
index cb912253b28c..377260f89d10 100644
--- a/lib/Target/Mips/MipsInstrFPU.td
+++ b/lib/Target/Mips/MipsInstrFPU.td
@@ -136,7 +136,7 @@ multiclass ABSS_M<string opstr, InstrItinClass Itin,
multiclass ROUND_M<string opstr, InstrItinClass Itin> {
def _D32 : MMRel, ABSS_FT<opstr, FGR32Opnd, AFGR64Opnd, Itin>, FGR_32;
- def _D64 : ABSS_FT<opstr, FGR32Opnd, FGR64Opnd, Itin>, FGR_64 {
+ def _D64 : StdMMR6Rel, ABSS_FT<opstr, FGR32Opnd, FGR64Opnd, Itin>, FGR_64 {
let DecoderNamespace = "Mips64";
}
}
@@ -267,24 +267,25 @@ defm D64 : C_COND_M<"d", FGR64Opnd, 17, II_C_CC_D>, ISA_MIPS1_NOT_32R6_64R6,
//===----------------------------------------------------------------------===//
// Floating Point Instructions
//===----------------------------------------------------------------------===//
-def ROUND_W_S : MMRel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>,
+def ROUND_W_S : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>,
ABSS_FM<0xc, 16>, ISA_MIPS2;
-def TRUNC_W_S : MMRel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>,
+defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>, ISA_MIPS2;
+def TRUNC_W_S : MMRel, StdMMR6Rel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>,
ABSS_FM<0xd, 16>, ISA_MIPS2;
-def CEIL_W_S : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>,
+def CEIL_W_S : MMRel, StdMMR6Rel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>,
ABSS_FM<0xe, 16>, ISA_MIPS2;
-def FLOOR_W_S : MMRel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>,
+def FLOOR_W_S : MMRel, StdMMR6Rel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>,
ABSS_FM<0xf, 16>, ISA_MIPS2;
def CVT_W_S : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>,
ABSS_FM<0x24, 16>;
-defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>, ISA_MIPS2;
defm TRUNC_W : ROUND_M<"trunc.w.d", II_TRUNC>, ABSS_FM<0xd, 17>, ISA_MIPS2;
defm CEIL_W : ROUND_M<"ceil.w.d", II_CEIL>, ABSS_FM<0xe, 17>, ISA_MIPS2;
defm FLOOR_W : ROUND_M<"floor.w.d", II_FLOOR>, ABSS_FM<0xf, 17>, ISA_MIPS2;
defm CVT_W : ROUND_M<"cvt.w.d", II_CVT>, ABSS_FM<0x24, 17>;
let DecoderNamespace = "Mips64" in {
+ let AdditionalPredicates = [NotInMicroMips] in {
def ROUND_L_S : ABSS_FT<"round.l.s", FGR64Opnd, FGR32Opnd, II_ROUND>,
ABSS_FM<0x8, 16>, FGR_64;
def ROUND_L_D64 : ABSS_FT<"round.l.d", FGR64Opnd, FGR64Opnd, II_ROUND>,
@@ -301,14 +302,17 @@ let DecoderNamespace = "Mips64" in {
ABSS_FM<0xb, 16>, FGR_64;
def FLOOR_L_D64 : ABSS_FT<"floor.l.d", FGR64Opnd, FGR64Opnd, II_FLOOR>,
ABSS_FM<0xb, 17>, FGR_64;
+ }
}
def CVT_S_W : MMRel, ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, II_CVT>,
ABSS_FM<0x20, 20>;
-def CVT_L_S : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>,
- ABSS_FM<0x25, 16>, INSN_MIPS3_32R2;
-def CVT_L_D64: MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>,
- ABSS_FM<0x25, 17>, INSN_MIPS3_32R2;
+let AdditionalPredicates = [NotInMicroMips] in{
+ def CVT_L_S : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>,
+ ABSS_FM<0x25, 16>, INSN_MIPS3_32R2;
+ def CVT_L_D64: MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>,
+ ABSS_FM<0x25, 17>, INSN_MIPS3_32R2;
+}
def CVT_S_D32 : MMRel, ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
ABSS_FM<0x20, 17>, FGR_32;
@@ -320,8 +324,10 @@ def CVT_D32_S : MMRel, ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, II_CVT>,
let DecoderNamespace = "Mips64" in {
def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32Opnd, FGR64Opnd, II_CVT>,
ABSS_FM<0x20, 17>, FGR_64;
- def CVT_S_L : ABSS_FT<"cvt.s.l", FGR32Opnd, FGR64Opnd, II_CVT>,
- ABSS_FM<0x20, 21>, FGR_64;
+ let AdditionalPredicates = [NotInMicroMips] in{
+ def CVT_S_L : ABSS_FT<"cvt.s.l", FGR32Opnd, FGR64Opnd, II_CVT>,
+ ABSS_FM<0x20, 21>, FGR_64;
+ }
def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64Opnd, FGR32Opnd, II_CVT>,
ABSS_FM<0x21, 20>, FGR_64;
def CVT_D64_S : ABSS_FT<"cvt.d.s", FGR64Opnd, FGR32Opnd, II_CVT>,
@@ -345,8 +351,8 @@ def FNEG_S : MMRel, ABSS_FT<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>,
defm FABS : ABSS_M<"abs.d", II_ABS, fabs>, ABSS_FM<0x5, 17>;
defm FNEG : ABSS_M<"neg.d", II_NEG, fneg>, ABSS_FM<0x7, 17>;
-def FSQRT_S : MMRel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S, fsqrt>,
- ABSS_FM<0x4, 16>, ISA_MIPS2;
+def FSQRT_S : MMRel, StdMMR6Rel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd,
+ II_SQRT_S, fsqrt>, ABSS_FM<0x4, 16>, ISA_MIPS2;
defm FSQRT : ABSS_M<"sqrt.d", II_SQRT_D, fsqrt>, ABSS_FM<0x4, 17>, ISA_MIPS2;
// The odd-numbered registers are only referenced when doing loads,
@@ -503,13 +509,13 @@ let AdditionalPredicates = [NoNaNsFPMath],
def MIPS_BRANCH_F : PatLeaf<(i32 0)>;
def MIPS_BRANCH_T : PatLeaf<(i32 1)>;
-def BC1F : MMRel, BC1F_FT<"bc1f", brtarget, IIBranch, MIPS_BRANCH_F>,
+def BC1F : MMRel, BC1F_FT<"bc1f", brtarget, II_BC1F, MIPS_BRANCH_F>,
BC1F_FM<0, 0>, ISA_MIPS1_NOT_32R6_64R6;
-def BC1FL : MMRel, BC1F_FT<"bc1fl", brtarget, IIBranch, MIPS_BRANCH_F, 0>,
+def BC1FL : MMRel, BC1F_FT<"bc1fl", brtarget, II_BC1FL, MIPS_BRANCH_F, 0>,
BC1F_FM<1, 0>, ISA_MIPS2_NOT_32R6_64R6;
-def BC1T : MMRel, BC1F_FT<"bc1t", brtarget, IIBranch, MIPS_BRANCH_T>,
+def BC1T : MMRel, BC1F_FT<"bc1t", brtarget, II_BC1T, MIPS_BRANCH_T>,
BC1F_FM<0, 1>, ISA_MIPS1_NOT_32R6_64R6;
-def BC1TL : MMRel, BC1F_FT<"bc1tl", brtarget, IIBranch, MIPS_BRANCH_T, 0>,
+def BC1TL : MMRel, BC1F_FT<"bc1tl", brtarget, II_BC1TL, MIPS_BRANCH_T, 0>,
BC1F_FM<1, 1>, ISA_MIPS2_NOT_32R6_64R6;
/// Floating Point Compare
diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td
index 5f4fcc354616..45baf27be518 100644
--- a/lib/Target/Mips/MipsInstrFormats.td
+++ b/lib/Target/Mips/MipsInstrFormats.td
@@ -132,7 +132,7 @@ class PseudoSE<dag outs, dag ins, list<dag> pattern,
// These are aliases that require C++ handling to convert to the target
// instruction, while InstAliases can be handled directly by tblgen.
class MipsAsmPseudoInst<dag outs, dag ins, string asmstr>:
- MipsInst<outs, ins, asmstr, [], IIPseudo, Pseudo> {
+ MipsInst<outs, ins, asmstr, [], IIPseudo, Pseudo>, PredicateControl {
let isPseudo = 1;
let Pattern = [];
}
@@ -644,16 +644,16 @@ class BRK_FM<bits<6> funct> : StdArch
// Exception return format <Cop0|1|0|funct>
//===----------------------------------------------------------------------===//
-class ER_FM<bits<6> funct> : StdArch
+class ER_FM<bits<6> funct, bit LLBit> : StdArch
{
bits<32> Inst;
let Inst{31-26} = 0x10;
let Inst{25} = 1;
- let Inst{24-6} = 0;
+ let Inst{24-7} = 0;
+ let Inst{6} = LLBit;
let Inst{5-0} = funct;
}
-
//===----------------------------------------------------------------------===//
// Enable/disable interrupt instruction format <Cop0|MFMC0|rt|12|0|sc|0|0>
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index bb23cc04e696..b1d69506c16f 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -60,8 +60,8 @@ MachineMemOperand *MipsInstrInfo::GetMemOperand(MachineBasicBlock &MBB, int FI,
MachineFrameInfo &MFI = *MF.getFrameInfo();
unsigned Align = MFI.getObjectAlignment(FI);
- return MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), Flag,
- MFI.getObjectSize(FI), Align);
+ return MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI),
+ Flag, MFI.getObjectSize(FI), Align);
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index ab98c9054e74..d9fb8c890739 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -77,6 +77,9 @@ def MipsThreadPointer: SDNode<"MipsISD::ThreadPointer", SDT_MipsThreadPointer>;
def MipsRet : SDNode<"MipsISD::Ret", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def MipsERet : SDNode<"MipsISD::ERet", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPSideEffect]>;
+
// These are target-independent nodes, but have target-specific formats.
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_MipsCallSeqStart,
[SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
@@ -157,7 +160,7 @@ def HasMips3 : Predicate<"Subtarget->hasMips3()">,
def HasMips4_32 : Predicate<"Subtarget->hasMips4_32()">,
AssemblerPredicate<"FeatureMips4_32">;
def NotMips4_32 : Predicate<"!Subtarget->hasMips4_32()">,
- AssemblerPredicate<"FeatureMips4_32">;
+ AssemblerPredicate<"!FeatureMips4_32">;
def HasMips4_32r2 : Predicate<"Subtarget->hasMips4_32r2()">,
AssemblerPredicate<"FeatureMips4_32r2">;
def HasMips5_32r2 : Predicate<"Subtarget->hasMips5_32r2()">,
@@ -166,6 +169,8 @@ def HasMips32 : Predicate<"Subtarget->hasMips32()">,
AssemblerPredicate<"FeatureMips32">;
def HasMips32r2 : Predicate<"Subtarget->hasMips32r2()">,
AssemblerPredicate<"FeatureMips32r2">;
+def HasMips32r5 : Predicate<"Subtarget->hasMips32r5()">,
+ AssemblerPredicate<"FeatureMips32r5">;
def HasMips32r6 : Predicate<"Subtarget->hasMips32r6()">,
AssemblerPredicate<"FeatureMips32r6">;
def NotMips32r6 : Predicate<"!Subtarget->hasMips32r6()">,
@@ -176,6 +181,8 @@ def IsGP32bit : Predicate<"!Subtarget->isGP64bit()">,
AssemblerPredicate<"!FeatureGP64Bit">;
def HasMips64 : Predicate<"Subtarget->hasMips64()">,
AssemblerPredicate<"FeatureMips64">;
+def NotMips64 : Predicate<"!Subtarget->hasMips64()">,
+ AssemblerPredicate<"!FeatureMips64">;
def HasMips64r2 : Predicate<"Subtarget->hasMips64r2()">,
AssemblerPredicate<"FeatureMips64r2">;
def HasMips64r6 : Predicate<"Subtarget->hasMips64r6()">,
@@ -184,6 +191,8 @@ def NotMips64r6 : Predicate<"!Subtarget->hasMips64r6()">,
AssemblerPredicate<"!FeatureMips64r6">;
def HasMicroMips32r6 : Predicate<"Subtarget->inMicroMips32r6Mode()">,
AssemblerPredicate<"FeatureMicroMips,FeatureMips32r6">;
+def HasMicroMips64r6 : Predicate<"Subtarget->inMicroMips64r6Mode()">,
+ AssemblerPredicate<"FeatureMicroMips,FeatureMips64r6">;
def InMips16Mode : Predicate<"Subtarget->inMips16Mode()">,
AssemblerPredicate<"FeatureMips16">;
def HasCnMips : Predicate<"Subtarget->hasCnMips()">,
@@ -201,6 +210,12 @@ def NotInMicroMips : Predicate<"!Subtarget->inMicroMipsMode()">,
def IsLE : Predicate<"Subtarget->isLittle()">;
def IsBE : Predicate<"!Subtarget->isLittle()">;
def IsNotNaCl : Predicate<"!Subtarget->isTargetNaCl()">;
+def UseTCCInDIV : AssemblerPredicate<"FeatureUseTCCInDIV">;
+def HasEVA : Predicate<"Subtarget->hasEVA()">,
+ AssemblerPredicate<"FeatureEVA,FeatureMips32r2">;
+def HasMSA : Predicate<"Subtarget->hasMSA()">,
+ AssemblerPredicate<"FeatureMSA">;
+
//===----------------------------------------------------------------------===//
// Mips GPR size adjectives.
@@ -242,6 +257,7 @@ class ISA_MIPS32R2 { list<Predicate> InsnPredicates = [HasMips32r2]; }
class ISA_MIPS32R2_NOT_32R6_64R6 {
list<Predicate> InsnPredicates = [HasMips32r2, NotMips32r6, NotMips64r6];
}
+class ISA_MIPS32R5 { list<Predicate> InsnPredicates = [HasMips32r5]; }
class ISA_MIPS64 { list<Predicate> InsnPredicates = [HasMips64]; }
class ISA_MIPS64_NOT_64R6 {
list<Predicate> InsnPredicates = [HasMips64, NotMips64r6];
@@ -249,9 +265,21 @@ class ISA_MIPS64_NOT_64R6 {
class ISA_MIPS64R2 { list<Predicate> InsnPredicates = [HasMips64r2]; }
class ISA_MIPS32R6 { list<Predicate> InsnPredicates = [HasMips32r6]; }
class ISA_MIPS64R6 { list<Predicate> InsnPredicates = [HasMips64r6]; }
+class ISA_MICROMIPS { list<Predicate> InsnPredicates = [InMicroMips]; }
class ISA_MICROMIPS32R6 {
list<Predicate> InsnPredicates = [HasMicroMips32r6];
}
+class ISA_MICROMIPS64R6 {
+ list<Predicate> InsnPredicates = [HasMicroMips64r6];
+}
+class ISA_MICROMIPS32_NOT_MIPS32R6 {
+ list<Predicate> InsnPredicates = [InMicroMips, NotMips32r6];
+}
+
+class INSN_EVA { list<Predicate> InsnPredicates = [HasEVA]; }
+class INSN_EVA_NOT_32R6_64R6 {
+ list<Predicate> InsnPredicates = [NotMips32r6, NotMips64r6, HasEVA];
+}
// The portions of MIPS-III that were also added to MIPS32
class INSN_MIPS3_32 { list<Predicate> InsnPredicates = [HasMips3_32]; }
@@ -283,6 +311,28 @@ class INSN_MIPS5_32R2_NOT_32R6_64R6 {
list<Predicate> InsnPredicates = [HasMips5_32r2, NotMips32r6, NotMips64r6];
}
+class ASE_CNMIPS {
+ list<Predicate> InsnPredicates = [HasCnMips];
+}
+
+class ASE_MSA {
+ list<Predicate> InsnPredicates = [HasMSA];
+}
+
+class ASE_MSA_NOT_MSA64 {
+ list<Predicate> InsnPredicates = [HasMSA, NotMips64];
+}
+
+class ASE_MSA64 {
+ list<Predicate> InsnPredicates = [HasMSA, HasMips64];
+}
+
+// Class used for separating microMIPSr6 and microMIPS (r3) instructions.
+// It can only be used on instructions that don't inherit PredicateControl.
+class ISA_MICROMIPS_NOT_32R6_64R6 : PredicateControl {
+ let InsnPredicates = [InMicroMips, NotMips32r6, NotMips64r6];
+}
+
//===----------------------------------------------------------------------===//
class MipsPat<dag pattern, dag result> : Pat<pattern, result>, PredicateControl {
@@ -335,6 +385,81 @@ include "MipsInstrFormats.td"
// Mips Operand, Complex Patterns and Transformations Definitions.
//===----------------------------------------------------------------------===//
+class ConstantSImmAsmOperandClass<int Bits, list<AsmOperandClass> Supers = []>
+ : AsmOperandClass {
+ let Name = "ConstantSImm" # Bits;
+ let RenderMethod = "addImmOperands";
+ let PredicateMethod = "isConstantSImm<" # Bits # ">";
+ let SuperClasses = Supers;
+ let DiagnosticType = "SImm" # Bits;
+}
+
+class ConstantUImmAsmOperandClass<int Bits, list<AsmOperandClass> Supers = [],
+ int Offset = 0> : AsmOperandClass {
+ let Name = "ConstantUImm" # Bits # "_" # Offset;
+ let RenderMethod = "addConstantUImmOperands<" # Bits # ", " # Offset # ">";
+ let PredicateMethod = "isConstantUImm<" # Bits # ", " # Offset # ">";
+ let SuperClasses = Supers;
+ let DiagnosticType = "UImm" # Bits # "_" # Offset;
+}
+
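A rough sketch of the range check such a PredicateMethod implies (assumed semantics, inferred from the Bits/Offset naming and the render methods above):

#include <cstdint>

// E.g. Bits = 5, Offset = 32 accepts 32..63; Bits = 2, Offset = 1 accepts 1..4.
bool isConstantUImmInRange(int64_t Value, unsigned Bits, int64_t Offset) {
  return Value >= Offset && Value < Offset + (int64_t(1) << Bits);
}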
+def ConstantUImm10AsmOperandClass
+ : ConstantUImmAsmOperandClass<10, []>;
+def ConstantUImm8AsmOperandClass
+ : ConstantUImmAsmOperandClass<8, [ConstantUImm10AsmOperandClass]>;
+def ConstantUImm7AsmOperandClass
+ : ConstantUImmAsmOperandClass<7, [ConstantUImm8AsmOperandClass]>;
+def ConstantUImm6AsmOperandClass
+ : ConstantUImmAsmOperandClass<6, [ConstantUImm7AsmOperandClass]>;
+def ConstantSImm6AsmOperandClass
+ : ConstantSImmAsmOperandClass<6, [ConstantUImm7AsmOperandClass]>;
+def ConstantUImm5Plus1AsmOperandClass
+ : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass], 1>;
+def ConstantUImm5Plus32AsmOperandClass
+ : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass], 32>;
+def ConstantUImm5Plus33AsmOperandClass
+ : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass], 33>;
+def ConstantUImm5Plus32NormalizeAsmOperandClass
+ : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass], 32> {
+ let Name = "ConstantUImm5_32_Norm";
+ // We must also subtract 32 when we render the operand.
+ let RenderMethod = "addConstantUImmOperands<5, 32, -32>";
+}
+def ConstantUImm5Lsl2AsmOperandClass : AsmOperandClass {
+ let Name = "UImm5Lsl2";
+ let RenderMethod = "addImmOperands";
+ let PredicateMethod = "isScaledUImm<5, 2>";
+ let SuperClasses = [ConstantUImm6AsmOperandClass];
+ let DiagnosticType = "UImm5_Lsl2";
+}
+def ConstantUImm5ReportUImm6AsmOperandClass
+ : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass]> {
+ let Name = "ConstantUImm5_0_Report_UImm6";
+ let DiagnosticType = "UImm5_0_Report_UImm6";
+}
+def ConstantUImm5AsmOperandClass
+ : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass]>;
+def ConstantUImm4AsmOperandClass
+ : ConstantUImmAsmOperandClass<
+ 4, [ConstantUImm5AsmOperandClass,
+ ConstantUImm5Plus32AsmOperandClass,
+ ConstantUImm5Plus32NormalizeAsmOperandClass]>;
+def ConstantUImm3AsmOperandClass
+ : ConstantUImmAsmOperandClass<3, [ConstantUImm4AsmOperandClass]>;
+def ConstantUImm2Plus1AsmOperandClass
+ : ConstantUImmAsmOperandClass<2, [ConstantUImm3AsmOperandClass], 1>;
+def ConstantUImm2AsmOperandClass
+ : ConstantUImmAsmOperandClass<2, [ConstantUImm3AsmOperandClass]>;
+def ConstantUImm1AsmOperandClass
+ : ConstantUImmAsmOperandClass<1, [ConstantUImm2AsmOperandClass]>;
+def ConstantImmzAsmOperandClass : AsmOperandClass {
+ let Name = "ConstantImmz";
+ let RenderMethod = "addConstantUImmOperands<1>";
+ let PredicateMethod = "isConstantImmz";
+ let SuperClasses = [ConstantUImm1AsmOperandClass];
+ let DiagnosticType = "Immz";
+}
+
def MipsJumpTargetAsmOperand : AsmOperandClass {
let Name = "JumpTarget";
let ParserMethod = "parseJumpTarget";
@@ -360,6 +485,10 @@ def calltarget : Operand<iPTR> {
def imm64: Operand<i64>;
+def simm6 : Operand<i32> {
+ let ParserMatchClass = ConstantSImm6AsmOperandClass;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
def simm9 : Operand<i32>;
def simm10 : Operand<i32>;
def simm11 : Operand<i32>;
@@ -380,23 +509,12 @@ def simm18_lsl3 : Operand<i32> {
let ParserMatchClass = MipsJumpTargetAsmOperand;
}
-def simm20 : Operand<i32> {
-}
+def simm20 : Operand<i32>;
+def simm32 : Operand<i32>;
def uimm20 : Operand<i32> {
}
-def MipsUImm10AsmOperand : AsmOperandClass {
- let Name = "UImm10";
- let RenderMethod = "addImmOperands";
- let ParserMethod = "parseImm";
- let PredicateMethod = "isUImm<10>";
-}
-
-def uimm10 : Operand<i32> {
- let ParserMatchClass = MipsUImm10AsmOperand;
-}
-
def simm16_64 : Operand<i64> {
let DecoderMethod = "DecodeSimm16";
}
@@ -404,23 +522,71 @@ def simm16_64 : Operand<i64> {
// Zero
def uimmz : Operand<i32> {
let PrintMethod = "printUnsignedImm";
+ let ParserMatchClass = ConstantImmzAsmOperandClass;
+}
+
+// Unsigned Operands
+foreach I = {1, 2, 3, 4, 5, 6, 7, 8, 10} in
+ def uimm # I : Operand<i32> {
+ let PrintMethod = "printUnsignedImm";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("ConstantUImm" # I # "AsmOperandClass");
+ }
+
+def uimm2_plus1 : Operand<i32> {
+ let PrintMethod = "printUnsignedImm";
+ let EncoderMethod = "getUImmWithOffsetEncoding<2, 1>";
+ let DecoderMethod = "DecodeUImmWithOffset<2, 1>";
+ let ParserMatchClass = ConstantUImm2Plus1AsmOperandClass;
}
-// Unsigned Operand
-def uimm2 : Operand<i32> {
+def uimm5_plus1 : Operand<i32> {
let PrintMethod = "printUnsignedImm";
+ let EncoderMethod = "getUImmWithOffsetEncoding<5, 1>";
+ let DecoderMethod = "DecodeUImmWithOffset<5, 1>";
+ let ParserMatchClass = ConstantUImm5Plus1AsmOperandClass;
}
-def uimm3 : Operand<i32> {
+def uimm5_plus32 : Operand<i32> {
let PrintMethod = "printUnsignedImm";
+ let ParserMatchClass = ConstantUImm5Plus32AsmOperandClass;
}
-def uimm5 : Operand<i32> {
+def uimm5_plus33 : Operand<i32> {
let PrintMethod = "printUnsignedImm";
+ let EncoderMethod = "getUImmWithOffsetEncoding<5, 1>";
+ let DecoderMethod = "DecodeUImmWithOffset<5, 1>";
+ let ParserMatchClass = ConstantUImm5Plus33AsmOperandClass;
}
-def uimm6 : Operand<i32> {
+def uimm5_plus32_normalize : Operand<i32> {
let PrintMethod = "printUnsignedImm";
+ let ParserMatchClass = ConstantUImm5Plus32NormalizeAsmOperandClass;
+}
+
+def uimm5_lsl2 : Operand<OtherVT> {
+ let EncoderMethod = "getUImm5Lsl2Encoding";
+ let DecoderMethod = "DecodeUImm5lsl2";
+ let ParserMatchClass = ConstantUImm5Lsl2AsmOperandClass;
+}
+
+def uimm5_plus32_normalize_64 : Operand<i64> {
+ let PrintMethod = "printUnsignedImm";
+ let ParserMatchClass = ConstantUImm5Plus32NormalizeAsmOperandClass;
+}
+
+foreach I = {5} in
+ def uimm # I # _64 : Operand<i64> {
+ let PrintMethod = "printUnsignedImm";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("ConstantUImm" # I # "AsmOperandClass");
+ }
+
+// Like uimm5_64 but reports a less confusing error for 32-63 when
+// an instruction alias permits that.
+def uimm5_64_report_uimm6 : Operand<i64> {
+ let PrintMethod = "printUnsignedImm";
+ let ParserMatchClass = ConstantUImm5ReportUImm6AsmOperandClass;
}
def uimm16 : Operand<i32> {
@@ -435,6 +601,22 @@ def MipsMemAsmOperand : AsmOperandClass {
let ParserMethod = "parseMemOperand";
}
+def MipsMemSimm9AsmOperand : AsmOperandClass {
+ let Name = "MemOffsetSimm9";
+ let SuperClasses = [MipsMemAsmOperand];
+ let RenderMethod = "addMemOperands";
+ let ParserMethod = "parseMemOperand";
+ let PredicateMethod = "isMemWithSimmOffset<9>";
+}
+
+def MipsMemSimm9GPRAsmOperand : AsmOperandClass {
+ let Name = "MemOffsetSimm9GPR";
+ let SuperClasses = [MipsMemAsmOperand];
+ let RenderMethod = "addMemOperands";
+ let ParserMethod = "parseMemOperand";
+ let PredicateMethod = "isMemWithSimmOffsetGPR<9>";
+}
+
def MipsMemSimm11AsmOperand : AsmOperandClass {
let Name = "MemOffsetSimm11";
let SuperClasses = [MipsMemAsmOperand];
@@ -485,6 +667,13 @@ def mem_msa : mem_generic {
def mem_simm9 : mem_generic {
let MIOperandInfo = (ops ptr_rc, simm9);
let EncoderMethod = "getMemEncoding";
+ let ParserMatchClass = MipsMemSimm9AsmOperand;
+}
+
+def mem_simm9gpr : mem_generic {
+ let MIOperandInfo = (ops ptr_rc, simm9);
+ let EncoderMethod = "getMemEncoding";
+ let ParserMatchClass = MipsMemSimm9GPRAsmOperand;
}
def mem_simm11 : mem_generic {
@@ -512,12 +701,6 @@ def PtrRC : Operand<iPTR> {
let ParserMatchClass = GPR32AsmOperand;
}
-// size operand of ext instruction
-def size_ext : Operand<i32> {
- let EncoderMethod = "getSizeExtEncoding";
- let DecoderMethod = "DecodeExtSize";
-}
-
// size operand of ins instruction
def size_ins : Operand<i32> {
let EncoderMethod = "getSizeInsEncoding";
@@ -657,7 +840,7 @@ class shift_rotate_reg<string opstr, RegisterOperand RO, InstrItinClass itin,
[(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs))], itin, FrmR,
opstr>;
-// Load Upper Imediate
+// Load Upper Immediate
class LoadUpper<string opstr, RegisterOperand RO, Operand Imm>:
InstSE<(outs RO:$rt), (ins Imm:$imm16), !strconcat(opstr, "\t$rt, $imm16"),
[], II_LUI, FrmI, opstr>, IsAsCheapAsAMove {
@@ -675,14 +858,19 @@ class Load<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag,
let mayLoad = 1;
}
-class Store<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag,
+class StoreMemory<string opstr, DAGOperand RO, DAGOperand MO,
+ SDPatternOperator OpNode = null_frag,
InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> :
- InstSE<(outs), (ins RO:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+ InstSE<(outs), (ins RO:$rt, MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
[(OpNode RO:$rt, Addr:$addr)], Itin, FrmI, opstr> {
let DecoderMethod = "DecodeMem";
let mayStore = 1;
}
+class Store<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag,
+ InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> :
+ StoreMemory<opstr, RO, mem, OpNode, Itin, Addr>;
+
// Load/Store Left/Right
let canFoldAsLoad = 1 in
class LoadLeftRight<string opstr, SDNode OpNode, RegisterOperand RO,
@@ -740,7 +928,7 @@ class CBranch<string opstr, DAGOperand opnd, PatFrag cond_op,
RegisterOperand RO, bit DelaySlot = 1> :
InstSE<(outs), (ins RO:$rs, RO:$rt, opnd:$offset),
!strconcat(opstr, "\t$rs, $rt, $offset"),
- [(brcond (i32 (cond_op RO:$rs, RO:$rt)), bb:$offset)], IIBranch,
+ [(brcond (i32 (cond_op RO:$rs, RO:$rt)), bb:$offset)], II_BCC,
FrmI, opstr> {
let isBranch = 1;
let isTerminator = 1;
@@ -752,7 +940,7 @@ class CBranchZero<string opstr, DAGOperand opnd, PatFrag cond_op,
RegisterOperand RO, bit DelaySlot = 1> :
InstSE<(outs), (ins RO:$rs, opnd:$offset),
!strconcat(opstr, "\t$rs, $offset"),
- [(brcond (i32 (cond_op RO:$rs, 0)), bb:$offset)], IIBranch,
+ [(brcond (i32 (cond_op RO:$rs, 0)), bb:$offset)], II_BCCZ,
FrmI, opstr> {
let isBranch = 1;
let isTerminator = 1;
@@ -778,7 +966,7 @@ class SetCC_I<string opstr, PatFrag cond_op, Operand Od, PatLeaf imm_type,
class JumpFJ<DAGOperand opnd, string opstr, SDPatternOperator operator,
SDPatternOperator targetoperator, string bopstr> :
InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"),
- [(operator targetoperator:$target)], IIBranch, FrmJ, bopstr> {
+ [(operator targetoperator:$target)], II_J, FrmJ, bopstr> {
let isTerminator=1;
let isBarrier=1;
let hasDelaySlot = 1;
@@ -788,7 +976,7 @@ class JumpFJ<DAGOperand opnd, string opstr, SDPatternOperator operator,
// Unconditional branch
class UncondBranch<Instruction BEQInst> :
- PseudoSE<(outs), (ins brtarget:$offset), [(br bb:$offset)], IIBranch>,
+ PseudoSE<(outs), (ins brtarget:$offset), [(br bb:$offset)], II_B>,
PseudoInstExpansion<(BEQInst ZERO, ZERO, brtarget:$offset)> {
let isBranch = 1;
let isTerminator = 1;
@@ -802,7 +990,7 @@ class UncondBranch<Instruction BEQInst> :
let isTerminator=1, isBarrier=1, hasDelaySlot = 1 in
class JumpFR<string opstr, RegisterOperand RO,
SDPatternOperator operator = null_frag>:
- InstSE<(outs), (ins RO:$rs), "jr\t$rs", [(operator RO:$rs)], IIBranch,
+ InstSE<(outs), (ins RO:$rs), "jr\t$rs", [(operator RO:$rs)], II_JR,
FrmR, opstr>;
// Indirect branch
@@ -815,23 +1003,23 @@ class IndirectBranch<string opstr, RegisterOperand RO> : JumpFR<opstr, RO> {
let isCall=1, hasDelaySlot=1, Defs = [RA] in {
class JumpLink<string opstr, DAGOperand opnd> :
InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"),
- [(MipsJmpLink imm:$target)], IIBranch, FrmJ, opstr> {
+ [(MipsJmpLink imm:$target)], II_JAL, FrmJ, opstr> {
let DecoderMethod = "DecodeJumpTarget";
}
class JumpLinkRegPseudo<RegisterOperand RO, Instruction JALRInst,
Register RetReg, RegisterOperand ResRO = RO>:
- PseudoSE<(outs), (ins RO:$rs), [(MipsJmpLink RO:$rs)], IIBranch>,
+ PseudoSE<(outs), (ins RO:$rs), [(MipsJmpLink RO:$rs)], II_JALR>,
PseudoInstExpansion<(JALRInst RetReg, ResRO:$rs)>;
class JumpLinkReg<string opstr, RegisterOperand RO>:
InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
- [], IIBranch, FrmR>;
+ [], II_JALR, FrmR, opstr>;
class BGEZAL_FT<string opstr, DAGOperand opnd,
RegisterOperand RO, bit DelaySlot = 1> :
InstSE<(outs), (ins RO:$rs, opnd:$offset),
- !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI, opstr> {
+ !strconcat(opstr, "\t$rs, $offset"), [], II_BCCZAL, FrmI, opstr> {
let hasDelaySlot = DelaySlot;
}
@@ -840,17 +1028,17 @@ let isCall=1, hasDelaySlot=1, Defs = [RA] in {
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasDelaySlot = 1,
hasExtraSrcRegAllocReq = 1, Defs = [AT] in {
class TailCall<Instruction JumpInst> :
- PseudoSE<(outs), (ins calltarget:$target), [], IIBranch>,
+ PseudoSE<(outs), (ins calltarget:$target), [], II_J>,
PseudoInstExpansion<(JumpInst jmptarget:$target)>;
class TailCallReg<RegisterOperand RO, Instruction JRInst,
RegisterOperand ResRO = RO> :
- PseudoSE<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], IIBranch>,
+ PseudoSE<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], II_JR>,
PseudoInstExpansion<(JRInst ResRO:$rs)>;
}
class BAL_BR_Pseudo<Instruction RealInst> :
- PseudoSE<(outs), (ins brtarget:$offset), [], IIBranch>,
+ PseudoSE<(outs), (ins brtarget:$offset), [], II_BCCZAL>,
PseudoInstExpansion<(RealInst ZERO, brtarget:$offset)> {
let isBranch = 1;
let isTerminator = 1;
@@ -997,9 +1185,10 @@ class SignExtInReg<string opstr, ValueType vt, RegisterOperand RO,
[(set RO:$rd, (sext_inreg RO:$rt, vt))], itin, FrmR, opstr>;
// Subword Swap
-class SubwordSwap<string opstr, RegisterOperand RO>:
- InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"), [],
- NoItinerary, FrmR, opstr> {
+class SubwordSwap<string opstr, RegisterOperand RO,
+ InstrItinClass itin = NoItinerary>:
+ InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"), [], itin,
+ FrmR, opstr> {
let hasSideEffects = 0;
}
@@ -1010,8 +1199,8 @@ class ReadHardware<RegisterOperand CPURegOperand, RegisterOperand RO> :
// Ext and Ins
class ExtBase<string opstr, RegisterOperand RO, Operand PosOpnd,
- SDPatternOperator Op = null_frag>:
- InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, size_ext:$size),
+ Operand SizeOpnd, SDPatternOperator Op = null_frag> :
+ InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, SizeOpnd:$size),
!strconcat(opstr, " $rt, $rs, $pos, $size"),
[(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size))], II_EXT,
FrmR, opstr>, ISA_MIPS32R2;
@@ -1074,6 +1263,9 @@ class TrapBase<Instruction RealInst>
let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, hasCtrlDep=1 in
def RetRA : PseudoSE<(outs), (ins), [(MipsRet)]>;
+let isReturn=1, isTerminator=1, isBarrier=1, hasCtrlDep=1, hasSideEffects=1 in
+def ERet : PseudoSE<(outs), (ins), [(MipsERet)]>;
+
let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt),
[(callseq_start timm:$amt)]>;
@@ -1215,10 +1407,11 @@ def LH : Load<"lh", GPR32Opnd, sextloadi16, II_LH, addrDefault>, MMRel,
LW_FM<0x21>;
def LHu : Load<"lhu", GPR32Opnd, zextloadi16, II_LHU>, MMRel, LW_FM<0x25>;
let AdditionalPredicates = [NotInMicroMips] in {
-def LW : Load<"lw", GPR32Opnd, load, II_LW, addrDefault>, MMRel,
+def LW : StdMMR6Rel, Load<"lw", GPR32Opnd, load, II_LW, addrDefault>, MMRel,
LW_FM<0x23>;
}
-def SB : Store<"sb", GPR32Opnd, truncstorei8, II_SB>, MMRel, LW_FM<0x28>;
+def SB : StdMMR6Rel, Store<"sb", GPR32Opnd, truncstorei8, II_SB>, MMRel,
+ LW_FM<0x28>;
def SH : Store<"sh", GPR32Opnd, truncstorei16, II_SH>, MMRel, LW_FM<0x29>;
let AdditionalPredicates = [NotInMicroMips] in {
def SW : Store<"sw", GPR32Opnd, store, II_SW>, MMRel, LW_FM<0x2b>;
@@ -1259,15 +1452,17 @@ let DecoderNamespace = "COP3_" in {
}
}
-def SYNC : MMRel, SYNC_FT<"sync">, SYNC_FM, ISA_MIPS32;
-def SYNCI : MMRel, SYNCI_FT<"synci">, SYNCI_FM, ISA_MIPS32R2;
+def SYNC : MMRel, StdMMR6Rel, SYNC_FT<"sync">, SYNC_FM, ISA_MIPS32;
+def SYNCI : MMRel, StdMMR6Rel, SYNCI_FT<"synci">, SYNCI_FM, ISA_MIPS32R2;
-def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM<0x34>, ISA_MIPS2;
-def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM<0x30>, ISA_MIPS2;
-def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd>, TEQ_FM<0x31>, ISA_MIPS2;
-def TLT : MMRel, TEQ_FT<"tlt", GPR32Opnd>, TEQ_FM<0x32>, ISA_MIPS2;
-def TLTU : MMRel, TEQ_FT<"tltu", GPR32Opnd>, TEQ_FM<0x33>, ISA_MIPS2;
-def TNE : MMRel, TEQ_FT<"tne", GPR32Opnd>, TEQ_FM<0x36>, ISA_MIPS2;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM<0x34>, ISA_MIPS2;
+ def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM<0x30>, ISA_MIPS2;
+ def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd>, TEQ_FM<0x31>, ISA_MIPS2;
+ def TLT : MMRel, TEQ_FT<"tlt", GPR32Opnd>, TEQ_FM<0x32>, ISA_MIPS2;
+ def TLTU : MMRel, TEQ_FT<"tltu", GPR32Opnd>, TEQ_FM<0x33>, ISA_MIPS2;
+ def TNE : MMRel, TEQ_FT<"tne", GPR32Opnd>, TEQ_FM<0x36>, ISA_MIPS2;
+}
def TEQI : MMRel, TEQI_FT<"teqi", GPR32Opnd>, TEQI_FM<0xc>,
ISA_MIPS2_NOT_32R6_64R6;
@@ -1290,14 +1485,15 @@ def TRAP : TrapBase<BREAK>;
def SDBBP : MMRel, SYS_FT<"sdbbp">, SDBBP_FM, ISA_MIPS32_NOT_32R6_64R6;
let AdditionalPredicates = [NotInMicroMips] in {
-def ERET : MMRel, ER_FT<"eret">, ER_FM<0x18>, INSN_MIPS3_32;
+ def ERET : MMRel, ER_FT<"eret">, ER_FM<0x18, 0x0>, INSN_MIPS3_32;
+ def ERETNC : MMRel, ER_FT<"eretnc">, ER_FM<0x18, 0x1>, ISA_MIPS32R5;
+ def DERET : MMRel, ER_FT<"deret">, ER_FM<0x1f, 0x0>, ISA_MIPS32;
}
-def DERET : MMRel, ER_FT<"deret">, ER_FM<0x1f>, ISA_MIPS32;
let AdditionalPredicates = [NotInMicroMips] in {
-def EI : MMRel, StdMMR6Rel, DEI_FT<"ei", GPR32Opnd>, EI_FM<1>, ISA_MIPS32R2;
+ def EI : MMRel, StdMMR6Rel, DEI_FT<"ei", GPR32Opnd>, EI_FM<1>, ISA_MIPS32R2;
+ def DI : MMRel, StdMMR6Rel, DEI_FT<"di", GPR32Opnd>, EI_FM<0>, ISA_MIPS32R2;
}
-def DI : MMRel, DEI_FT<"di", GPR32Opnd>, EI_FM<0>, ISA_MIPS32R2;
let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug
AdditionalPredicates = [NotInMicroMips] in {
@@ -1359,7 +1555,8 @@ def TAILCALL_R : TailCallReg<GPR32Opnd, JR>;
// Indirect branches are matched as PseudoIndirectBranch/PseudoIndirectBranch64
// then are expanded to JR, JR64, JALR, or JALR64 depending on the ISA.
class PseudoIndirectBranchBase<RegisterOperand RO> :
- MipsPseudo<(outs), (ins RO:$rs), [(brind RO:$rs)], IIBranch> {
+ MipsPseudo<(outs), (ins RO:$rs), [(brind RO:$rs)],
+ II_IndirectBranchPseudo> {
let isTerminator=1;
let isBarrier=1;
let hasDelaySlot = 1;
@@ -1369,12 +1566,12 @@ class PseudoIndirectBranchBase<RegisterOperand RO> :
def PseudoIndirectBranch : PseudoIndirectBranchBase<GPR32Opnd>;
-// Return instructions are matched as a RetRA instruction, then ar expanded
+// Return instructions are matched as a RetRA instruction, then are expanded
// into PseudoReturn/PseudoReturn64 after register allocation. Finally,
// MipsAsmPrinter expands this into JR, JR64, JALR, or JALR64 depending on the
// ISA.
class PseudoReturnBase<RegisterOperand RO> : MipsPseudo<(outs), (ins RO:$rs),
- [], IIBranch> {
+ [], II_ReturnPseudo> {
let isTerminator = 1;
let isBarrier = 1;
let hasDelaySlot = 1;
@@ -1441,8 +1638,11 @@ def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM<0x20>,
def CLO : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM<0x21>,
ISA_MIPS32_NOT_32R6_64R6;
-/// Word Swap Bytes Within Halfwords
-def WSBH : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM<2, 0x20>, ISA_MIPS32R2;
+let AdditionalPredicates = [NotInMicroMips] in {
+ /// Word Swap Bytes Within Halfwords
+ def WSBH : MMRel, SubwordSwap<"wsbh", GPR32Opnd, II_WSBH>, SEB_FM<2, 0x20>,
+ ISA_MIPS32R2;
+}
/// No operation.
def NOP : PseudoSE<(outs), (ins), []>, PseudoInstExpansion<(SLL ZERO, ZERO, 0)>;
@@ -1485,10 +1685,12 @@ def PseudoSDIV : MultDivPseudo<SDIV, ACC64, GPR32Opnd, MipsDivRem, II_DIV,
0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
def PseudoUDIV : MultDivPseudo<UDIV, ACC64, GPR32Opnd, MipsDivRemU, II_DIVU,
0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
-
+let AdditionalPredicates = [NotInMicroMips] in {
def RDHWR : MMRel, ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM;
-
-def EXT : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, MipsExt>, EXT_FM<0>;
+}
+// TODO: Add '0 < pos+size <= 32' constraint check to ext instruction
+def EXT : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, uimm5_plus1, MipsExt>,
+ EXT_FM<0>;
def INS : MMRel, InsBase<"ins", GPR32Opnd, uimm5, MipsIns>, EXT_FM<4>;
/// Move Control Registers From/To CPU Registers
@@ -1499,9 +1701,9 @@ def MTC2 : MTC3OP<"mtc2", COP2Opnd, GPR32Opnd>, MFC3OP_FM<0x12, 4>;
class Barrier<string asmstr> : InstSE<(outs), (ins), asmstr, [], NoItinerary,
FrmOther, asmstr>;
-def SSNOP : MMRel, Barrier<"ssnop">, BARRIER_FM<1>;
+def SSNOP : MMRel, StdMMR6Rel, Barrier<"ssnop">, BARRIER_FM<1>;
def EHB : MMRel, Barrier<"ehb">, BARRIER_FM<3>;
-def PAUSE : MMRel, Barrier<"pause">, BARRIER_FM<5>, ISA_MIPS32R2;
+def PAUSE : MMRel, StdMMR6Rel, Barrier<"pause">, BARRIER_FM<5>, ISA_MIPS32R2;
// JR_HB and JALR_HB are defined here using the new style naming
// scheme because some of this code is shared with Mips32r6InstrInfo.td
@@ -1562,11 +1764,60 @@ def CACHE : MMRel, CacheOp<"cache", mem>, CACHEOP_FM<0b101111>,
def PREF : MMRel, CacheOp<"pref", mem>, CACHEOP_FM<0b110011>,
INSN_MIPS3_32_NOT_32R6_64R6;
+def ROL : MipsAsmPseudoInst<(outs),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd),
+ "rol\t$rs, $rt, $rd">;
+def ROLImm : MipsAsmPseudoInst<(outs),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm),
+ "rol\t$rs, $rt, $imm">;
+def : MipsInstAlias<"rol $rd, $rs",
+ (ROL GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>;
+def : MipsInstAlias<"rol $rd, $imm",
+ (ROLImm GPR32Opnd:$rd, GPR32Opnd:$rd, simm16:$imm), 0>;
+
+def ROR : MipsAsmPseudoInst<(outs),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd),
+ "ror\t$rs, $rt, $rd">;
+def RORImm : MipsAsmPseudoInst<(outs),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm),
+ "ror\t$rs, $rt, $imm">;
+def : MipsInstAlias<"ror $rd, $rs",
+ (ROR GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>;
+def : MipsInstAlias<"ror $rd, $imm",
+ (RORImm GPR32Opnd:$rd, GPR32Opnd:$rd, simm16:$imm), 0>;
+
+def DROL : MipsAsmPseudoInst<(outs),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd),
+ "drol\t$rs, $rt, $rd">, ISA_MIPS64;
+def DROLImm : MipsAsmPseudoInst<(outs),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm),
+ "drol\t$rs, $rt, $imm">, ISA_MIPS64;
+def : MipsInstAlias<"drol $rd, $rs",
+ (DROL GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>, ISA_MIPS64;
+def : MipsInstAlias<"drol $rd, $imm",
+ (DROLImm GPR32Opnd:$rd, GPR32Opnd:$rd, simm16:$imm), 0>, ISA_MIPS64;
+
+def DROR : MipsAsmPseudoInst<(outs),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd),
+ "dror\t$rs, $rt, $rd">, ISA_MIPS64;
+def DRORImm : MipsAsmPseudoInst<(outs),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm),
+ "dror\t$rs, $rt, $imm">, ISA_MIPS64;
+def : MipsInstAlias<"dror $rd, $rs",
+ (DROR GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>, ISA_MIPS64;
+def : MipsInstAlias<"dror $rd, $imm",
+ (DRORImm GPR32Opnd:$rd, GPR32Opnd:$rd, simm16:$imm), 0>, ISA_MIPS64;
+
//===----------------------------------------------------------------------===//
// Instruction aliases
//===----------------------------------------------------------------------===//
def : MipsInstAlias<"move $dst, $src",
- (ADDu GPR32Opnd:$dst, GPR32Opnd:$src,ZERO), 1>,
+ (OR GPR32Opnd:$dst, GPR32Opnd:$src, ZERO), 1>,
+ GPR_32 {
+ let AdditionalPredicates = [NotInMicroMips];
+}
+def : MipsInstAlias<"move $dst, $src",
+ (ADDu GPR32Opnd:$dst, GPR32Opnd:$src, ZERO), 1>,
GPR_32 {
let AdditionalPredicates = [NotInMicroMips];
}
@@ -1630,27 +1881,27 @@ def : MipsInstAlias<"beqz $rs,$offset",
def : MipsInstAlias<"beqzl $rs,$offset",
(BEQL GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
def : MipsInstAlias<"syscall", (SYSCALL 0), 1>;
-
+
def : MipsInstAlias<"break", (BREAK 0, 0), 1>;
def : MipsInstAlias<"break $imm", (BREAK uimm10:$imm, 0), 1>;
let AdditionalPredicates = [NotInMicroMips] in {
-def : MipsInstAlias<"ei", (EI ZERO), 1>, ISA_MIPS32R2;
-}
-def : MipsInstAlias<"di", (DI ZERO), 1>, ISA_MIPS32R2;
-
-def : MipsInstAlias<"teq $rs, $rt",
- (TEQ GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
-def : MipsInstAlias<"tge $rs, $rt",
- (TGE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
-def : MipsInstAlias<"tgeu $rs, $rt",
- (TGEU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
-def : MipsInstAlias<"tlt $rs, $rt",
- (TLT GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
-def : MipsInstAlias<"tltu $rs, $rt",
- (TLTU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
-def : MipsInstAlias<"tne $rs, $rt",
- (TNE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
-
+ def : MipsInstAlias<"ei", (EI ZERO), 1>, ISA_MIPS32R2;
+ def : MipsInstAlias<"di", (DI ZERO), 1>, ISA_MIPS32R2;
+}
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsInstAlias<"teq $rs, $rt",
+ (TEQ GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+ def : MipsInstAlias<"tge $rs, $rt",
+ (TGE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+ def : MipsInstAlias<"tgeu $rs, $rt",
+ (TGEU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+ def : MipsInstAlias<"tlt $rs, $rt",
+ (TLT GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+ def : MipsInstAlias<"tltu $rs, $rt",
+ (TLTU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+ def : MipsInstAlias<"tne $rs, $rt",
+ (TNE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+}
def : MipsInstAlias<"sll $rd, $rt, $rs",
(SLLV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
def : MipsInstAlias<"sub, $rd, $rs, $imm",
@@ -1678,7 +1929,7 @@ def : MipsInstAlias<"sync",
class LoadImmediate32<string instr_asm, Operand Od, RegisterOperand RO> :
MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32),
!strconcat(instr_asm, "\t$rt, $imm32")> ;
-def LoadImm32 : LoadImmediate32<"li", uimm5, GPR32Opnd>;
+def LoadImm32 : LoadImmediate32<"li", simm32, GPR32Opnd>;
class LoadAddressFromReg32<string instr_asm, Operand MemOpnd,
RegisterOperand RO> :
@@ -1689,13 +1940,16 @@ def LoadAddrReg32 : LoadAddressFromReg32<"la", mem, GPR32Opnd>;
class LoadAddressFromImm32<string instr_asm, Operand Od, RegisterOperand RO> :
MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32),
!strconcat(instr_asm, "\t$rt, $imm32")> ;
-def LoadAddrImm32 : LoadAddressFromImm32<"la", uimm5, GPR32Opnd>;
+def LoadAddrImm32 : LoadAddressFromImm32<"la", simm32, GPR32Opnd>;
def JalTwoReg : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins GPR32Opnd:$rs),
"jal\t$rd, $rs"> ;
def JalOneReg : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs),
"jal\t$rs"> ;
+def NORImm : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm),
+ "nor\t$rs, $rt, $imm"> ;
+
let hasDelaySlot = 1 in {
def BneImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rt),
(ins imm64:$imm64, brtarget:$offset),
@@ -1718,12 +1972,62 @@ def BLTU : CondBranchPseudo<"bltu">;
def BLEU : CondBranchPseudo<"bleu">;
def BGEU : CondBranchPseudo<"bgeu">;
def BGTU : CondBranchPseudo<"bgtu">;
+def BLTL : CondBranchPseudo<"bltl">, ISA_MIPS2_NOT_32R6_64R6;
+def BLEL : CondBranchPseudo<"blel">, ISA_MIPS2_NOT_32R6_64R6;
+def BGEL : CondBranchPseudo<"bgel">, ISA_MIPS2_NOT_32R6_64R6;
+def BGTL : CondBranchPseudo<"bgtl">, ISA_MIPS2_NOT_32R6_64R6;
+def BLTUL: CondBranchPseudo<"bltul">, ISA_MIPS2_NOT_32R6_64R6;
+def BLEUL: CondBranchPseudo<"bleul">, ISA_MIPS2_NOT_32R6_64R6;
+def BGEUL: CondBranchPseudo<"bgeul">, ISA_MIPS2_NOT_32R6_64R6;
+def BGTUL: CondBranchPseudo<"bgtul">, ISA_MIPS2_NOT_32R6_64R6;
+
+class CondBranchImmPseudo<string instr_asm> :
+ MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, imm64:$imm, brtarget:$offset),
+ !strconcat(instr_asm, "\t$rs, $imm, $offset")>;
+
+def BLTImmMacro : CondBranchImmPseudo<"blt">;
+def BLEImmMacro : CondBranchImmPseudo<"ble">;
+def BGEImmMacro : CondBranchImmPseudo<"bge">;
+def BGTImmMacro : CondBranchImmPseudo<"bgt">;
+def BLTUImmMacro : CondBranchImmPseudo<"bltu">;
+def BLEUImmMacro : CondBranchImmPseudo<"bleu">;
+def BGEUImmMacro : CondBranchImmPseudo<"bgeu">;
+def BGTUImmMacro : CondBranchImmPseudo<"bgtu">;
+def BLTLImmMacro : CondBranchImmPseudo<"bltl">, ISA_MIPS2_NOT_32R6_64R6;
+def BLELImmMacro : CondBranchImmPseudo<"blel">, ISA_MIPS2_NOT_32R6_64R6;
+def BGELImmMacro : CondBranchImmPseudo<"bgel">, ISA_MIPS2_NOT_32R6_64R6;
+def BGTLImmMacro : CondBranchImmPseudo<"bgtl">, ISA_MIPS2_NOT_32R6_64R6;
+def BLTULImmMacro : CondBranchImmPseudo<"bltul">, ISA_MIPS2_NOT_32R6_64R6;
+def BLEULImmMacro : CondBranchImmPseudo<"bleul">, ISA_MIPS2_NOT_32R6_64R6;
+def BGEULImmMacro : CondBranchImmPseudo<"bgeul">, ISA_MIPS2_NOT_32R6_64R6;
+def BGTULImmMacro : CondBranchImmPseudo<"bgtul">, ISA_MIPS2_NOT_32R6_64R6;
+
+// FIXME: Predicates are removed because instructions are matched regardless of
+// predicates, because PredicateControl was not in the hierarchy. This was
+// done to emit more precise error messages from the expansion function.
+// Once the tablegen-erated errors are made better, this needs to be fixed and
+// the predicates need to be restored.
+
+def SDivMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+ "div\t$rs, $rt">; //, ISA_MIPS1_NOT_32R6_64R6;
+
+def UDivMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+ "divu\t$rs, $rt">; //, ISA_MIPS1_NOT_32R6_64R6;
+
+def DSDivMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+ "ddiv\t$rs, $rt">; //, ISA_MIPS64_NOT_64R6;
+
+def DUDivMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+ "ddivu\t$rs, $rt">; //, ISA_MIPS64_NOT_64R6;
+
+def Ulh : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr),
+ "ulh\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6;
def Ulhu : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr),
- "ulhu\t$rt, $addr">, ISA_MIPS1_NOT_32R6_64R6;
+ "ulhu\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6;
def Ulw : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr),
- "ulw\t$rt, $addr">, ISA_MIPS1_NOT_32R6_64R6;
+ "ulw\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6;
//===----------------------------------------------------------------------===//
// Arbitrary patterns that map to one or more instructions
@@ -1939,6 +2243,16 @@ let AddedComplexity = 40 in {
}
}
+// Atomic load patterns.
+def : MipsPat<(atomic_load_8 addr:$a), (LB addr:$a)>;
+def : MipsPat<(atomic_load_16 addr:$a), (LH addr:$a)>;
+def : MipsPat<(atomic_load_32 addr:$a), (LW addr:$a)>;
+
+// Atomic store patterns.
+def : MipsPat<(atomic_store_8 addr:$a, GPR32:$v), (SB GPR32:$v, addr:$a)>;
+def : MipsPat<(atomic_store_16 addr:$a, GPR32:$v), (SH GPR32:$v, addr:$a)>;
+def : MipsPat<(atomic_store_32 addr:$a, GPR32:$v), (SW GPR32:$v, addr:$a)>;
+
//===----------------------------------------------------------------------===//
// Floating Point Support
//===----------------------------------------------------------------------===//
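
The atomic load/store patterns added above let monotonic (relaxed) atomic accesses select directly to the ordinary lb/lh/lw and sb/sh/sw encodings. A minimal sketch of source code that exercises them, under the assumption that any stronger ordering is enforced by separately emitted sync barriers rather than by these patterns:

    // Illustrative C++ only; not part of this patch.
    #include <atomic>

    int load_relaxed(const std::atomic<int> &A) {
      // Selected through atomic_load_32 -> LW on MIPS32.
      return A.load(std::memory_order_relaxed);
    }

    void store_relaxed(std::atomic<int> &A, int V) {
      // Selected through atomic_store_32 -> SW on MIPS32.
      A.store(V, std::memory_order_relaxed);
    }
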
@@ -1964,6 +2278,10 @@ include "MipsDSPInstrInfo.td"
include "MipsMSAInstrFormats.td"
include "MipsMSAInstrInfo.td"
+// EVA
+include "MipsEVAInstrFormats.td"
+include "MipsEVAInstrInfo.td"
+
// Micromips
include "MicroMipsInstrFormats.td"
include "MicroMipsInstrInfo.td"
@@ -1972,3 +2290,11 @@ include "MicroMipsInstrFPU.td"
// Micromips r6
include "MicroMips32r6InstrFormats.td"
include "MicroMips32r6InstrInfo.td"
+
+// Micromips64 r6
+include "MicroMips64r6InstrFormats.td"
+include "MicroMips64r6InstrInfo.td"
+
+// Micromips DSP
+include "MicroMipsDSPInstrFormats.td"
+include "MicroMipsDSPInstrInfo.td"
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp
index 90f8cc0cacfd..49fb99a8ec43 100644
--- a/lib/Target/Mips/MipsLongBranch.cpp
+++ b/lib/Target/Mips/MipsLongBranch.cpp
@@ -148,7 +148,7 @@ void MipsLongBranch::splitMBB(MachineBasicBlock *MBB) {
// Insert NewMBB and fix control flow.
MachineBasicBlock *Tgt = getTargetMBB(*FirstBr);
NewMBB->transferSuccessors(MBB);
- NewMBB->removeSuccessor(Tgt);
+ NewMBB->removeSuccessor(Tgt, true);
MBB->addSuccessor(NewMBB);
MBB->addSuccessor(Tgt);
MF->insert(std::next(MachineFunction::iterator(MBB)), NewMBB);
@@ -161,7 +161,7 @@ void MipsLongBranch::initMBBInfo() {
// Split the MBBs if they have two branches. Each basic block should have at
// most one branch after this loop is executed.
for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E;)
- splitMBB(I++);
+ splitMBB(&*I++);
MF->RenumberBlocks();
MBBInfos.clear();
@@ -262,8 +262,7 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
static_cast<const MipsInstrInfo *>(Subtarget.getInstrInfo());
MF->insert(FallThroughMBB, LongBrMBB);
- MBB->removeSuccessor(TgtMBB);
- MBB->addSuccessor(LongBrMBB);
+ MBB->replaceSuccessor(TgtMBB, LongBrMBB);
if (IsPIC) {
MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB);
@@ -434,7 +433,7 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
I.Br->addOperand(MachineOperand::CreateMBB(LongBrMBB));
} else
// Change branch destination and reverse condition.
- replaceBranch(*MBB, I.Br, DL, FallThroughMBB);
+ replaceBranch(*MBB, I.Br, DL, &*FallThroughMBB);
}
static void emitGPDisp(MachineFunction &F, const MipsInstrInfo *TII) {
diff --git a/lib/Target/Mips/MipsMSAInstrFormats.td b/lib/Target/Mips/MipsMSAInstrFormats.td
index bff2d0fab1ec..7d25ea56e3d5 100644
--- a/lib/Target/Mips/MipsMSAInstrFormats.td
+++ b/lib/Target/Mips/MipsMSAInstrFormats.td
@@ -7,18 +7,12 @@
//
//===----------------------------------------------------------------------===//
-def HasMSA : Predicate<"Subtarget->hasMSA()">,
- AssemblerPredicate<"FeatureMSA">;
-
-class MSAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> {
- let Predicates = [HasMSA];
+class MSAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>,
+ PredicateControl, ASE_MSA {
+ let EncodingPredicates = [HasStdEnc];
let Inst{31-26} = 0b011110;
}
-class MSA64Inst : MSAInst {
- let Predicates = [HasMSA, HasMips64];
-}
-
class MSACBranch : MSAInst {
let Inst{31-26} = 0b010001;
}
@@ -27,10 +21,6 @@ class MSASpecial : MSAInst {
let Inst{31-26} = 0b000000;
}
-class MSA64Special : MSA64Inst {
- let Inst{31-26} = 0b000000;
-}
-
class MSAPseudo<dag outs, dag ins, list<dag> pattern,
InstrItinClass itin = IIPseudo>:
MipsPseudo<outs, ins, pattern, itin> {
@@ -100,7 +90,7 @@ class MSA_2R_FILL_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSAInst {
let Inst{5-0} = minor;
}
-class MSA_2R_FILL_D_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSA64Inst {
+class MSA_2R_FILL_D_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSAInst {
bits<5> rs;
bits<5> wd;
@@ -293,7 +283,7 @@ class MSA_ELM_COPY_W_FMT<bits<4> major, bits<6> minor>: MSAInst {
let Inst{5-0} = minor;
}
-class MSA_ELM_COPY_D_FMT<bits<4> major, bits<6> minor>: MSA64Inst {
+class MSA_ELM_COPY_D_FMT<bits<4> major, bits<6> minor>: MSAInst {
bits<4> n;
bits<5> ws;
bits<5> rd;
@@ -345,7 +335,7 @@ class MSA_ELM_INSERT_W_FMT<bits<4> major, bits<6> minor>: MSAInst {
let Inst{5-0} = minor;
}
-class MSA_ELM_INSERT_D_FMT<bits<4> major, bits<6> minor>: MSA64Inst {
+class MSA_ELM_INSERT_D_FMT<bits<4> major, bits<6> minor>: MSAInst {
bits<6> n;
bits<5> rs;
bits<5> wd;
@@ -450,7 +440,7 @@ class SPECIAL_LSA_FMT<bits<6> minor>: MSASpecial {
let Inst{5-0} = minor;
}
-class SPECIAL_DLSA_FMT<bits<6> minor>: MSA64Special {
+class SPECIAL_DLSA_FMT<bits<6> minor>: MSASpecial {
bits<5> rs;
bits<5> rt;
bits<5> rd;
diff --git a/lib/Target/Mips/MipsMSAInstrInfo.td b/lib/Target/Mips/MipsMSAInstrInfo.td
index 970e98ea9e1e..eacfcec78bc7 100644
--- a/lib/Target/Mips/MipsMSAInstrInfo.td
+++ b/lib/Target/Mips/MipsMSAInstrInfo.td
@@ -63,30 +63,13 @@ def MipsVExtractSExt : SDNode<"MipsISD::VEXTRACT_SEXT_ELT",
def MipsVExtractZExt : SDNode<"MipsISD::VEXTRACT_ZEXT_ELT",
SDTypeProfile<1, 3, [SDTCisPtrTy<2>]>, []>;
+def immZExt1Ptr : ImmLeaf<iPTR, [{return isUInt<1>(Imm);}]>;
+def immZExt2Ptr : ImmLeaf<iPTR, [{return isUInt<2>(Imm);}]>;
def immZExt4Ptr : ImmLeaf<iPTR, [{return isUInt<4>(Imm);}]>;
def immZExt6Ptr : ImmLeaf<iPTR, [{return isUInt<6>(Imm);}]>;
// Operands
-// The immediate of an LSA instruction needs special handling
-// as the encoded value should be subtracted by one.
-def uimm2LSAAsmOperand : AsmOperandClass {
- let Name = "LSAImm";
- let ParserMethod = "parseLSAImm";
- let RenderMethod = "addImmOperands";
-}
-
-def LSAImm : Operand<i32> {
- let PrintMethod = "printUnsignedImm";
- let EncoderMethod = "getLSAImmEncoding";
- let DecoderMethod = "DecodeLSAImm";
- let ParserMatchClass = uimm2LSAAsmOperand;
-}
-
-def uimm4 : Operand<i32> {
- let PrintMethod = "printUnsignedImm8";
-}
-
def uimm4_ptr : Operand<iPTR> {
let PrintMethod = "printUnsignedImm8";
}
@@ -95,10 +78,6 @@ def uimm6_ptr : Operand<iPTR> {
let PrintMethod = "printUnsignedImm8";
}
-def uimm8 : Operand<i32> {
- let PrintMethod = "printUnsignedImm8";
-}
-
def simm5 : Operand<i32>;
def vsplat_uimm1 : Operand<vAny> {
@@ -639,7 +618,6 @@ class COPY_S_D_ENC : MSA_ELM_COPY_D_FMT<0b0010, 0b011001>;
class COPY_U_B_ENC : MSA_ELM_COPY_B_FMT<0b0011, 0b011001>;
class COPY_U_H_ENC : MSA_ELM_COPY_H_FMT<0b0011, 0b011001>;
class COPY_U_W_ENC : MSA_ELM_COPY_W_FMT<0b0011, 0b011001>;
-class COPY_U_D_ENC : MSA_ELM_COPY_D_FMT<0b0011, 0b011001>;
class CTCMSA_ENC : MSA_ELM_CTCMSA_FMT<0b0000111110, 0b011001>;
@@ -1195,47 +1173,14 @@ class MSA_BIT_D_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
InstrItinClass Itinerary = itin;
}
-// This class is deprecated and will be removed soon.
-class MSA_BIT_B_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
- RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
- InstrItinClass itin = NoItinerary> {
- dag OutOperandList = (outs ROWD:$wd);
- dag InOperandList = (ins ROWS:$ws, uimm3:$m);
- string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
- list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt3:$m))];
- InstrItinClass Itinerary = itin;
-}
-
-// This class is deprecated and will be removed soon.
-class MSA_BIT_H_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
- RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
- InstrItinClass itin = NoItinerary> {
- dag OutOperandList = (outs ROWD:$wd);
- dag InOperandList = (ins ROWS:$ws, uimm4:$m);
- string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
- list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt4:$m))];
- InstrItinClass Itinerary = itin;
-}
-
-// This class is deprecated and will be removed soon.
-class MSA_BIT_W_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
- RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
- InstrItinClass itin = NoItinerary> {
- dag OutOperandList = (outs ROWD:$wd);
- dag InOperandList = (ins ROWS:$ws, uimm5:$m);
- string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
- list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt5:$m))];
- InstrItinClass Itinerary = itin;
-}
-
-// This class is deprecated and will be removed soon.
-class MSA_BIT_D_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
- RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
- InstrItinClass itin = NoItinerary> {
+class MSA_BIT_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ Operand ImmOp, ImmLeaf Imm, RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
dag OutOperandList = (outs ROWD:$wd);
- dag InOperandList = (ins ROWS:$ws, uimm6:$m);
+ dag InOperandList = (ins ROWS:$ws, ImmOp:$m);
string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
- list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt6:$m))];
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
InstrItinClass Itinerary = itin;
}
@@ -1291,13 +1236,14 @@ class MSA_COPY_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
}
class MSA_ELM_SLD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
- RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ RegisterOperand ROWD, RegisterOperand ROWS,
+ Operand ImmOp, ImmLeaf Imm,
InstrItinClass itin = NoItinerary> {
dag OutOperandList = (outs ROWD:$wd);
- dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, uimm4:$n);
+ dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, ImmOp:$n);
string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$n]");
list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, ROWS:$ws,
- immZExt4:$n))];
+ Imm:$n))];
string Constraints = "$wd = $wd_in";
InstrItinClass Itinerary = itin;
}
@@ -1479,7 +1425,7 @@ class MSA_CBRANCH_DESC_BASE<string instr_asm, RegisterOperand ROWD> {
dag InOperandList = (ins ROWD:$wt, brtarget:$offset);
string AsmString = !strconcat(instr_asm, "\t$wt, $offset");
list<dag> Pattern = [];
- InstrItinClass Itinerary = IIBranch;
+ InstrItinClass Itinerary = NoItinerary;
bit isBranch = 1;
bit isTerminator = 1;
bit hasDelaySlot = 1;
@@ -1519,13 +1465,14 @@ class MSA_INSERT_VIDX_PSEUDO_BASE<SDPatternOperator OpNode, ValueType Ty,
}
class MSA_INSVE_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
- RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ Operand ImmOp, ImmLeaf Imm, RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
InstrItinClass itin = NoItinerary> {
dag OutOperandList = (outs ROWD:$wd);
- dag InOperandList = (ins ROWD:$wd_in, uimm6:$n, ROWS:$ws, uimmz:$n2);
+ dag InOperandList = (ins ROWD:$wd_in, ImmOp:$n, ROWS:$ws, uimmz:$n2);
string AsmString = !strconcat(instr_asm, "\t$wd[$n], $ws[$n2]");
list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in,
- immZExt6:$n,
+ Imm:$n,
ROWS:$ws,
immz:$n2))];
InstrItinClass Itinerary = itin;
@@ -1934,8 +1881,6 @@ class COPY_U_H_DESC : MSA_COPY_DESC_BASE<"copy_u.h", vextract_zext_i16, v8i16,
GPR32Opnd, MSA128HOpnd>;
class COPY_U_W_DESC : MSA_COPY_DESC_BASE<"copy_u.w", vextract_zext_i32, v4i32,
GPR32Opnd, MSA128WOpnd>;
-class COPY_U_D_DESC : MSA_COPY_DESC_BASE<"copy_u.d", vextract_zext_i64, v2i64,
- GPR64Opnd, MSA128DOpnd>;
class COPY_FW_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE<vector_extract, v4f32, FGR32,
MSA128W>;
@@ -2346,13 +2291,13 @@ class INSERT_FW_VIDX64_PSEUDO_DESC :
class INSERT_FD_VIDX64_PSEUDO_DESC :
MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v2f64, MSA128DOpnd, FGR64Opnd, GPR64Opnd>;
-class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", insve_v16i8,
+class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", insve_v16i8, uimm4, immZExt4,
MSA128BOpnd>;
-class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", insve_v8i16,
+class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", insve_v8i16, uimm3, immZExt3,
MSA128HOpnd>;
-class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", insve_v4i32,
+class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", insve_v4i32, uimm2, immZExt2,
MSA128WOpnd>;
-class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", insve_v2i64,
+class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", insve_v2i64, uimm1, immZExt1,
MSA128DOpnd>;
class LD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -2381,7 +2326,7 @@ class LSA_DESC_BASE<string instr_asm, RegisterOperand RORD,
RegisterOperand RORS = RORD, RegisterOperand RORT = RORD,
InstrItinClass itin = NoItinerary > {
dag OutOperandList = (outs RORD:$rd);
- dag InOperandList = (ins RORS:$rs, RORT:$rt, LSAImm:$sa);
+ dag InOperandList = (ins RORS:$rs, RORT:$rt, uimm2_plus1:$sa);
string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $sa");
list<dag> Pattern = [(set RORD:$rd, (add RORT:$rt,
(shl RORS:$rs,
@@ -2561,23 +2506,23 @@ class PCNT_H_DESC : MSA_2R_DESC_BASE<"pcnt.h", ctpop, MSA128HOpnd>;
class PCNT_W_DESC : MSA_2R_DESC_BASE<"pcnt.w", ctpop, MSA128WOpnd>;
class PCNT_D_DESC : MSA_2R_DESC_BASE<"pcnt.d", ctpop, MSA128DOpnd>;
-class SAT_S_B_DESC : MSA_BIT_B_X_DESC_BASE<"sat_s.b", int_mips_sat_s_b,
- MSA128BOpnd>;
-class SAT_S_H_DESC : MSA_BIT_H_X_DESC_BASE<"sat_s.h", int_mips_sat_s_h,
- MSA128HOpnd>;
-class SAT_S_W_DESC : MSA_BIT_W_X_DESC_BASE<"sat_s.w", int_mips_sat_s_w,
- MSA128WOpnd>;
-class SAT_S_D_DESC : MSA_BIT_D_X_DESC_BASE<"sat_s.d", int_mips_sat_s_d,
- MSA128DOpnd>;
-
-class SAT_U_B_DESC : MSA_BIT_B_X_DESC_BASE<"sat_u.b", int_mips_sat_u_b,
- MSA128BOpnd>;
-class SAT_U_H_DESC : MSA_BIT_H_X_DESC_BASE<"sat_u.h", int_mips_sat_u_h,
- MSA128HOpnd>;
-class SAT_U_W_DESC : MSA_BIT_W_X_DESC_BASE<"sat_u.w", int_mips_sat_u_w,
- MSA128WOpnd>;
-class SAT_U_D_DESC : MSA_BIT_D_X_DESC_BASE<"sat_u.d", int_mips_sat_u_d,
- MSA128DOpnd>;
+class SAT_S_B_DESC : MSA_BIT_X_DESC_BASE<"sat_s.b", int_mips_sat_s_b, uimm3,
+ immZExt3, MSA128BOpnd>;
+class SAT_S_H_DESC : MSA_BIT_X_DESC_BASE<"sat_s.h", int_mips_sat_s_h, uimm4,
+ immZExt4, MSA128HOpnd>;
+class SAT_S_W_DESC : MSA_BIT_X_DESC_BASE<"sat_s.w", int_mips_sat_s_w, uimm5,
+ immZExt5, MSA128WOpnd>;
+class SAT_S_D_DESC : MSA_BIT_X_DESC_BASE<"sat_s.d", int_mips_sat_s_d, uimm6,
+ immZExt6, MSA128DOpnd>;
+
+class SAT_U_B_DESC : MSA_BIT_X_DESC_BASE<"sat_u.b", int_mips_sat_u_b, uimm3,
+ immZExt3, MSA128BOpnd>;
+class SAT_U_H_DESC : MSA_BIT_X_DESC_BASE<"sat_u.h", int_mips_sat_u_h, uimm4,
+ immZExt4, MSA128HOpnd>;
+class SAT_U_W_DESC : MSA_BIT_X_DESC_BASE<"sat_u.w", int_mips_sat_u_w, uimm5,
+ immZExt5, MSA128WOpnd>;
+class SAT_U_D_DESC : MSA_BIT_X_DESC_BASE<"sat_u.d", int_mips_sat_u_d, uimm6,
+ immZExt6, MSA128DOpnd>;
class SHF_B_DESC : MSA_I8_SHF_DESC_BASE<"shf.b", MSA128BOpnd>;
class SHF_H_DESC : MSA_I8_SHF_DESC_BASE<"shf.h", MSA128HOpnd>;
@@ -2589,13 +2534,17 @@ class SLD_W_DESC : MSA_3R_SLD_DESC_BASE<"sld.w", int_mips_sld_w, MSA128WOpnd>;
class SLD_D_DESC : MSA_3R_SLD_DESC_BASE<"sld.d", int_mips_sld_d, MSA128DOpnd>;
class SLDI_B_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.b", int_mips_sldi_b,
- MSA128BOpnd>;
+ MSA128BOpnd, MSA128BOpnd, uimm4,
+ immZExt4>;
class SLDI_H_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.h", int_mips_sldi_h,
- MSA128HOpnd>;
+ MSA128HOpnd, MSA128HOpnd, uimm3,
+ immZExt3>;
class SLDI_W_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.w", int_mips_sldi_w,
- MSA128WOpnd>;
+ MSA128WOpnd, MSA128WOpnd, uimm2,
+ immZExt2>;
class SLDI_D_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.d", int_mips_sldi_d,
- MSA128DOpnd>;
+ MSA128DOpnd, MSA128DOpnd, uimm1,
+ immZExt1>;
class SLL_B_DESC : MSA_3R_DESC_BASE<"sll.b", shl, MSA128BOpnd>;
class SLL_H_DESC : MSA_3R_DESC_BASE<"sll.h", shl, MSA128HOpnd>;
@@ -2648,14 +2597,14 @@ class SRAR_H_DESC : MSA_3R_DESC_BASE<"srar.h", int_mips_srar_h, MSA128HOpnd>;
class SRAR_W_DESC : MSA_3R_DESC_BASE<"srar.w", int_mips_srar_w, MSA128WOpnd>;
class SRAR_D_DESC : MSA_3R_DESC_BASE<"srar.d", int_mips_srar_d, MSA128DOpnd>;
-class SRARI_B_DESC : MSA_BIT_B_X_DESC_BASE<"srari.b", int_mips_srari_b,
- MSA128BOpnd>;
-class SRARI_H_DESC : MSA_BIT_H_X_DESC_BASE<"srari.h", int_mips_srari_h,
- MSA128HOpnd>;
-class SRARI_W_DESC : MSA_BIT_W_X_DESC_BASE<"srari.w", int_mips_srari_w,
- MSA128WOpnd>;
-class SRARI_D_DESC : MSA_BIT_D_X_DESC_BASE<"srari.d", int_mips_srari_d,
- MSA128DOpnd>;
+class SRARI_B_DESC : MSA_BIT_X_DESC_BASE<"srari.b", int_mips_srari_b, uimm3,
+ immZExt3, MSA128BOpnd>;
+class SRARI_H_DESC : MSA_BIT_X_DESC_BASE<"srari.h", int_mips_srari_h, uimm4,
+ immZExt4, MSA128HOpnd>;
+class SRARI_W_DESC : MSA_BIT_X_DESC_BASE<"srari.w", int_mips_srari_w, uimm5,
+ immZExt5, MSA128WOpnd>;
+class SRARI_D_DESC : MSA_BIT_X_DESC_BASE<"srari.d", int_mips_srari_d, uimm6,
+ immZExt6, MSA128DOpnd>;
class SRL_B_DESC : MSA_3R_DESC_BASE<"srl.b", srl, MSA128BOpnd>;
class SRL_H_DESC : MSA_3R_DESC_BASE<"srl.h", srl, MSA128HOpnd>;
@@ -2676,14 +2625,14 @@ class SRLR_H_DESC : MSA_3R_DESC_BASE<"srlr.h", int_mips_srlr_h, MSA128HOpnd>;
class SRLR_W_DESC : MSA_3R_DESC_BASE<"srlr.w", int_mips_srlr_w, MSA128WOpnd>;
class SRLR_D_DESC : MSA_3R_DESC_BASE<"srlr.d", int_mips_srlr_d, MSA128DOpnd>;
-class SRLRI_B_DESC : MSA_BIT_B_X_DESC_BASE<"srlri.b", int_mips_srlri_b,
- MSA128BOpnd>;
-class SRLRI_H_DESC : MSA_BIT_H_X_DESC_BASE<"srlri.h", int_mips_srlri_h,
- MSA128HOpnd>;
-class SRLRI_W_DESC : MSA_BIT_W_X_DESC_BASE<"srlri.w", int_mips_srlri_w,
- MSA128WOpnd>;
-class SRLRI_D_DESC : MSA_BIT_D_X_DESC_BASE<"srlri.d", int_mips_srlri_d,
- MSA128DOpnd>;
+class SRLRI_B_DESC : MSA_BIT_X_DESC_BASE<"srlri.b", int_mips_srlri_b, uimm3,
+ immZExt3, MSA128BOpnd>;
+class SRLRI_H_DESC : MSA_BIT_X_DESC_BASE<"srlri.h", int_mips_srlri_h, uimm4,
+ immZExt4, MSA128HOpnd>;
+class SRLRI_W_DESC : MSA_BIT_X_DESC_BASE<"srlri.w", int_mips_srlri_w, uimm5,
+ immZExt5, MSA128WOpnd>;
+class SRLRI_D_DESC : MSA_BIT_X_DESC_BASE<"srlri.d", int_mips_srlri_d, uimm6,
+ immZExt6, MSA128DOpnd>;
class ST_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
ValueType TyNode, RegisterOperand ROWD,
@@ -2991,12 +2940,11 @@ def CLTI_U_D : CLTI_U_D_ENC, CLTI_U_D_DESC;
def COPY_S_B : COPY_S_B_ENC, COPY_S_B_DESC;
def COPY_S_H : COPY_S_H_ENC, COPY_S_H_DESC;
def COPY_S_W : COPY_S_W_ENC, COPY_S_W_DESC;
-def COPY_S_D : COPY_S_D_ENC, COPY_S_D_DESC;
+def COPY_S_D : COPY_S_D_ENC, COPY_S_D_DESC, ASE_MSA64;
def COPY_U_B : COPY_U_B_ENC, COPY_U_B_DESC;
def COPY_U_H : COPY_U_H_ENC, COPY_U_H_DESC;
-def COPY_U_W : COPY_U_W_ENC, COPY_U_W_DESC;
-def COPY_U_D : COPY_U_D_ENC, COPY_U_D_DESC;
+def COPY_U_W : COPY_U_W_ENC, COPY_U_W_DESC, ASE_MSA64;
def COPY_FW_PSEUDO : COPY_FW_PSEUDO_DESC;
def COPY_FD_PSEUDO : COPY_FD_PSEUDO_DESC;
@@ -3108,7 +3056,7 @@ def FFQR_D : FFQR_D_ENC, FFQR_D_DESC;
def FILL_B : FILL_B_ENC, FILL_B_DESC;
def FILL_H : FILL_H_ENC, FILL_H_DESC;
def FILL_W : FILL_W_ENC, FILL_W_DESC;
-def FILL_D : FILL_D_ENC, FILL_D_DESC;
+def FILL_D : FILL_D_ENC, FILL_D_DESC, ASE_MSA64;
def FILL_FW_PSEUDO : FILL_FW_PSEUDO_DESC;
def FILL_FD_PSEUDO : FILL_FD_PSEUDO_DESC;
@@ -3238,7 +3186,7 @@ def ILVR_D : ILVR_D_ENC, ILVR_D_DESC;
def INSERT_B : INSERT_B_ENC, INSERT_B_DESC;
def INSERT_H : INSERT_H_ENC, INSERT_H_DESC;
def INSERT_W : INSERT_W_ENC, INSERT_W_DESC;
-def INSERT_D : INSERT_D_ENC, INSERT_D_DESC;
+def INSERT_D : INSERT_D_ENC, INSERT_D_DESC, ASE_MSA64;
// INSERT_FW_PSEUDO defined after INSVE_W
// INSERT_FD_PSEUDO defined after INSVE_D
@@ -3280,7 +3228,7 @@ def LDI_W : LDI_W_ENC, LDI_W_DESC;
def LDI_D : LDI_D_ENC, LDI_D_DESC;
def LSA : LSA_ENC, LSA_DESC;
-def DLSA : DLSA_ENC, DLSA_DESC;
+def DLSA : DLSA_ENC, DLSA_DESC, ASE_MSA64;
def MADD_Q_H : MADD_Q_H_ENC, MADD_Q_H_DESC;
def MADD_Q_W : MADD_Q_W_ENC, MADD_Q_W_DESC;
@@ -3787,6 +3735,28 @@ def SZ_D_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v2i64,
def SZ_V_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAnyZero, v16i8,
MSA128B, NoItinerary>;
+// Vector extraction with fixed index.
+//
+// Extracting 32-bit values on MSA32 should always use COPY_S_W rather than
+// COPY_U_W, even for the zero-extended case. This is because our forward
+// compatibility strategy is to consider registers to be infinitely
+// sign-extended so that a MIPS64 can execute MIPS32 code without getting
+// different register values.
+def : MSAPat<(vextract_zext_i32 (v4i32 MSA128W:$ws), immZExt2Ptr:$idx),
+ (COPY_S_W MSA128W:$ws, immZExt2:$idx)>, ASE_MSA_NOT_MSA64;
+def : MSAPat<(vextract_zext_i32 (v4f32 MSA128W:$ws), immZExt2Ptr:$idx),
+ (COPY_S_W MSA128W:$ws, immZExt2:$idx)>, ASE_MSA_NOT_MSA64;
+
+// Extracting 64-bit values on MSA64 should always use COPY_S_D rather than
+// COPY_U_D, even for the zero-extended case. This is because our forward
+// compatibility strategy is to consider registers to be infinitely
+// sign-extended so that a hypothetical MIPS128 would be able to execute MIPS64
+// code without getting different register values.
+def : MSAPat<(vextract_zext_i64 (v2i64 MSA128D:$ws), immZExt1Ptr:$idx),
+ (COPY_S_D MSA128D:$ws, immZExt1:$idx)>, ASE_MSA64;
+def : MSAPat<(vextract_zext_i64 (v2f64 MSA128D:$ws), immZExt1Ptr:$idx),
+ (COPY_S_D MSA128D:$ws, immZExt1:$idx)>, ASE_MSA64;
+
// Vector extraction with variable index
def : MSAPat<(i32 (vextract_sext_i8 v16i8:$ws, i32:$idx)),
(SRA (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_B v16i8:$ws,
diff --git a/lib/Target/Mips/MipsMachineFunction.cpp b/lib/Target/Mips/MipsMachineFunction.cpp
index 0d1ee046f0dc..c7d2738af1d4 100644
--- a/lib/Target/Mips/MipsMachineFunction.cpp
+++ b/lib/Target/Mips/MipsMachineFunction.cpp
@@ -24,42 +24,6 @@ static cl::opt<bool>
FixGlobalBaseReg("mips-fix-global-base-reg", cl::Hidden, cl::init(true),
cl::desc("Always use $gp as the global base register."));
-// class MipsCallEntry.
-MipsCallEntry::MipsCallEntry(StringRef N) {
-#ifndef NDEBUG
- Name = N;
- Val = nullptr;
-#endif
-}
-
-MipsCallEntry::MipsCallEntry(const GlobalValue *V) {
-#ifndef NDEBUG
- Val = V;
-#endif
-}
-
-bool MipsCallEntry::isConstant(const MachineFrameInfo *) const {
- return false;
-}
-
-bool MipsCallEntry::isAliased(const MachineFrameInfo *) const {
- return false;
-}
-
-bool MipsCallEntry::mayAlias(const MachineFrameInfo *) const {
- return false;
-}
-
-void MipsCallEntry::printCustom(raw_ostream &O) const {
- O << "MipsCallEntry: ";
-#ifndef NDEBUG
- if (Val)
- O << Val->getName();
- else
- O << Name;
-#endif
-}
-
MipsFunctionInfo::~MipsFunctionInfo() {}
bool MipsFunctionInfo::globalBaseRegSet() const {
@@ -111,27 +75,32 @@ void MipsFunctionInfo::createEhDataRegsFI() {
}
}
+void MipsFunctionInfo::createISRRegFI() {
+ // ISRs require spill slots for Status & ErrorPC Coprocessor 0 registers.
+  // The current implementation only supports Mips32r2+, not Mips64rX. Status
+  // is always 32 bits; ErrorPC is 32 or 64 bits dependent on the architecture,
+  // however Mips32r2+ is the supported architecture.
+ const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+
+ for (int I = 0; I < 2; ++I)
+ ISRDataRegFI[I] = MF.getFrameInfo()->CreateStackObject(
+ RC->getSize(), RC->getAlignment(), false);
+}
+
bool MipsFunctionInfo::isEhDataRegFI(int FI) const {
return CallsEhReturn && (FI == EhDataRegFI[0] || FI == EhDataRegFI[1]
|| FI == EhDataRegFI[2] || FI == EhDataRegFI[3]);
}
-MachinePointerInfo MipsFunctionInfo::callPtrInfo(StringRef Name) {
- std::unique_ptr<const MipsCallEntry> &E = ExternalCallEntries[Name];
-
- if (!E)
- E = llvm::make_unique<MipsCallEntry>(Name);
-
- return MachinePointerInfo(E.get());
+bool MipsFunctionInfo::isISRRegFI(int FI) const {
+ return IsISR && (FI == ISRDataRegFI[0] || FI == ISRDataRegFI[1]);
+}
+MachinePointerInfo MipsFunctionInfo::callPtrInfo(const char *ES) {
+ return MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES));
}
-MachinePointerInfo MipsFunctionInfo::callPtrInfo(const GlobalValue *Val) {
- std::unique_ptr<const MipsCallEntry> &E = GlobalCallEntries[Val];
-
- if (!E)
- E = llvm::make_unique<MipsCallEntry>(Val);
-
- return MachinePointerInfo(E.get());
+MachinePointerInfo MipsFunctionInfo::callPtrInfo(const GlobalValue *GV) {
+ return MachinePointerInfo(MF.getPSVManager().getGlobalValueCallEntry(GV));
}
int MipsFunctionInfo::getMoveF64ViaSpillFI(const TargetRegisterClass *RC) {
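
With MipsCallEntry gone, callPtrInfo() now returns a MachinePointerInfo wrapping the pseudo-source values managed centrally by the MachineFunction. A small sketch of how a lowering routine could consume it; MF and GV stand for a MachineFunction and the callee's GlobalValue assumed to be in scope, and only callPtrInfo() and getMachineMemOperand() are taken from the surrounding code:

    // Illustrative only: describe the GOT slot feeding an indirect call so
    // that scheduling and alias analysis can reason about the load.
    MipsFunctionInfo *FI = MF.getInfo<MipsFunctionInfo>();
    MachinePointerInfo MPI = FI->callPtrInfo(GV);
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        MPI, MachineMemOperand::MOLoad, /*Size=*/4, /*Alignment=*/4);
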
diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h
index 32436efa2eda..a2f6ee03604f 100644
--- a/lib/Target/Mips/MipsMachineFunction.h
+++ b/lib/Target/Mips/MipsMachineFunction.h
@@ -15,12 +15,10 @@
#define LLVM_LIB_TARGET_MIPS_MIPSMACHINEFUNCTION_H
#include "Mips16HardFloatInfo.h"
-#include "llvm/ADT/StringMap.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
-#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
@@ -30,31 +28,13 @@
namespace llvm {
-/// \brief A class derived from PseudoSourceValue that represents a GOT entry
-/// resolved by lazy-binding.
-class MipsCallEntry : public PseudoSourceValue {
-public:
- explicit MipsCallEntry(StringRef N);
- explicit MipsCallEntry(const GlobalValue *V);
- bool isConstant(const MachineFrameInfo *) const override;
- bool isAliased(const MachineFrameInfo *) const override;
- bool mayAlias(const MachineFrameInfo *) const override;
-
-private:
- void printCustom(raw_ostream &O) const override;
-#ifndef NDEBUG
- std::string Name;
- const GlobalValue *Val;
-#endif
-};
-
/// MipsFunctionInfo - This class is derived from MachineFunction private
/// Mips target-specific information for each MachineFunction.
class MipsFunctionInfo : public MachineFunctionInfo {
public:
MipsFunctionInfo(MachineFunction &MF)
: MF(MF), SRetReturnReg(0), GlobalBaseReg(0), Mips16SPAliasReg(0),
- VarArgsFrameIndex(0), CallsEhReturn(false), SaveS2(false),
+ VarArgsFrameIndex(0), CallsEhReturn(false), IsISR(false), SaveS2(false),
MoveF64ViaSpillFI(-1) {}
~MipsFunctionInfo();
@@ -86,13 +66,21 @@ public:
int getEhDataRegFI(unsigned Reg) const { return EhDataRegFI[Reg]; }
bool isEhDataRegFI(int FI) const;
- /// \brief Create a MachinePointerInfo that has a MipsCallEntr object
- /// representing a GOT entry for an external function.
- MachinePointerInfo callPtrInfo(StringRef Name);
+ /// Create a MachinePointerInfo that has an ExternalSymbolPseudoSourceValue
+ /// object representing a GOT entry for an external function.
+ MachinePointerInfo callPtrInfo(const char *ES);
+
+ // Functions with the "interrupt" attribute require special prologues,
+ // epilogues and additional spill slots.
+ bool isISR() const { return IsISR; }
+ void setISR() { IsISR = true; }
+ void createISRRegFI();
+ int getISRRegFI(unsigned Reg) const { return ISRDataRegFI[Reg]; }
+ bool isISRRegFI(int FI) const;
- /// \brief Create a MachinePointerInfo that has a MipsCallEntr object
+ /// Create a MachinePointerInfo that has a GlobalValuePseudoSourceValue object
/// representing a GOT entry for a global function.
- MachinePointerInfo callPtrInfo(const GlobalValue *Val);
+ MachinePointerInfo callPtrInfo(const GlobalValue *GV);
void setSaveS2() { SaveS2 = true; }
bool hasSaveS2() const { return SaveS2; }
@@ -136,17 +124,18 @@ private:
/// Frame objects for spilling eh data registers.
int EhDataRegFI[4];
+ /// ISR - Whether the function is an Interrupt Service Routine.
+ bool IsISR;
+
+ /// Frame objects for spilling C0_STATUS, C0_EPC
+ int ISRDataRegFI[2];
+
// saveS2
bool SaveS2;
/// FrameIndex for expanding BuildPairF64 nodes to spill and reload when the
/// O32 FPXX ABI is enabled. -1 is used to denote invalid index.
int MoveF64ViaSpillFI;
-
- /// MipsCallEntry maps.
- StringMap<std::unique_ptr<const MipsCallEntry>> ExternalCallEntries;
- ValueMap<const GlobalValue *, std::unique_ptr<const MipsCallEntry>>
- GlobalCallEntries;
};
} // end of namespace llvm
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index f6647e6a8468..28e5a425849f 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -84,6 +84,16 @@ MipsRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
const MCPhysReg *
MipsRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const MipsSubtarget &Subtarget = MF->getSubtarget<MipsSubtarget>();
+ const Function *F = MF->getFunction();
+ if (F->hasFnAttribute("interrupt")) {
+ if (Subtarget.hasMips64())
+ return Subtarget.hasMips64r6() ? CSR_Interrupt_64R6_SaveList
+ : CSR_Interrupt_64_SaveList;
+ else
+ return Subtarget.hasMips32r6() ? CSR_Interrupt_32R6_SaveList
+ : CSR_Interrupt_32_SaveList;
+ }
+
if (Subtarget.isSingleFloat())
return CSR_SingleFloatOnly_SaveList;
@@ -284,6 +294,16 @@ getFrameRegister(const MachineFunction &MF) const {
}
bool MipsRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+ // Avoid realigning functions that explicitly do not want to be realigned.
+ // Normally, we should report an error when a function should be dynamically
+ // realigned but also has the attribute no-realign-stack. Unfortunately,
+ // with this attribute, MachineFrameInfo clamps each new object's alignment
+ // to that of the stack's alignment as specified by the ABI. As a result,
+ // the information of whether we have objects with larger alignment
+ // requirement than the stack's alignment is already lost at this point.
+ if (!TargetRegisterInfo::canRealignStack(MF))
+ return false;
+
const MipsSubtarget &Subtarget = MF.getSubtarget<MipsSubtarget>();
unsigned FP = Subtarget.isGP32bit() ? Mips::FP : Mips::FP_64;
unsigned BP = Subtarget.isGP32bit() ? Mips::S7 : Mips::S7_64;
@@ -306,42 +326,3 @@ bool MipsRegisterInfo::canRealignStack(const MachineFunction &MF) const {
// sized objects.
return MF.getRegInfo().canReserveReg(BP);
}
-
-bool MipsRegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
- const MipsSubtarget &Subtarget = MF.getSubtarget<MipsSubtarget>();
- const MachineFrameInfo *MFI = MF.getFrameInfo();
-
- bool CanRealign = canRealignStack(MF);
-
- // Avoid realigning functions that explicitly do not want to be realigned.
- // Normally, we should report an error when a function should be dynamically
- // realigned but also has the attribute no-realign-stack. Unfortunately,
- // with this attribute, MachineFrameInfo clamps each new object's alignment
- // to that of the stack's alignment as specified by the ABI. As a result,
- // the information of whether we have objects with larger alignment
- // requirement than the stack's alignment is already lost at this point.
- if (MF.getFunction()->hasFnAttribute("no-realign-stack"))
- return false;
-
- const Function *F = MF.getFunction();
- if (F->hasFnAttribute(Attribute::StackAlignment)) {
-#ifdef DEBUG
- if (!CanRealign)
- DEBUG(dbgs() << "It's not possible to realign the stack of the function: "
- << F->getName() << "\n");
-#endif
- return CanRealign;
- }
-
- unsigned StackAlignment = Subtarget.getFrameLowering()->getStackAlignment();
- if (MFI->getMaxAlignment() > StackAlignment) {
-#ifdef DEBUG
- if (!CanRealign)
- DEBUG(dbgs() << "It's not possible to realign the stack of the function: "
- << F->getName() << "\n");
-#endif
- return CanRealign;
- }
-
- return false;
-}
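
The override switch above drops the Mips-specific needsStackRealignment() and keeps only canRealignStack(), so the realignment decision now flows through the common TargetRegisterInfo query. Roughly, and stated as an assumption about the generic code rather than text from this diff, that query behaves like the following sketch:

    // Sketch of the common-code decision this patch now relies on.
    const MachineFrameInfo *MFI = MF.getFrameInfo();
    const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
    bool RequiresRealign =
        MFI->getMaxAlignment() > TFI->getStackAlignment() ||
        MF.getFunction()->hasFnAttribute(Attribute::StackAlignment);
    bool WillRealign = RequiresRealign && TRI->canRealignStack(MF);
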
diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h
index ee1f6bcd7390..5de68a21b73e 100644
--- a/lib/Target/Mips/MipsRegisterInfo.h
+++ b/lib/Target/Mips/MipsRegisterInfo.h
@@ -61,9 +61,7 @@ public:
RegScavenger *RS = nullptr) const;
// Stack realignment queries.
- bool canRealignStack(const MachineFunction &MF) const;
-
- bool needsStackRealignment(const MachineFunction &MF) const override;
+ bool canRealignStack(const MachineFunction &MF) const override;
/// Debug information queries.
unsigned getFrameRegister(const MachineFunction &MF) const override;
diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp
index 096b3bee5d07..a4abd62ee607 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -17,6 +17,7 @@
#include "MipsMachineFunction.h"
#include "MipsSEInstrInfo.h"
#include "MipsSubtarget.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -319,6 +320,15 @@ bool ExpandPseudo::expandBuildPairF64(MachineBasicBlock &MBB,
bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
bool FP64) const {
+ const MachineOperand &Op1 = I->getOperand(1);
+ const MachineOperand &Op2 = I->getOperand(2);
+
+ if ((Op1.isReg() && Op1.isUndef()) || (Op2.isReg() && Op2.isUndef())) {
+ unsigned DstReg = I->getOperand(0).getReg();
+ BuildMI(MBB, I, I->getDebugLoc(), TII.get(Mips::IMPLICIT_DEF), DstReg);
+ return true;
+ }
+
// For fpxx and when mfhc1 is not available, use:
// spill + reload via ldc1
//
@@ -335,8 +345,8 @@ bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB,
if ((Subtarget.isABI_FPXX() && !Subtarget.hasMTHC1()) ||
(FP64 && !Subtarget.useOddSPReg())) {
unsigned DstReg = I->getOperand(0).getReg();
- unsigned SrcReg = I->getOperand(1).getReg();
- unsigned N = I->getOperand(2).getImm();
+ unsigned SrcReg = Op1.getReg();
+ unsigned N = Op2.getImm();
int64_t Offset = 4 * (Subtarget.isLittle() ? N : (1 - N));
// It should be impossible to have FGR64 on MIPS-II or MIPS32r1 (which are
@@ -352,8 +362,7 @@ bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB,
// We re-use the same spill slot each time so that the stack frame doesn't
// grow too much in functions with a large number of moves.
int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(RC);
- TII.storeRegToStack(MBB, I, SrcReg, I->getOperand(1).isKill(), FI, RC,
- &RegInfo, 0);
+ TII.storeRegToStack(MBB, I, SrcReg, Op1.isKill(), FI, RC, &RegInfo, 0);
TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &RegInfo, Offset);
return true;
}
@@ -376,12 +385,12 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
*static_cast<const MipsRegisterInfo *>(STI.getRegisterInfo());
MachineBasicBlock::iterator MBBI = MBB.begin();
- DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+ DebugLoc dl;
MipsABIInfo ABI = STI.getABI();
unsigned SP = ABI.GetStackPtr();
unsigned FP = ABI.GetFramePtr();
unsigned ZERO = ABI.GetNullPtr();
- unsigned ADDu = ABI.GetPtrAdduOp();
+ unsigned MOVE = ABI.GetGPRMoveOp();
unsigned ADDiu = ABI.GetPtrAddiuOp();
unsigned AND = ABI.IsN64() ? Mips::AND64 : Mips::AND;
@@ -407,6 +416,9 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
+ if (MF.getFunction()->hasFnAttribute("interrupt"))
+ emitInterruptPrologueStub(MF, MBB);
+
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
if (CSI.size()) {
@@ -491,7 +503,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
// if framepointer enabled, set it to point to the stack pointer.
if (hasFP(MF)) {
// Insert instruction "move $fp, $sp" at this location.
- BuildMI(MBB, MBBI, dl, TII.get(ADDu), FP).addReg(SP).addReg(ZERO)
+ BuildMI(MBB, MBBI, dl, TII.get(MOVE), FP).addReg(SP).addReg(ZERO)
.setMIFlag(MachineInstr::FrameSetup);
// emit ".cfi_def_cfa_register $fp"
@@ -514,7 +526,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
if (hasBP(MF)) {
// move $s7, $sp
unsigned BP = STI.isABI_N64() ? Mips::S7_64 : Mips::S7;
- BuildMI(MBB, MBBI, dl, TII.get(ADDu), BP)
+ BuildMI(MBB, MBBI, dl, TII.get(MOVE), BP)
.addReg(SP)
.addReg(ZERO);
}
@@ -522,6 +534,135 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
}
}
+void MipsSEFrameLowering::emitInterruptPrologueStub(
+ MachineFunction &MF, MachineBasicBlock &MBB) const {
+
+ MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+  // Report an error if the target doesn't support Mips32r2 or later.
+  // The epilogue relies on the use of the "ehb" to clear execution
+  // hazards. Pre-R2 Mips relies on an implementation-defined number
+  // of "ssnop"s to clear the execution hazard. Support for ssnop hazard
+  // clearing is not provided, so reject that configuration.
+ if (!STI.hasMips32r2())
+ report_fatal_error(
+ "\"interrupt\" attribute is not supported on pre-MIPS32R2 or "
+ "MIPS16 targets.");
+
+  // The GP register contains the "user" value, so we cannot perform
+  // any gp-relative loads until we restore the "kernel" or "system" gp
+  // value. Until such support is written, we only accept the static
+  // relocation model.
+ if ((STI.getRelocationModel() != Reloc::Static))
+ report_fatal_error("\"interrupt\" attribute is only supported for the "
+ "static relocation model on MIPS at the present time.");
+
+ if (!STI.isABI_O32() || STI.hasMips64())
+ report_fatal_error("\"interrupt\" attribute is only supported for the "
+ "O32 ABI on MIPS32R2+ at the present time.");
+
+ // Perform ISR handling like GCC
+ StringRef IntKind =
+ MF.getFunction()->getFnAttribute("interrupt").getValueAsString();
+ const TargetRegisterClass *PtrRC = &Mips::GPR32RegClass;
+
+ // EIC interrupt handling needs to read the Cause register to disable
+ // interrupts.
+ if (IntKind == "eic") {
+ // Coprocessor registers are always live per se.
+ MBB.addLiveIn(Mips::COP013);
+ BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MFC0), Mips::K0)
+ .addReg(Mips::COP013)
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::EXT), Mips::K0)
+ .addReg(Mips::K0)
+ .addImm(10)
+ .addImm(6)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // Fetch and spill EPC
+ MBB.addLiveIn(Mips::COP014);
+ BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MFC0), Mips::K1)
+ .addReg(Mips::COP014)
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ STI.getInstrInfo()->storeRegToStack(MBB, MBBI, Mips::K1, false,
+ MipsFI->getISRRegFI(0), PtrRC,
+ STI.getRegisterInfo(), 0);
+
+ // Fetch and Spill Status
+ MBB.addLiveIn(Mips::COP012);
+ BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MFC0), Mips::K1)
+ .addReg(Mips::COP012)
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ STI.getInstrInfo()->storeRegToStack(MBB, MBBI, Mips::K1, false,
+ MipsFI->getISRRegFI(1), PtrRC,
+ STI.getRegisterInfo(), 0);
+
+ // Build the configuration for disabling lower-priority interrupts. Non-EIC
+ // interrupts are masked off with zero; for EIC the mask comes from the
+ // Cause register (see the plain-C++ sketch after this function).
+ unsigned InsPosition = 8;
+ unsigned InsSize = 0;
+ unsigned SrcReg = Mips::ZERO;
+
+ // If the interrupt we're tied to is the EIC, switch the source for masking
+ // off interrupts to the Cause register.
+ if (IntKind == "eic") {
+ SrcReg = Mips::K0;
+ InsPosition = 10;
+ InsSize = 6;
+ } else
+ InsSize = StringSwitch<unsigned>(IntKind)
+ .Case("sw0", 1)
+ .Case("sw1", 2)
+ .Case("hw0", 3)
+ .Case("hw1", 4)
+ .Case("hw2", 5)
+ .Case("hw3", 6)
+ .Case("hw4", 7)
+ .Case("hw5", 8)
+ .Default(0);
+ assert(InsSize != 0 && "Unknown interrupt type!");
+
+ BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::INS), Mips::K1)
+ .addReg(SrcReg)
+ .addImm(InsPosition)
+ .addImm(InsSize)
+ .addReg(Mips::K1)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // Mask off KSU, ERL, EXL
+ BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::INS), Mips::K1)
+ .addReg(Mips::ZERO)
+ .addImm(1)
+ .addImm(4)
+ .addReg(Mips::K1)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // Disable the FPU as we are not spilling those register sets.
+ if (!STI.useSoftFloat())
+ BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::INS), Mips::K1)
+ .addReg(Mips::ZERO)
+ .addImm(29)
+ .addImm(1)
+ .addReg(Mips::K1)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // Set the new status
+ BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MTC0), Mips::COP012)
+ .addReg(Mips::K1)
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameSetup);
+}
+
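For readers following the MFC0/EXT/INS sequence above, here is a minimal plain-C++ sketch (not part of the patch) of the Status value the "eic" path builds before the final mtc0. The helper name is invented; the bit positions are the MIPS32 Status/Cause field layouts the code assumes: Cause[RIPL] and Status[IPL] at bits 15:10, EXL/ERL/KSU at bits 4:1, and CU1 at bit 29.

#include <cstdint>

// Sketch only: mirrors the ext/ins/mtc0 sequence emitted above for an "eic"
// handler; the function name and parameters are illustrative assumptions.
static uint32_t computeIsrStatus(uint32_t Status, uint32_t Cause, bool HardFloat) {
  uint32_t RIPL = (Cause >> 10) & 0x3f;              // mfc0 $k0, $13; ext $k0, $k0, 10, 6
  Status = (Status & ~(0x3fu << 10)) | (RIPL << 10); // ins $k1, $k0, 10, 6
  Status &= ~(0xfu << 1);                            // ins $k1, $zero, 1, 4  (KSU/ERL/EXL)
  if (HardFloat)
    Status &= ~(1u << 29);                           // ins $k1, $zero, 29, 1 (clear CU1)
  return Status;                                     // mtc0 $k1, $12
}

For the non-EIC kinds, the first ins instead writes zeroes of the StringSwitch-selected width starting at bit 8, masking the corresponding IM bits.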
void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
@@ -533,12 +674,12 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
const MipsRegisterInfo &RegInfo =
*static_cast<const MipsRegisterInfo *>(STI.getRegisterInfo());
- DebugLoc dl = MBBI->getDebugLoc();
+ DebugLoc DL = MBBI->getDebugLoc();
MipsABIInfo ABI = STI.getABI();
unsigned SP = ABI.GetStackPtr();
unsigned FP = ABI.GetFramePtr();
unsigned ZERO = ABI.GetNullPtr();
- unsigned ADDu = ABI.GetPtrAdduOp();
+ unsigned MOVE = ABI.GetGPRMoveOp();
// if framepointer enabled, restore the stack pointer.
if (hasFP(MF)) {
@@ -549,7 +690,7 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
--I;
// Insert instruction "move $sp, $fp" at this location.
- BuildMI(MBB, I, dl, TII.get(ADDu), SP).addReg(FP).addReg(ZERO);
+ BuildMI(MBB, I, DL, TII.get(MOVE), SP).addReg(FP).addReg(ZERO);
}
if (MipsFI->callsEhReturn()) {
@@ -568,6 +709,9 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
}
}
+ if (MF.getFunction()->hasFnAttribute("interrupt"))
+ emitInterruptEpilogueStub(MF, MBB);
+
// Get the number of bytes from FrameInfo
uint64_t StackSize = MFI->getStackSize();
@@ -578,13 +722,59 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
TII.adjustStackPtr(SP, StackSize, MBB, MBBI);
}
+void MipsSEFrameLowering::emitInterruptEpilogueStub(
+ MachineFunction &MF, MachineBasicBlock &MBB) const {
+
+ MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+ MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+ DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+ // Perform ISR handling like GCC
+ const TargetRegisterClass *PtrRC = &Mips::GPR32RegClass;
+
+ // Disable Interrupts.
+ BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::DI), Mips::ZERO);
+ BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::EHB));
+
+ // Restore EPC
+ STI.getInstrInfo()->loadRegFromStackSlot(MBB, MBBI, Mips::K1,
+ MipsFI->getISRRegFI(0), PtrRC,
+ STI.getRegisterInfo());
+ BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MTC0), Mips::COP014)
+ .addReg(Mips::K1)
+ .addImm(0);
+
+ // Restore Status
+ STI.getInstrInfo()->loadRegFromStackSlot(MBB, MBBI, Mips::K1,
+ MipsFI->getISRRegFI(1), PtrRC,
+ STI.getRegisterInfo());
+ BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MTC0), Mips::COP012)
+ .addReg(Mips::K1)
+ .addImm(0);
+}
+
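To make the attribute concrete, here is an illustrative handler that would receive both stubs, assuming a frontend that lowers a GCC-style MIPS interrupt attribute to the IR function attribute "interrupt" carrying one of the kind strings handled above ("eic", "sw0", "sw1", "hw0"-"hw5"). The attribute spelling and the variable are assumptions; the backend itself only inspects the IR attribute.

// Illustrative only: built for a MIPS32R2, O32, static-relocation target, this
// function gets the prologue/epilogue stubs above (EPC and Status saved and
// restored, lower-priority interrupts masked, return via eret).
extern volatile unsigned TickCount;

__attribute__((interrupt("hw0"))) void TimerISR(void) {
  ++TickCount; // ordinary body; all ISR bookkeeping is emitted by the backend
}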
+int MipsSEFrameLowering::getFrameIndexReference(const MachineFunction &MF,
+ int FI,
+ unsigned &FrameReg) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ MipsABIInfo ABI = STI.getABI();
+
+ if (MFI->isFixedObjectIndex(FI))
+ FrameReg = hasFP(MF) ? ABI.GetFramePtr() : ABI.GetStackPtr();
+ else
+ FrameReg = hasBP(MF) ? ABI.GetBasePtr() : ABI.GetStackPtr();
+
+ return MFI->getObjectOffset(FI) + MFI->getStackSize() -
+ getOffsetOfLocalArea() + MFI->getOffsetAdjustment();
+}
+
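As a worked example of the offset formula just added (numbers illustrative, and assuming getOffsetOfLocalArea() of 0 on Mips): an object at MFI offset -4 in a frame with StackSize 40 and OffsetAdjustment 0 resolves to 36 bytes above the selected frame register. In the simple case, where the prologue materializes $fp (and $s7) with a move from $sp, the numeric offset is the same whichever register is returned in FrameReg.

// Illustrative arithmetic mirroring the return expression above:
// ObjectOffset + StackSize - LocalAreaOffset + OffsetAdjustment
int Offset = -4 + 40 - 0 + 0; // == 36, relative to FrameReg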
bool MipsSEFrameLowering::
spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const {
MachineFunction *MF = MBB.getParent();
- MachineBasicBlock *EntryBlock = MF->begin();
+ MachineBasicBlock *EntryBlock = &MF->front();
const TargetInstrInfo &TII = *STI.getInstrInfo();
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
@@ -599,6 +789,26 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
if (!IsRAAndRetAddrIsTaken)
EntryBlock->addLiveIn(Reg);
+ // ISRs require HI/LO to be copied into kernel registers and then spilled
+ // to the stack frame.
+ bool IsLOHI = (Reg == Mips::LO0 || Reg == Mips::LO0_64 ||
+ Reg == Mips::HI0 || Reg == Mips::HI0_64);
+ const Function *Func = MBB.getParent()->getFunction();
+ if (IsLOHI && Func->hasFnAttribute("interrupt")) {
+ DebugLoc DL = MI->getDebugLoc();
+
+ unsigned Op = 0;
+ if (!STI.getABI().ArePtrs64bit()) {
+ Op = (Reg == Mips::HI0) ? Mips::MFHI : Mips::MFLO;
+ Reg = Mips::K0;
+ } else {
+ Op = (Reg == Mips::HI0) ? Mips::MFHI64 : Mips::MFLO64;
+ Reg = Mips::K0_64;
+ }
+ BuildMI(MBB, MI, DL, TII.get(Op), Mips::K0)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
// Insert the spill to the stack frame.
bool IsKill = !IsRAAndRetAddrIsTaken;
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
@@ -622,7 +832,8 @@ MipsSEFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
}
/// Mark \p Reg and all registers aliasing it in the bitset.
-void setAliasRegs(MachineFunction &MF, BitVector &SavedRegs, unsigned Reg) {
+static void setAliasRegs(MachineFunction &MF, BitVector &SavedRegs,
+ unsigned Reg) {
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
SavedRegs.set(*AI);
@@ -648,6 +859,10 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (MipsFI->callsEhReturn())
MipsFI->createEhDataRegsFI();
+ // Create spill slots for Coprocessor 0 registers if function is an ISR.
+ if (MipsFI->isISR())
+ MipsFI->createISRRegFI();
+
// Expand pseudo instructions which load, store or copy accumulators.
// Add an emergency spill slot if a pseudo was expanded.
if (ExpandPseudo(MF).expand()) {
diff --git a/lib/Target/Mips/MipsSEFrameLowering.h b/lib/Target/Mips/MipsSEFrameLowering.h
index 9cb32e6c7829..63cd3cebc56a 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.h
+++ b/lib/Target/Mips/MipsSEFrameLowering.h
@@ -27,6 +27,9 @@ public:
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const override;
+
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI,
@@ -37,8 +40,13 @@ public:
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS) const override;
unsigned ehDataReg(unsigned I) const;
-};
+private:
+ void emitInterruptEpilogueStub(MachineFunction &MF,
+ MachineBasicBlock &MBB) const;
+ void emitInterruptPrologueStub(MachineFunction &MF,
+ MachineBasicBlock &MBB) const;
+};
} // End llvm namespace
#endif
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index 2ebfbd17d7d0..6f001ea74b30 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -136,7 +136,7 @@ void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
MachineBasicBlock::iterator I = MBB.begin();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
- DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+ DebugLoc DL;
unsigned V0, V1, GlobalBaseReg = MipsFI->getGlobalBaseReg();
const TargetRegisterClass *RC;
const MipsABIInfo &ABI = static_cast<const MipsTargetMachine &>(TM).getABI();
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index b319fd07884b..efe22fba98ce 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -1181,6 +1181,10 @@ bool MipsSETargetLowering::isEligibleForTailCallOptimization(
if (!EnableMipsTailCalls)
return false;
+ // Exception has to be cleared with eret.
+ if (FI.isISR())
+ return false;
+
// Return false if either the callee or caller has a byval argument.
if (CCInfo.getInRegsParamsCount() > 0 || FI.hasByvalArg())
return false;
@@ -1786,9 +1790,11 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(ISD::UDIV, DL, Op->getValueType(0), Op->getOperand(1),
Op->getOperand(2));
case Intrinsic::mips_fadd_w:
- case Intrinsic::mips_fadd_d:
+ case Intrinsic::mips_fadd_d: {
+ // TODO: If intrinsics have fast-math-flags, propagate them.
return DAG.getNode(ISD::FADD, DL, Op->getValueType(0), Op->getOperand(1),
Op->getOperand(2));
+ }
// Don't lower mips_fcaf_[wd] since LLVM folds SETFALSE condcodes away
case Intrinsic::mips_fceq_w:
case Intrinsic::mips_fceq_d:
@@ -1831,9 +1837,11 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
Op->getOperand(2), ISD::SETUNE);
case Intrinsic::mips_fdiv_w:
- case Intrinsic::mips_fdiv_d:
+ case Intrinsic::mips_fdiv_d: {
+ // TODO: If intrinsics have fast-math-flags, propagate them.
return DAG.getNode(ISD::FDIV, DL, Op->getValueType(0), Op->getOperand(1),
Op->getOperand(2));
+ }
case Intrinsic::mips_ffint_u_w:
case Intrinsic::mips_ffint_u_d:
return DAG.getNode(ISD::UINT_TO_FP, DL, Op->getValueType(0),
@@ -1856,6 +1864,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
}
case Intrinsic::mips_fexp2_w:
case Intrinsic::mips_fexp2_d: {
+ // TODO: If intrinsics have fast-math-flags, propagate them.
EVT ResTy = Op->getValueType(0);
return DAG.getNode(
ISD::FMUL, SDLoc(Op), ResTy, Op->getOperand(1),
@@ -1869,11 +1878,14 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(ISD::FMA, SDLoc(Op), Op->getValueType(0),
Op->getOperand(1), Op->getOperand(2), Op->getOperand(3));
case Intrinsic::mips_fmul_w:
- case Intrinsic::mips_fmul_d:
+ case Intrinsic::mips_fmul_d: {
+ // TODO: If intrinsics have fast-math-flags, propagate them.
return DAG.getNode(ISD::FMUL, DL, Op->getValueType(0), Op->getOperand(1),
Op->getOperand(2));
+ }
case Intrinsic::mips_fmsub_w:
case Intrinsic::mips_fmsub_d: {
+ // TODO: If intrinsics have fast-math-flags, propagate them.
EVT ResTy = Op->getValueType(0);
return DAG.getNode(ISD::FSUB, SDLoc(Op), ResTy, Op->getOperand(1),
DAG.getNode(ISD::FMUL, SDLoc(Op), ResTy,
@@ -1886,9 +1898,11 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::mips_fsqrt_d:
return DAG.getNode(ISD::FSQRT, DL, Op->getValueType(0), Op->getOperand(1));
case Intrinsic::mips_fsub_w:
- case Intrinsic::mips_fsub_d:
+ case Intrinsic::mips_fsub_d: {
+ // TODO: If intrinsics have fast-math-flags, propagate them.
return DAG.getNode(ISD::FSUB, DL, Op->getValueType(0), Op->getOperand(1),
Op->getOperand(2));
+ }
case Intrinsic::mips_ftrunc_u_w:
case Intrinsic::mips_ftrunc_u_d:
return DAG.getNode(ISD::FP_TO_UINT, DL, Op->getValueType(0),
diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp
index 786307b95f88..e6f7fe9aae1d 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -88,7 +88,7 @@ void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (isMicroMips)
Opc = Mips::MOVE16_MM;
else
- Opc = Mips::ADDu, ZeroReg = Mips::ZERO;
+ Opc = Mips::OR, ZeroReg = Mips::ZERO;
} else if (Mips::CCRRegClass.contains(SrcReg))
Opc = Mips::CFC1;
else if (Mips::FGR32RegClass.contains(SrcReg))
@@ -141,7 +141,7 @@ void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Opc = Mips::FMOV_D64;
else if (Mips::GPR64RegClass.contains(DestReg)) { // Copy to CPU64 Reg.
if (Mips::GPR64RegClass.contains(SrcReg))
- Opc = Mips::DADDu, ZeroReg = Mips::ZERO_64;
+ Opc = Mips::OR64, ZeroReg = Mips::ZERO_64;
else if (Mips::HI64RegClass.contains(SrcReg))
Opc = Mips::MFHI64, SrcReg = 0;
else if (Mips::LO64RegClass.contains(SrcReg))
@@ -182,7 +182,6 @@ storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
int64_t Offset) const {
DebugLoc DL;
- if (I != MBB.end()) DL = I->getDebugLoc();
MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOStore);
unsigned Opc = 0;
@@ -213,6 +212,33 @@ storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
Opc = Mips::ST_W;
else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64))
Opc = Mips::ST_D;
+ else if (Mips::LO32RegClass.hasSubClassEq(RC))
+ Opc = Mips::SW;
+ else if (Mips::LO64RegClass.hasSubClassEq(RC))
+ Opc = Mips::SD;
+ else if (Mips::HI32RegClass.hasSubClassEq(RC))
+ Opc = Mips::SW;
+ else if (Mips::HI64RegClass.hasSubClassEq(RC))
+ Opc = Mips::SD;
+
+ // HI and LO are normally caller-saved, but they are callee-saved
+ // for interrupt handling.
+ const Function *Func = MBB.getParent()->getFunction();
+ if (Func->hasFnAttribute("interrupt")) {
+ if (Mips::HI32RegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(Mips::MFHI), Mips::K0);
+ SrcReg = Mips::K0;
+ } else if (Mips::HI64RegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(Mips::MFHI64), Mips::K0_64);
+ SrcReg = Mips::K0_64;
+ } else if (Mips::LO32RegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(Mips::MFLO), Mips::K0);
+ SrcReg = Mips::K0;
+ } else if (Mips::LO64RegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(Mips::MFLO64), Mips::K0_64);
+ SrcReg = Mips::K0_64;
+ }
+ }
assert(Opc && "Register class not handled!");
BuildMI(MBB, I, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill))
@@ -228,6 +254,11 @@ loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad);
unsigned Opc = 0;
+ const Function *Func = MBB.getParent()->getFunction();
+ bool ReqIndirectLoad = Func->hasFnAttribute("interrupt") &&
+ (DestReg == Mips::LO0 || DestReg == Mips::LO0_64 ||
+ DestReg == Mips::HI0 || DestReg == Mips::HI0_64);
+
if (Mips::GPR32RegClass.hasSubClassEq(RC))
Opc = Mips::LW;
else if (Mips::GPR64RegClass.hasSubClassEq(RC))
@@ -254,10 +285,44 @@ loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
Opc = Mips::LD_W;
else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64))
Opc = Mips::LD_D;
+ else if (Mips::HI32RegClass.hasSubClassEq(RC))
+ Opc = Mips::LW;
+ else if (Mips::HI64RegClass.hasSubClassEq(RC))
+ Opc = Mips::LD;
+ else if (Mips::LO32RegClass.hasSubClassEq(RC))
+ Opc = Mips::LW;
+ else if (Mips::LO64RegClass.hasSubClassEq(RC))
+ Opc = Mips::LD;
assert(Opc && "Register class not handled!");
- BuildMI(MBB, I, DL, get(Opc), DestReg).addFrameIndex(FI).addImm(Offset)
- .addMemOperand(MMO);
+
+ if (!ReqIndirectLoad)
+ BuildMI(MBB, I, DL, get(Opc), DestReg)
+ .addFrameIndex(FI)
+ .addImm(Offset)
+ .addMemOperand(MMO);
+ else {
+ // Load HI/LO through K0. Note that the HI/LO destination is implied by
+ // the MTHI/MTLO opcode rather than encoded as an operand.
+ unsigned Reg = Mips::K0;
+ unsigned LdOp = Mips::MTLO;
+ if (DestReg == Mips::HI0)
+ LdOp = Mips::MTHI;
+
+ if (Subtarget.getABI().ArePtrs64bit()) {
+ Reg = Mips::K0_64;
+ if (DestReg == Mips::HI0_64)
+ LdOp = Mips::MTHI64;
+ else
+ LdOp = Mips::MTLO64;
+ }
+
+ BuildMI(MBB, I, DL, get(Opc), Reg)
+ .addFrameIndex(FI)
+ .addImm(Offset)
+ .addMemOperand(MMO);
+ BuildMI(MBB, I, DL, get(LdOp)).addReg(Reg);
+ }
}
bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
@@ -271,6 +336,9 @@ bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
case Mips::RetRA:
expandRetRA(MBB, MI);
break;
+ case Mips::ERet:
+ expandERet(MBB, MI);
+ break;
case Mips::PseudoMFHI:
Opc = isMicroMips ? Mips::MFHI16_MM : Mips::MFHI;
expandPseudoMFHiLo(MBB, MI, Opc);
@@ -360,7 +428,7 @@ void MipsSEInstrInfo::adjustStackPtr(unsigned SP, int64_t Amount,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
MipsABIInfo ABI = Subtarget.getABI();
- DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+ DebugLoc DL;
unsigned ADDu = ABI.GetPtrAdduOp();
unsigned ADDiu = ABI.GetPtrAddiuOp();
@@ -438,6 +506,11 @@ void MipsSEInstrInfo::expandRetRA(MachineBasicBlock &MBB,
BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn)).addReg(Mips::RA);
}
+void MipsSEInstrInfo::expandERet(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ BuildMI(MBB, I, I->getDebugLoc(), get(Mips::ERET));
+}
+
std::pair<bool, bool>
MipsSEInstrInfo::compareOpndSize(unsigned Opc,
const MachineFunction &MF) const {
diff --git a/lib/Target/Mips/MipsSEInstrInfo.h b/lib/Target/Mips/MipsSEInstrInfo.h
index bebbabf7b838..5d73545ef6b9 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.h
+++ b/lib/Target/Mips/MipsSEInstrInfo.h
@@ -82,6 +82,8 @@ private:
void expandRetRA(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const;
+ void expandERet(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const;
+
std::pair<bool, bool> compareOpndSize(unsigned Opc,
const MachineFunction &MF) const;
diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp
index 132c3a1001ad..b1e2885f5ba3 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.cpp
+++ b/lib/Target/Mips/MipsSERegisterInfo.cpp
@@ -126,17 +126,19 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
}
bool EhDataRegFI = MipsFI->isEhDataRegFI(FrameIndex);
-
+ bool IsISRRegFI = MipsFI->isISRRegFI(FrameIndex);
// The following stack frame objects are always referenced relative to $sp:
// 1. Outgoing arguments.
// 2. Pointer to dynamically allocated stack space.
// 3. Locations for callee-saved registers.
// 4. Locations for eh data registers.
+ // 5. Locations for ISR saved Coprocessor 0 registers 12 & 14.
// Everything else is referenced relative to whatever register
// getFrameRegister() returns.
unsigned FrameReg;
- if ((FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI) || EhDataRegFI)
+ if ((FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI) || EhDataRegFI ||
+ IsISRRegFI)
FrameReg = ABI.GetStackPtr();
else if (RegInfo->needsStackRealignment(MF)) {
if (MFI->hasVarSizedObjects() && !MFI->isFixedObjectIndex(FrameIndex))
diff --git a/lib/Target/Mips/MipsSchedule.td b/lib/Target/Mips/MipsSchedule.td
index 54b5d2811701..37f9e491d546 100644
--- a/lib/Target/Mips/MipsSchedule.td
+++ b/lib/Target/Mips/MipsSchedule.td
@@ -16,8 +16,8 @@ def IMULDIV : FuncUnit;
//===----------------------------------------------------------------------===//
// Instruction Itinerary classes used for Mips
//===----------------------------------------------------------------------===//
-def IIAlu : InstrItinClass;
-def IIBranch : InstrItinClass;
+// IIM16Alu is a placeholder class for most MIPS16 instructions.
+def IIM16Alu : InstrItinClass;
def IIPseudo : InstrItinClass;
def II_ABS : InstrItinClass;
@@ -28,7 +28,19 @@ def II_ADD_D : InstrItinClass;
def II_ADD_S : InstrItinClass;
def II_AND : InstrItinClass;
def II_ANDI : InstrItinClass;
+def II_B : InstrItinClass;
def II_BADDU : InstrItinClass;
+def II_BBIT : InstrItinClass; // bbit[01], bbit[01]32
+def II_BC : InstrItinClass;
+def II_BC1F : InstrItinClass;
+def II_BC1FL : InstrItinClass;
+def II_BC1T : InstrItinClass;
+def II_BC1TL : InstrItinClass;
+def II_BCC : InstrItinClass; // beq and bne
+def II_BCCZ : InstrItinClass; // b[gl][et]z
+def II_BCCZAL : InstrItinClass; // bgezal and bltzal
+def II_BCCZALS : InstrItinClass; // bgezals and bltzals
+def II_BCCZC : InstrItinClass; // beqzc, bnezc
def II_CEIL : InstrItinClass;
def II_CFC1 : InstrItinClass;
def II_CLO : InstrItinClass;
@@ -68,21 +80,39 @@ def II_DSUB : InstrItinClass;
def II_EXT : InstrItinClass; // Any EXT instruction
def II_FLOOR : InstrItinClass;
def II_INS : InstrItinClass; // Any INS instruction
+def II_IndirectBranchPseudo : InstrItinClass; // Indirect branch pseudo.
+def II_J : InstrItinClass;
+def II_JAL : InstrItinClass;
+def II_JALR : InstrItinClass;
+def II_JALRC : InstrItinClass;
+def II_JALRS : InstrItinClass;
+def II_JALS : InstrItinClass;
+def II_JR : InstrItinClass;
+def II_JRADDIUSP : InstrItinClass;
+def II_JRC : InstrItinClass;
+def II_ReturnPseudo : InstrItinClass; // Return pseudo.
def II_LB : InstrItinClass;
+def II_LBE : InstrItinClass;
def II_LBU : InstrItinClass;
+def II_LBUE : InstrItinClass;
def II_LD : InstrItinClass;
def II_LDC1 : InstrItinClass;
def II_LDL : InstrItinClass;
def II_LDR : InstrItinClass;
def II_LDXC1 : InstrItinClass;
def II_LH : InstrItinClass;
+def II_LHE : InstrItinClass;
def II_LHU : InstrItinClass;
+def II_LHUE : InstrItinClass;
def II_LUI : InstrItinClass;
def II_LUXC1 : InstrItinClass;
def II_LW : InstrItinClass;
+def II_LWE : InstrItinClass;
def II_LWC1 : InstrItinClass;
def II_LWL : InstrItinClass;
+def II_LWLE : InstrItinClass;
def II_LWR : InstrItinClass;
+def II_LWRE : InstrItinClass;
def II_LWU : InstrItinClass;
def II_LWXC1 : InstrItinClass;
def II_MADD : InstrItinClass;
@@ -134,6 +164,7 @@ def II_ROTRV : InstrItinClass;
def II_ROUND : InstrItinClass;
def II_SAVE : InstrItinClass;
def II_SB : InstrItinClass;
+def II_SBE : InstrItinClass;
def II_SD : InstrItinClass;
def II_SDC1 : InstrItinClass;
def II_SDL : InstrItinClass;
@@ -144,6 +175,7 @@ def II_SEH : InstrItinClass;
def II_SEQ_SNE : InstrItinClass; // seq and sne
def II_SEQI_SNEI : InstrItinClass; // seqi and snei
def II_SH : InstrItinClass;
+def II_SHE : InstrItinClass;
def II_SLL : InstrItinClass;
def II_SLLV : InstrItinClass;
def II_SLTI_SLTIU : InstrItinClass; // slti and sltiu
@@ -159,11 +191,15 @@ def II_SUB_D : InstrItinClass;
def II_SUB_S : InstrItinClass;
def II_SUXC1 : InstrItinClass;
def II_SW : InstrItinClass;
+def II_SWE : InstrItinClass;
def II_SWC1 : InstrItinClass;
def II_SWL : InstrItinClass;
+def II_SWLE : InstrItinClass;
def II_SWR : InstrItinClass;
+def II_SWRE : InstrItinClass;
def II_SWXC1 : InstrItinClass;
def II_TRUNC : InstrItinClass;
+def II_WSBH : InstrItinClass;
def II_XOR : InstrItinClass;
def II_XORI : InstrItinClass;
@@ -171,7 +207,7 @@ def II_XORI : InstrItinClass;
// Mips Generic instruction itineraries.
//===----------------------------------------------------------------------===//
def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [
- InstrItinData<IIAlu , [InstrStage<1, [ALU]>]>,
+ InstrItinData<IIM16Alu , [InstrStage<1, [ALU]>]>,
InstrItinData<II_ADDI , [InstrStage<1, [ALU]>]>,
InstrItinData<II_ADDIU , [InstrStage<1, [ALU]>]>,
InstrItinData<II_ADDU , [InstrStage<1, [ALU]>]>,
@@ -240,7 +276,29 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [
InstrItinData<II_SAVE , [InstrStage<1, [ALU]>]>,
InstrItinData<II_SEQ_SNE , [InstrStage<1, [ALU]>]>,
InstrItinData<II_SEQI_SNEI , [InstrStage<1, [ALU]>]>,
- InstrItinData<IIBranch , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_B , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BBIT , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BC1F , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BC1FL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BC1T , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BC1TL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BCC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BCCZ , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BCCZAL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BCCZALS , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BCCZC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_IndirectBranchPseudo, [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_J , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_JAL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_JALR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_JALRC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_JALRS , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_JALS , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_JR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_JRADDIUSP , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_JRC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ReturnPseudo , [InstrStage<1, [ALU]>]>,
InstrItinData<II_DMUL , [InstrStage<17, [IMULDIV]>]>,
InstrItinData<II_DMULT , [InstrStage<17, [IMULDIV]>]>,
InstrItinData<II_DMULTU , [InstrStage<17, [IMULDIV]>]>,
@@ -313,3 +371,5 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [
InstrItinData<II_MFHC1 , [InstrStage<2, [ALU]>]>,
InstrItinData<II_MTHC1 , [InstrStage<2, [ALU]>]>
]>;
+
+include "MipsScheduleP5600.td"
diff --git a/lib/Target/Mips/MipsScheduleP5600.td b/lib/Target/Mips/MipsScheduleP5600.td
new file mode 100644
index 000000000000..d32ae4f55eaf
--- /dev/null
+++ b/lib/Target/Mips/MipsScheduleP5600.td
@@ -0,0 +1,392 @@
+//==- MipsScheduleP5600.td - P5600 Scheduling Definitions --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def MipsP5600Model : SchedMachineModel {
+ int IssueWidth = 2; // 2x dispatched per cycle
+ int MicroOpBufferSize = 48; // min(48, 48, 64)
+ int LoadLatency = 4;
+ int MispredictPenalty = 8; // TODO: Estimated
+
+ let CompleteModel = 1;
+}
+
+let SchedModel = MipsP5600Model in {
+
+// ALQ Pipelines
+// =============
+
+def P5600ALQ : ProcResource<1> { let BufferSize = 16; }
+def P5600IssueALU : ProcResource<1> { let Super = P5600ALQ; }
+
+// ALU Pipeline
+// ------------
+
+def P5600WriteALU : SchedWriteRes<[P5600IssueALU]>;
+
+// and, lui, nor, or, slti, sltiu, sub, subu, xor
+def : ItinRW<[P5600WriteALU],
+ [II_AND, II_LUI, II_NOR, II_OR, II_SLTI_SLTIU, II_SUBU, II_XOR]>;
+
+// AGQ Pipelines
+// =============
+
+def P5600AGQ : ProcResource<3> { let BufferSize = 16; }
+def P5600IssueAL2 : ProcResource<1> { let Super = P5600AGQ; }
+def P5600IssueCTISTD : ProcResource<1> { let Super = P5600AGQ; }
+def P5600IssueLDST : ProcResource<1> { let Super = P5600AGQ; }
+
+def P5600AL2Div : ProcResource<1>;
+// Pseudo-resource used to block CTISTD when handling multi-pipeline splits.
+def P5600CTISTD : ProcResource<1>;
+
+// CTISTD Pipeline
+// ---------------
+
+def P5600WriteJump : SchedWriteRes<[P5600IssueCTISTD, P5600CTISTD]>;
+def P5600WriteJumpAndLink : SchedWriteRes<[P5600IssueCTISTD, P5600CTISTD]> {
+ let Latency = 2;
+}
+
+// b, beq, beql, bg[et]z, bl[et]z, bne, bnel, j, syscall, jal, bltzal, jalx,
+// jalr, jr.hb, jr
+def : ItinRW<[P5600WriteJump], [II_B, II_BCC, II_BCCZ, II_BCCZAL, II_J, II_JR]>;
+def : ItinRW<[P5600WriteJumpAndLink], [II_JAL, II_JALR]>;
+
+// LDST Pipeline
+// -------------
+
+def P5600WriteLoad : SchedWriteRes<[P5600IssueLDST]> {
+ let Latency = 4;
+}
+
+def P5600WriteLoadShifted : SchedWriteRes<[P5600IssueLDST, P5600CTISTD]> {
+ let Latency = 4;
+}
+
+def P5600WritePref : SchedWriteRes<[P5600IssueLDST]>;
+
+def P5600WriteStore : SchedWriteRes<[P5600IssueLDST, P5600CTISTD]> {
+ // FIXME: This is a bit pessimistic. P5600CTISTD is only used during cycle 2,
+ // not for all of cycles 0 through 2.
+ let ResourceCycles = [ 1, 3 ];
+}
+
+def P5600WriteGPRFromBypass : SchedWriteRes<[P5600IssueLDST]> {
+ let Latency = 2;
+}
+
+def P5600WriteStoreFromOtherUnits : SchedWriteRes<[P5600IssueLDST]>;
+def P5600WriteLoadToOtherUnits : SchedWriteRes<[P5600IssueLDST]> {
+ let Latency = 0;
+}
+
+// l[bhw], l[bh]u, ll
+def : ItinRW<[P5600WriteLoad], [II_LB, II_LBU, II_LH, II_LHU, II_LW, II_LWU]>;
+
+// lw[lr]
+def : ItinRW<[P5600WriteLoadShifted], [II_LWL, II_LWR]>;
+
+// s[bhw], sw[lr]
+def : ItinRW<[P5600WriteStore], [II_SB, II_SH, II_SW, II_SWL, II_SWR]>;
+
+// pref
+// (this instruction does not exist in the backend yet)
+def : ItinRW<[P5600WritePref], []>;
+
+// sc
+// (this instruction does not exist in the backend yet)
+def : ItinRW<[P5600WriteStore], []>;
+
+// LDST is also used in moves from general purpose registers to floating point
+// and MSA.
+def P5600WriteMoveGPRToOtherUnits : SchedWriteRes<[P5600IssueLDST]> {
+ let Latency = 0;
+}
+
+// AL2 Pipeline
+// ------------
+
+def P5600WriteAL2 : SchedWriteRes<[P5600IssueAL2]>;
+def P5600WriteAL2BitExt : SchedWriteRes<[P5600IssueAL2]> { let Latency = 2; }
+def P5600WriteAL2ShadowMov : SchedWriteRes<[P5600IssueAL2]> { let Latency = 2; }
+def P5600WriteAL2CondMov : SchedWriteRes<[P5600IssueAL2, P5600CTISTD]> {
+ let Latency = 2;
+}
+def P5600WriteAL2Div : SchedWriteRes<[P5600IssueAL2, P5600AL2Div]> {
+ // Estimated worst case
+ let Latency = 34;
+ let ResourceCycles = [1, 34];
+}
+def P5600WriteAL2DivU : SchedWriteRes<[P5600IssueAL2, P5600AL2Div]> {
+ // Estimated worst case
+ let Latency = 34;
+ let ResourceCycles = [1, 34];
+}
+def P5600WriteAL2Mul : SchedWriteRes<[P5600IssueAL2]> { let Latency = 3; }
+def P5600WriteAL2Mult: SchedWriteRes<[P5600IssueAL2]> { let Latency = 5; }
+def P5600WriteAL2MAdd: SchedWriteRes<[P5600IssueAL2, P5600CTISTD]> {
+ let Latency = 5;
+}
+
+// clo, clz, di, mfhi, mflo
+def : ItinRW<[P5600WriteAL2], [II_CLO, II_CLZ, II_MFHI_MFLO]>;
+
+// ehb, rdhwr, rdpgpr, wrpgpr, wsbh
+def : ItinRW<[P5600WriteAL2ShadowMov], [II_RDHWR]>;
+
+// mov[nz]
+def : ItinRW<[P5600WriteAL2CondMov], [II_MOVN, II_MOVZ]>;
+
+// divu?
+def : ItinRW<[P5600WriteAL2Div], [II_DIV]>;
+def : ItinRW<[P5600WriteAL2DivU], [II_DIVU]>;
+
+// mul
+def : ItinRW<[P5600WriteAL2Mul], [II_MUL]>;
+// multu?
+def : ItinRW<[P5600WriteAL2Mult], [II_MULT, II_MULTU]>;
+// maddu?, msubu?, mthi, mtlo
+def : ItinRW<[P5600WriteAL2MAdd],
+ [II_MADD, II_MADDU, II_MSUB, II_MSUBU, II_MTHI_MTLO]>;
+
+// ext, ins
+def : ItinRW<[P5600WriteAL2BitExt],
+ [II_EXT, II_INS]>;
+
+// Either ALU or AL2 Pipelines
+// ---------------------------
+//
+// Some instructions can choose between ALU and AL2, but once dispatched to
+// ALQ or AGQ respectively they are committed to that path.
+// The decision is based on the outcome of the most recent selection when the
+// choice was last available. For now, we assume ALU is always chosen.
+
+def P5600WriteEitherALU : SchedWriteVariant<
+ // FIXME: Implement selection predicate
+ [SchedVar<SchedPredicate<[{1}]>, [P5600WriteALU]>,
+ SchedVar<SchedPredicate<[{0}]>, [P5600WriteAL2]>
+ ]>;
+
+// add, addi, addiu, addu, andi, ori, rotr, se[bh], sllv?, sr[al]v?, slt, sltu,
+// xori
+def : ItinRW<[P5600WriteEitherALU],
+ [II_ADDI, II_ADDIU, II_ANDI, II_ORI, II_ROTR, II_SEB, II_SEH,
+ II_SLT_SLTU, II_SLL, II_SRA, II_SRL, II_XORI, II_ADDU, II_SLLV,
+ II_SRAV, II_SRLV]>;
+
+// FPU Pipelines
+// =============
+
+def P5600FPQ : ProcResource<3> { let BufferSize = 16; }
+def P5600IssueFPUS : ProcResource<1> { let Super = P5600FPQ; }
+def P5600IssueFPUL : ProcResource<1> { let Super = P5600FPQ; }
+def P5600IssueFPULoad : ProcResource<1> { let Super = P5600FPQ; }
+
+def P5600FPUDivSqrt : ProcResource<2>;
+
+def P5600WriteFPUS : SchedWriteRes<[P5600IssueFPUS]>;
+def P5600WriteFPUL : SchedWriteRes<[P5600IssueFPUL]> { let Latency = 4; }
+def P5600WriteFPUL_MADDSUB : SchedWriteRes<[P5600IssueFPUL]> { let Latency = 6; }
+def P5600WriteFPUDivS : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+ // Best/Common/Worst case = 7 / 23 / 27
+ let Latency = 23; // Using common case
+ let ResourceCycles = [ 1, 23 ];
+}
+def P5600WriteFPUDivD : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+ // Best/Common/Worst case = 7 / 31 / 35
+ let Latency = 31; // Using common case
+ let ResourceCycles = [ 1, 31 ];
+}
+def P5600WriteFPURcpS : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+ // Best/Common/Worst case = 7 / 19 / 23
+ let Latency = 19; // Using common case
+ let ResourceCycles = [ 1, 19 ];
+}
+def P5600WriteFPURcpD : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+ // Best/Common/Worst case = 7 / 27 / 31
+ let Latency = 27; // Using common case
+ let ResourceCycles = [ 1, 27 ];
+}
+def P5600WriteFPURsqrtS : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+ // Best/Common/Worst case = 7 / 27 / 27
+ let Latency = 27; // Using common case
+ let ResourceCycles = [ 1, 27 ];
+}
+def P5600WriteFPURsqrtD : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+ // Best/Common/Worst case = 7 / 27 / 31
+ let Latency = 27; // Using common case
+ let ResourceCycles = [ 1, 27 ];
+}
+def P5600WriteFPUSqrtS : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+ // Best/Common/Worst case = 7 / 27 / 31
+ let Latency = 27; // Using common case
+ let ResourceCycles = [ 1, 27 ];
+}
+def P5600WriteFPUSqrtD : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+ // Best/Common/Worst case = 7 / 35 / 39
+ let Latency = 35; // Using common case
+ let ResourceCycles = [ 1, 35 ];
+}
+def P5600WriteMSAShortLogic : SchedWriteRes<[P5600IssueFPUS]>;
+def P5600WriteMSAShortInt : SchedWriteRes<[P5600IssueFPUS]> { let Latency = 2; }
+def P5600WriteMoveOtherUnitsToFPU : SchedWriteRes<[P5600IssueFPUS]>;
+
+// FPUS is also used in moves from floating point and MSA registers to general
+// purpose registers.
+def P5600WriteMoveFPUSToOtherUnits : SchedWriteRes<[P5600IssueFPUS]> {
+ let Latency = 0;
+}
+
+// FPUL is also used in moves from floating point and MSA registers to general
+// purpose registers.
+def P5600WriteMoveFPULToOtherUnits : SchedWriteRes<[P5600IssueFPUL]>;
+
+// Short Pipe
+// ----------
+//
+// abs.[ds], abs.ps, bc1[tf]l?, mov[tf].[ds], mov[tf], mov.[ds], [cm][ft]c1,
+// m[ft]hc1, neg.[ds], neg.ps, nor.v, nori.b, or.v, ori.b, xor.v, xori.b,
+// sdxc1, sdc1, st.[bhwd], swc1, swxc1
+def : ItinRW<[P5600WriteFPUS], [II_ABS, II_MOVF_D, II_MOVF_S, II_MOVT_D,
+ II_MOVT_S, II_MOV_D, II_MOV_S, II_NEG]>;
+
+// adds_a.[bhwd], adds_[asu].[bhwd], addvi?.[bhwd], asub_[us].[bhwd],
+// aver?_[us].[bhwd]
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^ADD_A_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^ADDS_[ASU]_[BHWD]$")>;
+// TODO: ADDVI_[BHW] might be 1 cycle latency rather than 2. Need to confirm it.
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^ADDVI?_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^ASUB_[US].[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^AVER?_[US].[BHWD]$")>;
+
+// and.v, andi.b, move.v, ldi.[bhwd]
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^MOVE_V$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^LDI_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)_V$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)I_B$")>;
+
+// Long Pipe
+// ----------
+//
+// add.[ds], add.ps, cvt.d.[sw], cvt.s.[dw], cvt.w.[sd], cvt.[sw].ps,
+// cvt.ps.[sw], c.<cc>.[ds], c.<cc>.ps, mul.[ds], mul.ps, sub.[ds], sub.ps,
+// trunc.w.[ds], trunc.w.ps
+def : ItinRW<[P5600WriteFPUL],
+ [II_ADD_D, II_ADD_S, II_CVT, II_C_CC_D, II_C_CC_S, II_MUL_D,
+ II_MUL_S, II_SUB_D, II_SUB_S, II_TRUNC]>;
+
+// div.[ds], div.ps
+def : ItinRW<[P5600WriteFPUDivS], [II_DIV_S]>;
+def : ItinRW<[P5600WriteFPUDivD], [II_DIV_D]>;
+
+// sqrt.[ds], sqrt.ps
+def : ItinRW<[P5600WriteFPUSqrtS], [II_SQRT_S]>;
+def : ItinRW<[P5600WriteFPUSqrtD], [II_SQRT_D]>;
+
+// madd.[ds], msub.[ds], nmadd.[ds], nmsub.[ds],
+// Operand 0 is read on cycle 5. All other operands are read on cycle 0.
+def : ItinRW<[SchedReadAdvance<5>, P5600WriteFPUL_MADDSUB],
+ [II_MADD_D, II_MADD_S, II_MSUB_D, II_MSUB_S, II_NMADD_D,
+ II_NMADD_S, II_NMSUB_D, II_NMSUB_S]>;
+
+// madd.ps, msub.ps, nmadd.ps, nmsub.ps
+// Operands 0 and 1 are read on cycle 5. All others are read on cycle 0.
+// (none of these instructions exist in the backend yet)
+
+// Load Pipe
+// ---------
+//
+// This is typically used in conjunction with the load pipeline under the AGQ.
+// All the instructions are in the 'Tricky Instructions' section.
+
+def P5600WriteLoadOtherUnitsToFPU : SchedWriteRes<[P5600IssueFPULoad]> {
+ let Latency = 4;
+}
+
+// Tricky Instructions
+// ===================
+//
+// These instructions are split across multiple uops (in different pipelines)
+// that must cooperate to complete the operation.
+
+// FIXME: This isn't quite right since the implementation of WriteSequence
+// currently aggregates the resources and ignores the exact cycle they are
+// used.
+def P5600WriteMoveGPRToFPU : WriteSequence<[P5600WriteMoveGPRToOtherUnits,
+ P5600WriteMoveOtherUnitsToFPU]>;
+
+// FIXME: This isn't quite right since the implementation of WriteSequence
+// currently aggregates the resources and ignores the exact cycle they are
+// used.
+def P5600WriteMoveFPUToGPR : WriteSequence<[P5600WriteMoveFPUSToOtherUnits,
+ P5600WriteGPRFromBypass]>;
+
+// FIXME: This isn't quite right since the implementation of WriteSequence
+// currently aggregates the resources and ignores the exact cycle they are
+// used.
+def P5600WriteStoreFPUS : WriteSequence<[P5600WriteMoveFPUSToOtherUnits,
+ P5600WriteStoreFromOtherUnits]>;
+
+// FIXME: This isn't quite right since the implementation of WriteSequence
+// currently aggregates the resources and ignores the exact cycle they are
+// used.
+def P5600WriteStoreFPUL : WriteSequence<[P5600WriteMoveFPULToOtherUnits,
+ P5600WriteStoreFromOtherUnits]>;
+
+// FIXME: This isn't quite right since the implementation of WriteSequence
+// currently aggregates the resources and ignores the exact cycle they are
+// used.
+def P5600WriteLoadFPU : WriteSequence<[P5600WriteLoadToOtherUnits,
+ P5600WriteLoadOtherUnitsToFPU]>;
+
+// ctc1, mtc1, mthc1
+def : ItinRW<[P5600WriteMoveGPRToFPU], [II_CTC1, II_MTC1, II_MTHC1]>;
+
+// bc1[ft], cfc1, mfc1, mfhc1, movf, movt
+def : ItinRW<[P5600WriteMoveFPUToGPR],
+ [II_BC1F, II_BC1T, II_CFC1, II_MFC1, II_MFHC1, II_MOVF, II_MOVT]>;
+
+// swc1, swxc1, st.[bhwd]
+def : ItinRW<[P5600WriteStoreFPUS], [II_SWC1, II_SWXC1]>;
+def : InstRW<[P5600WriteStoreFPUS], (instregex "^ST_[BHWD]$")>;
+
+// movn.[ds], movz.[ds]
+def : ItinRW<[P5600WriteStoreFPUL], [II_MOVN_D, II_MOVN_S, II_MOVZ_D, II_MOVZ_S]>;
+
+// l[dw]x?c1, ld.[bhwd]
+def : ItinRW<[P5600WriteLoadFPU], [II_LDC1, II_LDXC1, II_LWC1, II_LWXC1]>;
+def : InstRW<[P5600WriteLoadFPU], (instregex "LD_[BHWD]")>;
+
+// Unsupported Instructions
+// ========================
+//
+// The following instruction classes are never valid on P5600.
+// II_DADDIU, II_DADDU, II_DMFC1, II_DMTC1, II_DMULT, II_DMULTU, II_DROTR,
+// II_DROTR32, II_DROTRV, II_DDIV, II_DSLL, II_DSLL32, II_DSLLV, II_DSRA,
+// II_DSRA32, II_DSRAV, II_DSRL, II_DSRL32, II_DSRLV, II_DSUBU, II_DDIVU,
+// II_JALRC, II_LD, II_LD[LR], II_LUXC1, II_RESTORE, II_SAVE, II_SD, II_SDC1,
+// II_SDL, II_SDR, II_SDXC1
+//
+// The following instructions are never valid on P5600.
+// addq.ph, rdhwr, repl.ph, repl.qb, subq.ph, subu_s.qb
+//
+// Guesswork
+// =========
+//
+// This section is largely temporary guesswork.
+
+// ceil.[lw].[ds], floor.[lw].[ds]
+// Reason behind guess: trunc.[lw].ds and the various cvt's are in FPUL
+def : ItinRW<[P5600WriteFPUL], [II_CEIL, II_FLOOR, II_ROUND]>;
+
+// rotrv
+// Reason behind guess: rotr is in the same category and the two register forms
+// generally follow the immediate forms in this category
+def : ItinRW<[P5600WriteEitherALU], [II_ROTRV]>;
+}
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index 471b6e19a8bb..8a18b517d16b 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -69,8 +69,9 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU,
HasMips3_32(false), HasMips3_32r2(false), HasMips4_32(false),
HasMips4_32r2(false), HasMips5_32r2(false), InMips16Mode(false),
InMips16HardFloat(Mips16HardFloat), InMicroMipsMode(false), HasDSP(false),
- HasDSPR2(false), AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16),
- HasMSA(false), TM(TM), TargetTriple(TT), TSInfo(),
+ HasDSPR2(false), HasDSPR3(false), AllowMixed16_32(Mixed16_32 | Mips_Os16),
+ Os16(Mips_Os16), HasMSA(false), UseTCCInDIV(false), HasEVA(false), TM(TM),
+ TargetTriple(TT), TSInfo(),
InstrInfo(
MipsInstrInfo::create(initializeSubtargetDependencies(CPU, FS, TM))),
FrameLowering(MipsFrameLowering::create(*this)),
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index 1db8881404c9..fbb01fe77029 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -42,9 +42,15 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
Mips3, Mips4, Mips5, Mips64, Mips64r2, Mips64r3, Mips64r5, Mips64r6
};
+ enum class CPU { P5600 };
+
// Mips architecture version
MipsArchEnum MipsArchVersion;
+ // Processor implementation (unused but required to exist by
+ // tablegen-erated code).
+ CPU ProcImpl;
+
// IsLittle - The target is Little Endian
bool IsLittle;
@@ -116,8 +122,8 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
// InMicroMips -- can process MicroMips instructions
bool InMicroMipsMode;
- // HasDSP, HasDSPR2 -- supports DSP ASE.
- bool HasDSP, HasDSPR2;
+ // HasDSP, HasDSPR2, HasDSPR3 -- supports DSP ASE.
+ bool HasDSP, HasDSPR2, HasDSPR3;
// Allow mixed Mips16 and Mips32 in one source file
bool AllowMixed16_32;
@@ -130,6 +136,12 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
// HasMSA -- supports MSA ASE.
bool HasMSA;
+ // UseTCCInDIV -- Enables the use of trapping in the assembler.
+ bool UseTCCInDIV;
+
+ // HasEVA -- supports EVA ASE.
+ bool HasEVA;
+
InstrItineraryData InstrItins;
// We can override the determination of whether we are in mips16 mode
@@ -189,7 +201,7 @@ public:
}
bool hasMips32r5() const {
return (MipsArchVersion >= Mips32r5 && MipsArchVersion < Mips32Max) ||
- hasMips64r2();
+ hasMips64r5();
}
bool hasMips32r6() const {
return (MipsArchVersion >= Mips32r6 && MipsArchVersion < Mips32Max) ||
@@ -228,9 +240,12 @@ public:
}
bool inMicroMipsMode() const { return InMicroMipsMode; }
bool inMicroMips32r6Mode() const { return InMicroMipsMode && hasMips32r6(); }
+ bool inMicroMips64r6Mode() const { return InMicroMipsMode && hasMips64r6(); }
bool hasDSP() const { return HasDSP; }
bool hasDSPR2() const { return HasDSPR2; }
+ bool hasDSPR3() const { return HasDSPR3; }
bool hasMSA() const { return HasMSA; }
+ bool hasEVA() const { return HasEVA; }
bool useSmallSection() const { return UseSmallSection; }
bool hasStandardEncoding() const { return !inMips16Mode(); }
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index 1c77745d130b..3e638720e839 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -233,7 +233,7 @@ void MipsPassConfig::addPreRegAlloc() {
}
TargetIRAnalysis MipsTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](Function &F) {
+ return TargetIRAnalysis([this](const Function &F) {
if (Subtarget->allowMixed16_32()) {
DEBUG(errs() << "No Target Transform Info Pass Added\n");
// FIXME: This is no longer necessary as the TTI returned is per-function.
diff --git a/lib/Target/Mips/MipsTargetObjectFile.cpp b/lib/Target/Mips/MipsTargetObjectFile.cpp
index 0f2db6039b6a..146f33bda249 100644
--- a/lib/Target/Mips/MipsTargetObjectFile.cpp
+++ b/lib/Target/Mips/MipsTargetObjectFile.cpp
@@ -76,7 +76,7 @@ bool MipsTargetObjectFile::
IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
SectionKind Kind) const {
return (IsGlobalInSmallSectionImpl(GV, TM) &&
- (Kind.isDataRel() || Kind.isBSS() || Kind.isCommon()));
+ (Kind.isData() || Kind.isBSS() || Kind.isCommon()));
}
/// Return true if this global address should be placed into small data/bss
@@ -107,7 +107,8 @@ IsGlobalInSmallSectionImpl(const GlobalValue *GV,
return false;
Type *Ty = GV->getType()->getElementType();
- return IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(Ty));
+ return IsInSmallSection(
+ GV->getParent()->getDataLayout().getTypeAllocSize(Ty));
}
MCSection *
@@ -120,7 +121,7 @@ MipsTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
// Handle Small Section classification here.
if (Kind.isBSS() && IsGlobalInSmallSection(GV, TM, Kind))
return SmallBSSSection;
- if (Kind.isDataRel() && IsGlobalInSmallSection(GV, TM, Kind))
+ if (Kind.isData() && IsGlobalInSmallSection(GV, TM, Kind))
return SmallDataSection;
// Otherwise, we work the same as ELF.
@@ -128,21 +129,20 @@ MipsTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
}
/// Return true if this constant should be placed into small data section.
-bool MipsTargetObjectFile::
-IsConstantInSmallSection(const Constant *CN, const TargetMachine &TM) const {
+bool MipsTargetObjectFile::IsConstantInSmallSection(
+ const DataLayout &DL, const Constant *CN, const TargetMachine &TM) const {
return (static_cast<const MipsTargetMachine &>(TM)
.getSubtargetImpl()
->useSmallSection() &&
- LocalSData && IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(
- CN->getType())));
+ LocalSData && IsInSmallSection(DL.getTypeAllocSize(CN->getType())));
}
-MCSection *
-MipsTargetObjectFile::getSectionForConstant(SectionKind Kind,
- const Constant *C) const {
- if (IsConstantInSmallSection(C, *TM))
+/// Return true if this constant should be placed into small data section.
+MCSection *MipsTargetObjectFile::getSectionForConstant(
+ const DataLayout &DL, SectionKind Kind, const Constant *C) const {
+ if (IsConstantInSmallSection(DL, C, *TM))
return SmallDataSection;
// Otherwise, we work the same as ELF.
- return TargetLoweringObjectFileELF::getSectionForConstant(Kind, C);
+ return TargetLoweringObjectFileELF::getSectionForConstant(DL, Kind, C);
}
diff --git a/lib/Target/Mips/MipsTargetObjectFile.h b/lib/Target/Mips/MipsTargetObjectFile.h
index 725f2ffd93dd..ba04343bad87 100644
--- a/lib/Target/Mips/MipsTargetObjectFile.h
+++ b/lib/Target/Mips/MipsTargetObjectFile.h
@@ -36,10 +36,10 @@ class MipsTargetMachine;
const TargetMachine &TM) const override;
/// Return true if this constant should be placed into small data section.
- bool IsConstantInSmallSection(const Constant *CN,
+ bool IsConstantInSmallSection(const DataLayout &DL, const Constant *CN,
const TargetMachine &TM) const;
- MCSection *getSectionForConstant(SectionKind Kind,
+ MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
const Constant *C) const override;
};
} // end namespace llvm
diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h
index 6ce1be707d04..b3222f5d89ef 100644
--- a/lib/Target/Mips/MipsTargetStreamer.h
+++ b/lib/Target/Mips/MipsTargetStreamer.h
@@ -12,6 +12,7 @@
#include "MCTargetDesc/MipsABIFlagsSection.h"
#include "MCTargetDesc/MipsABIInfo.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
@@ -77,8 +78,12 @@ public:
// PIC support
virtual void emitDirectiveCpLoad(unsigned RegNo);
+ virtual void emitDirectiveCpRestore(SmallVector<MCInst, 3> &StoreInsts,
+ int Offset);
virtual void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
const MCSymbol &Sym, bool IsReg);
+ virtual void emitDirectiveCpreturn(unsigned SaveLocation,
+ bool SaveLocationIsRegister);
// FP abiflags directives
virtual void emitDirectiveModuleFP();
@@ -97,18 +102,18 @@ public:
// structure values.
template <class PredicateLibrary>
void updateABIInfo(const PredicateLibrary &P) {
- ABI = &P.getABI();
+ ABI = P.getABI();
ABIFlagsSection.setAllFromPredicates(P);
}
MipsABIFlagsSection &getABIFlagsSection() { return ABIFlagsSection; }
const MipsABIInfo &getABI() const {
- assert(ABI && "ABI hasn't been set!");
+ assert(ABI.hasValue() && "ABI hasn't been set!");
return *ABI;
}
protected:
- const MipsABIInfo *ABI;
+ llvm::Optional<MipsABIInfo> ABI;
MipsABIFlagsSection ABIFlagsSection;
bool GPRInfoSet;
@@ -188,8 +193,12 @@ public:
// PIC support
void emitDirectiveCpLoad(unsigned RegNo) override;
+ void emitDirectiveCpRestore(SmallVector<MCInst, 3> &StoreInsts,
+ int Offset) override;
void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
const MCSymbol &Sym, bool IsReg) override;
+ void emitDirectiveCpreturn(unsigned SaveLocation,
+ bool SaveLocationIsRegister) override;
// FP abiflags directives
void emitDirectiveModuleFP() override;
@@ -237,8 +246,12 @@ public:
// PIC support
void emitDirectiveCpLoad(unsigned RegNo) override;
+ void emitDirectiveCpRestore(SmallVector<MCInst, 3> &StoreInsts,
+ int Offset) override;
void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
const MCSymbol &Sym, bool IsReg) override;
+ void emitDirectiveCpreturn(unsigned SaveLocation,
+ bool SaveLocationIsRegister) override;
void emitMipsAbiFlags();
};
diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
index 02c5a210d099..f0f223aa057b 100644
--- a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
+++ b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
@@ -15,11 +15,9 @@
#define LLVM_LIB_TARGET_NVPTX_INSTPRINTER_NVPTXINSTPRINTER_H
#include "llvm/MC/MCInstPrinter.h"
-#include "llvm/Support/raw_ostream.h"
namespace llvm {
-class MCOperand;
class MCSubtargetInfo;
class NVPTXInstPrinter : public MCInstPrinter {
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
index b432e065c2f4..9ac3c8850f75 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
@@ -22,6 +22,7 @@ class Triple;
class NVPTXMCAsmInfo : public MCAsmInfo {
virtual void anchor();
+
public:
explicit NVPTXMCAsmInfo(const Triple &TheTriple);
};
diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h
index fe28214e9588..e5fae85bacf2 100644
--- a/lib/Target/NVPTX/NVPTX.h
+++ b/lib/Target/NVPTX/NVPTX.h
@@ -41,24 +41,6 @@ enum CondCodes {
};
}
-inline static const char *NVPTXCondCodeToString(NVPTXCC::CondCodes CC) {
- switch (CC) {
- case NVPTXCC::NE:
- return "ne";
- case NVPTXCC::EQ:
- return "eq";
- case NVPTXCC::LT:
- return "lt";
- case NVPTXCC::LE:
- return "le";
- case NVPTXCC::GT:
- return "gt";
- case NVPTXCC::GE:
- return "ge";
- }
- llvm_unreachable("Unknown condition code");
-}
-
FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM,
llvm::CodeGenOpt::Level OptLevel);
ModulePass *createNVPTXAssignValidGlobalNamesPass();
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index ecb0f0a1d0a1..e8c36089a779 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -355,7 +355,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
if (isABI) {
if (Ty->isFloatingPointTy() || Ty->isIntegerTy()) {
unsigned size = 0;
- if (const IntegerType *ITy = dyn_cast<IntegerType>(Ty)) {
+ if (auto *ITy = dyn_cast<IntegerType>(Ty)) {
size = ITy->getBitWidth();
if (size < 32)
size = 32;
@@ -635,9 +635,7 @@ static bool usedInGlobalVarDef(const Constant *C) {
return false;
if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
- if (GV->getName() == "llvm.used")
- return false;
- return true;
+ return GV->getName() != "llvm.used";
}
for (const User *U : C->users())
@@ -682,7 +680,7 @@ static bool usedInOneFunc(const User *U, Function const *&oneFunc) {
static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) {
if (!gv->hasInternalLinkage())
return false;
- const PointerType *Pty = gv->getType();
+ PointerType *Pty = gv->getType();
if (Pty->getAddressSpace() != llvm::ADDRESS_SPACE_SHARED)
return false;
@@ -720,7 +718,7 @@ static bool useFuncSeen(const Constant *C,
void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) {
llvm::DenseMap<const Function *, bool> seenMap;
for (Module::const_iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) {
- const Function *F = FI;
+ const Function *F = &*FI;
if (F->isDeclaration()) {
if (F->use_empty())
@@ -870,9 +868,8 @@ void NVPTXAsmPrinter::emitGlobals(const Module &M) {
DenseSet<const GlobalVariable *> GVVisiting;
// Visit each global variable, in order
- for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
- I != E; ++I)
- VisitGlobalVariableForEmission(I, Globals, GVVisited, GVVisiting);
+ for (const GlobalVariable &I : M.globals())
+ VisitGlobalVariableForEmission(&I, Globals, GVVisited, GVVisiting);
assert(GVVisited.size() == M.getGlobalList().size() &&
"Missed a global variable");
@@ -1029,10 +1026,10 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
GVar->getName().startswith("nvvm."))
return;
- const DataLayout *TD = TM.getDataLayout();
+ const DataLayout &DL = getDataLayout();
// GlobalVariables are always constant pointers themselves.
- const PointerType *PTy = GVar->getType();
+ PointerType *PTy = GVar->getType();
Type *ETy = PTy->getElementType();
if (GVar->hasExternalLinkage()) {
@@ -1159,7 +1156,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
}
if (GVar->getAlignment() == 0)
- O << " .align " << (int) TD->getPrefTypeAlignment(ETy);
+ O << " .align " << (int)DL.getPrefTypeAlignment(ETy);
else
O << " .align " << GVar->getAlignment();
@@ -1185,9 +1182,11 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
printScalarConstant(Initializer, O);
}
} else {
- // The frontend adds zero-initializer to variables that don't have an
- // initial value, so skip warning for this case.
- if (!GVar->getInitializer()->isNullValue()) {
+ // The frontend adds zero-initializer to device and constant variables
+ // that don't have an initial value, and UndefValue to shared
+ // variables, so skip warning for this case.
+ if (!GVar->getInitializer()->isNullValue() &&
+ !isa<UndefValue>(GVar->getInitializer())) {
report_fatal_error("initial value of '" + GVar->getName() +
"' is not allowed in addrspace(" +
Twine(PTy->getAddressSpace()) + ")");
@@ -1205,7 +1204,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
case Type::StructTyID:
case Type::ArrayTyID:
case Type::VectorTyID:
- ElementSize = TD->getTypeStoreSize(ETy);
+ ElementSize = DL.getTypeStoreSize(ETy);
// Ptx allows variable initialization only for constant and
// global state spaces.
if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
@@ -1299,7 +1298,7 @@ void NVPTXAsmPrinter::emitPTXAddressSpace(unsigned int AddressSpace,
}
std::string
-NVPTXAsmPrinter::getPTXFundamentalTypeStr(const Type *Ty, bool useB4PTR) const {
+NVPTXAsmPrinter::getPTXFundamentalTypeStr(Type *Ty, bool useB4PTR) const {
switch (Ty->getTypeID()) {
default:
llvm_unreachable("unexpected type");
@@ -1339,16 +1338,16 @@ NVPTXAsmPrinter::getPTXFundamentalTypeStr(const Type *Ty, bool useB4PTR) const {
void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
raw_ostream &O) {
- const DataLayout *TD = TM.getDataLayout();
+ const DataLayout &DL = getDataLayout();
// GlobalVariables are always constant pointers themselves.
- const PointerType *PTy = GVar->getType();
+ PointerType *PTy = GVar->getType();
Type *ETy = PTy->getElementType();
O << ".";
emitPTXAddressSpace(PTy->getAddressSpace(), O);
if (GVar->getAlignment() == 0)
- O << " .align " << (int) TD->getPrefTypeAlignment(ETy);
+ O << " .align " << (int)DL.getPrefTypeAlignment(ETy);
else
O << " .align " << GVar->getAlignment();
@@ -1370,7 +1369,7 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
case Type::StructTyID:
case Type::ArrayTyID:
case Type::VectorTyID:
- ElementSize = TD->getTypeStoreSize(ETy);
+ ElementSize = DL.getTypeStoreSize(ETy);
O << " .b8 ";
getSymbol(GVar)->print(O, MAI);
O << "[";
@@ -1385,32 +1384,32 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
return;
}
-static unsigned int getOpenCLAlignment(const DataLayout *TD, Type *Ty) {
+static unsigned int getOpenCLAlignment(const DataLayout &DL, Type *Ty) {
if (Ty->isSingleValueType())
- return TD->getPrefTypeAlignment(Ty);
+ return DL.getPrefTypeAlignment(Ty);
- const ArrayType *ATy = dyn_cast<ArrayType>(Ty);
+ auto *ATy = dyn_cast<ArrayType>(Ty);
if (ATy)
- return getOpenCLAlignment(TD, ATy->getElementType());
+ return getOpenCLAlignment(DL, ATy->getElementType());
- const StructType *STy = dyn_cast<StructType>(Ty);
+ auto *STy = dyn_cast<StructType>(Ty);
if (STy) {
unsigned int alignStruct = 1;
// Go through each element of the struct and find the
// largest alignment.
for (unsigned i = 0, e = STy->getNumElements(); i != e; i++) {
Type *ETy = STy->getElementType(i);
- unsigned int align = getOpenCLAlignment(TD, ETy);
+ unsigned int align = getOpenCLAlignment(DL, ETy);
if (align > alignStruct)
alignStruct = align;
}
return alignStruct;
}
- const FunctionType *FTy = dyn_cast<FunctionType>(Ty);
+ auto *FTy = dyn_cast<FunctionType>(Ty);
if (FTy)
- return TD->getPointerPrefAlignment();
- return TD->getPrefTypeAlignment(Ty);
+ return DL.getPointerPrefAlignment();
+ return DL.getPrefTypeAlignment(Ty);
}
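As a worked example of the recursion above (Params is a hypothetical type, not from the patch): arrays use the alignment of their element type, structs take the largest member alignment, and anything else falls back to the DataLayout's preferred alignment.

    // Hypothetical OpenCL-style parameter type.
    struct Params {
      char   tag;        // preferred alignment 1
      double value;      // preferred alignment 8
      int    counts[4];  // array -> element alignment, 4
    };
    // getOpenCLAlignment(DL, <IR type of Params>) == max(1, 8, 4) == 8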
void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I,
@@ -1419,13 +1418,8 @@ void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I,
O << "_param_" << paramIndex;
}
-void NVPTXAsmPrinter::printParamName(int paramIndex, raw_ostream &O) {
- CurrentFnSym->print(O, MAI);
- O << "_param_" << paramIndex;
-}
-
void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
- const DataLayout *TD = TM.getDataLayout();
+ const DataLayout &DL = getDataLayout();
const AttributeSet &PAL = F->getAttributes();
const TargetLowering *TLI = nvptxSubtarget->getTargetLowering();
Function::const_arg_iterator I, E;
@@ -1433,7 +1427,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
bool first = true;
bool isKernelFunc = llvm::isKernelFunction(*F);
bool isABI = (nvptxSubtarget->getSmVersion() >= 20);
- MVT thePointerTy = TLI->getPointerTy(*TD);
+ MVT thePointerTy = TLI->getPointerTy(DL);
O << "(\n";
@@ -1485,9 +1479,9 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
// size = typeallocsize of element type
unsigned align = PAL.getParamAlignment(paramIndex + 1);
if (align == 0)
- align = TD->getABITypeAlignment(Ty);
+ align = DL.getABITypeAlignment(Ty);
- unsigned sz = TD->getTypeAllocSize(Ty);
+ unsigned sz = DL.getTypeAllocSize(Ty);
O << "\t.param .align " << align << " .b8 ";
printParamName(I, paramIndex, O);
O << "[" << sz << "]";
@@ -1495,7 +1489,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
continue;
}
// Just a scalar
- const PointerType *PTy = dyn_cast<PointerType>(Ty);
+ auto *PTy = dyn_cast<PointerType>(Ty);
if (isKernelFunc) {
if (PTy) {
// Special handling for pointer arguments to kernel
@@ -1519,7 +1513,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
O << ".ptr .global ";
break;
}
- O << ".align " << (int) getOpenCLAlignment(TD, ETy) << " ";
+ O << ".align " << (int)getOpenCLAlignment(DL, ETy) << " ";
}
printParamName(I, paramIndex, O);
continue;
@@ -1556,7 +1550,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
}
// param has byVal attribute. So should be a pointer
- const PointerType *PTy = dyn_cast<PointerType>(Ty);
+ auto *PTy = dyn_cast<PointerType>(Ty);
assert(PTy && "Param with byval attribute should be a pointer type");
Type *ETy = PTy->getElementType();
@@ -1566,9 +1560,9 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
// size = typeallocsize of element type
unsigned align = PAL.getParamAlignment(paramIndex + 1);
if (align == 0)
- align = TD->getABITypeAlignment(ETy);
+ align = DL.getABITypeAlignment(ETy);
- unsigned sz = TD->getTypeAllocSize(ETy);
+ unsigned sz = DL.getTypeAllocSize(ETy);
O << "\t.param .align " << align << " .b8 ";
printParamName(I, paramIndex, O);
O << "[" << sz << "]";
@@ -1579,7 +1573,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
// Further, if a part is vector, print the above for
// each vector element.
SmallVector<EVT, 16> vtparts;
- ComputeValueVTs(*TLI, getDataLayout(), ETy, vtparts);
+ ComputeValueVTs(*TLI, DL, ETy, vtparts);
for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
unsigned elems = 1;
EVT elemtype = vtparts[i];
@@ -1786,10 +1780,10 @@ static void ConvertDoubleToBytes(unsigned char *p, double val) {
void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
AggBuffer *aggBuffer) {
- const DataLayout *TD = TM.getDataLayout();
+ const DataLayout &DL = getDataLayout();
if (isa<UndefValue>(CPV) || CPV->isNullValue()) {
- int s = TD->getTypeAllocSize(CPV->getType());
+ int s = DL.getTypeAllocSize(CPV->getType());
if (s < Bytes)
s = Bytes;
aggBuffer->addZeros(s);
@@ -1800,7 +1794,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
switch (CPV->getType()->getTypeID()) {
case Type::IntegerTyID: {
- const Type *ETy = CPV->getType();
+ Type *ETy = CPV->getType();
if (ETy == Type::getInt8Ty(CPV->getContext())) {
unsigned char c = (unsigned char)cast<ConstantInt>(CPV)->getZExtValue();
ConvertIntToBytes<>(ptr, c);
@@ -1817,7 +1811,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
break;
} else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
if (const ConstantInt *constInt = dyn_cast<ConstantInt>(
- ConstantFoldConstantExpression(Cexpr, *TD))) {
+ ConstantFoldConstantExpression(Cexpr, DL))) {
int int32 = (int)(constInt->getZExtValue());
ConvertIntToBytes<>(ptr, int32);
aggBuffer->addBytes(ptr, 4, Bytes);
@@ -1839,7 +1833,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
break;
} else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
if (const ConstantInt *constInt = dyn_cast<ConstantInt>(
- ConstantFoldConstantExpression(Cexpr, *TD))) {
+ ConstantFoldConstantExpression(Cexpr, DL))) {
long long int64 = (long long)(constInt->getZExtValue());
ConvertIntToBytes<>(ptr, int64);
aggBuffer->addBytes(ptr, 8, Bytes);
@@ -1860,7 +1854,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
case Type::FloatTyID:
case Type::DoubleTyID: {
const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV);
- const Type *Ty = CFP->getType();
+ Type *Ty = CFP->getType();
if (Ty == Type::getFloatTy(CPV->getContext())) {
float float32 = (float) CFP->getValueAPF().convertToFloat();
ConvertFloatToBytes(ptr, float32);
@@ -1881,7 +1875,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
const Value *v = Cexpr->stripPointerCasts();
aggBuffer->addSymbol(v, Cexpr);
}
- unsigned int s = TD->getTypeAllocSize(CPV->getType());
+ unsigned int s = DL.getTypeAllocSize(CPV->getType());
aggBuffer->addZeros(s);
break;
}
@@ -1891,7 +1885,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
case Type::StructTyID: {
if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV) ||
isa<ConstantStruct>(CPV) || isa<ConstantDataSequential>(CPV)) {
- int ElementSize = TD->getTypeAllocSize(CPV->getType());
+ int ElementSize = DL.getTypeAllocSize(CPV->getType());
bufferAggregateConstant(CPV, aggBuffer);
if (Bytes > ElementSize)
aggBuffer->addZeros(Bytes - ElementSize);
@@ -1909,7 +1903,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV,
AggBuffer *aggBuffer) {
- const DataLayout *TD = TM.getDataLayout();
+ const DataLayout &DL = getDataLayout();
int Bytes;
// Old constants
@@ -1934,12 +1928,12 @@ void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV,
StructType *ST = cast<StructType>(CPV->getType());
for (unsigned i = 0, e = CPV->getNumOperands(); i != e; ++i) {
if (i == (e - 1))
- Bytes = TD->getStructLayout(ST)->getElementOffset(0) +
- TD->getTypeAllocSize(ST) -
- TD->getStructLayout(ST)->getElementOffset(i);
+ Bytes = DL.getStructLayout(ST)->getElementOffset(0) +
+ DL.getTypeAllocSize(ST) -
+ DL.getStructLayout(ST)->getElementOffset(i);
else
- Bytes = TD->getStructLayout(ST)->getElementOffset(i + 1) -
- TD->getStructLayout(ST)->getElementOffset(i);
+ Bytes = DL.getStructLayout(ST)->getElementOffset(i + 1) -
+ DL.getStructLayout(ST)->getElementOffset(i);
bufferLEByte(cast<Constant>(CPV->getOperand(i)), Bytes, aggBuffer);
}
}
@@ -1951,18 +1945,6 @@ void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV,
// buildTypeNameMap - Run through symbol table looking for type names.
//
-bool NVPTXAsmPrinter::isImageType(const Type *Ty) {
-
- std::map<const Type *, std::string>::iterator PI = TypeNameMap.find(Ty);
-
- if (PI != TypeNameMap.end() && (!PI->second.compare("struct._image1d_t") ||
- !PI->second.compare("struct._image2d_t") ||
- !PI->second.compare("struct._image3d_t")))
- return true;
-
- return false;
-}
-
bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) {
switch (MI.getOpcode()) {
@@ -2054,7 +2036,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric)
// If the code isn't optimized, there may be outstanding folding
// opportunities. Attempt to fold the expression using DataLayout as a
// last resort before giving up.
- if (Constant *C = ConstantFoldConstantExpression(CE, *TM.getDataLayout()))
+ if (Constant *C = ConstantFoldConstantExpression(CE, getDataLayout()))
if (C != CE)
return lowerConstantForGV(C, ProcessingGeneric);
@@ -2083,7 +2065,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric)
}
case Instruction::GetElementPtr: {
- const DataLayout &DL = *TM.getDataLayout();
+ const DataLayout &DL = getDataLayout();
// Generate a symbolic expression for the byte address
APInt OffsetAI(DL.getPointerTypeSizeInBits(CE->getType()), 0);
@@ -2109,7 +2091,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric)
return lowerConstantForGV(CE->getOperand(0), ProcessingGeneric);
case Instruction::IntToPtr: {
- const DataLayout &DL = *TM.getDataLayout();
+ const DataLayout &DL = getDataLayout();
// Handle casts to pointers by changing them into casts to the appropriate
// integer type. This promotes constant folding and simplifies this code.
@@ -2120,7 +2102,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric)
}
case Instruction::PtrToInt: {
- const DataLayout &DL = *TM.getDataLayout();
+ const DataLayout &DL = getDataLayout();
// Support only foldable casts to/from pointers that can be eliminated by
// changing the pointer to the appropriately sized integer type.
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h
index f6f7685e76f9..76bf179896a8 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -212,28 +212,21 @@ private:
MCOperand GetSymbolRef(const MCSymbol *Symbol);
unsigned encodeVirtualRegister(unsigned Reg);
- void EmitAlignment(unsigned NumBits, const GlobalValue *GV = nullptr) const {}
-
void printVecModifiedImmediate(const MachineOperand &MO, const char *Modifier,
raw_ostream &O);
void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
const char *Modifier = nullptr);
- void printImplicitDef(const MachineInstr *MI, raw_ostream &O) const;
void printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O,
bool = false);
- void printParamName(int paramIndex, raw_ostream &O);
void printParamName(Function::const_arg_iterator I, int paramIndex,
raw_ostream &O);
void emitGlobals(const Module &M);
void emitHeader(Module &M, raw_ostream &O, const NVPTXSubtarget &STI);
void emitKernelFunctionDirectives(const Function &F, raw_ostream &O) const;
void emitVirtualRegister(unsigned int vr, raw_ostream &);
- void emitFunctionExternParamList(const MachineFunction &MF);
void emitFunctionParamList(const Function *, raw_ostream &O);
void emitFunctionParamList(const MachineFunction &MF, raw_ostream &O);
void setAndEmitFunctionVirtualRegisters(const MachineFunction &MF);
- void emitFunctionTempData(const MachineFunction &MF, unsigned &FrameSize);
- bool isImageType(const Type *Ty);
void printReturnValStr(const Function *, raw_ostream &O);
void printReturnValStr(const MachineFunction &MF, raw_ostream &O);
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
@@ -271,7 +264,7 @@ private:
// Build the map between type name and ID based on module's type
// symbol table.
- std::map<const Type *, std::string> TypeNameMap;
+ std::map<Type *, std::string> TypeNameMap;
// List of variables demoted to a function scope.
std::map<const Function *, std::vector<const GlobalVariable *> > localDecls;
@@ -282,19 +275,15 @@ private:
void emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O);
void emitPTXAddressSpace(unsigned int AddressSpace, raw_ostream &O) const;
- std::string getPTXFundamentalTypeStr(const Type *Ty, bool = true) const;
+ std::string getPTXFundamentalTypeStr(Type *Ty, bool = true) const;
void printScalarConstant(const Constant *CPV, raw_ostream &O);
void printFPConstant(const ConstantFP *Fp, raw_ostream &O);
void bufferLEByte(const Constant *CPV, int Bytes, AggBuffer *aggBuffer);
void bufferAggregateConstant(const Constant *CV, AggBuffer *aggBuffer);
- void printOperandProper(const MachineOperand &MO);
-
void emitLinkageDirective(const GlobalValue *V, raw_ostream &O);
void emitDeclarations(const Module &, raw_ostream &O);
void emitDeclaration(const Function *, raw_ostream &O);
-
- static const char *getRegisterName(unsigned RegNo);
void emitDemotedVars(const Function *, raw_ostream &);
bool lowerImageHandleOperand(const MachineInstr *MI, unsigned OpNo,
diff --git a/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp b/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
index 69a229e32f43..95813c8430d1 100644
--- a/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
+++ b/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
@@ -98,7 +98,7 @@ private:
/// This reordering exposes to optimizeMemoryInstruction more
/// optimization opportunities on loads and stores.
///
- /// If this function succesfully hoists an eliminable addrspacecast or V is
+ /// If this function successfully hoists an eliminable addrspacecast or V is
/// already such an addrspacecast, it returns the transformed value (which is
/// guaranteed to be an addrspacecast); otherwise, it returns nullptr.
Value *hoistAddrSpaceCastFrom(Value *V, int Depth = 0);
@@ -267,14 +267,14 @@ bool NVPTXFavorNonGenericAddrSpaces::runOnFunction(Function &F) {
return false;
bool Changed = false;
- for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) {
- for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ++I) {
+ for (BasicBlock &B : F) {
+ for (Instruction &I : B) {
if (isa<LoadInst>(I)) {
// V = load P
- Changed |= optimizeMemoryInstruction(I, 0);
+ Changed |= optimizeMemoryInstruction(&I, 0);
} else if (isa<StoreInst>(I)) {
// store V, P
- Changed |= optimizeMemoryInstruction(I, 1);
+ Changed |= optimizeMemoryInstruction(&I, 1);
}
}
}
diff --git a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
index 6fd09c405260..62ca5e9f9f62 100644
--- a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
+++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -81,7 +81,7 @@ bool GenericToNVVM::runOnModule(Module &M) {
for (Module::global_iterator I = M.global_begin(), E = M.global_end();
I != E;) {
- GlobalVariable *GV = I++;
+ GlobalVariable *GV = &*I++;
if (GV->getType()->getAddressSpace() == llvm::ADDRESS_SPACE_GENERIC &&
!llvm::isTexture(*GV) && !llvm::isSurface(*GV) &&
!llvm::isSampler(*GV) && !GV->getName().startswith("llvm.")) {
@@ -117,7 +117,7 @@ bool GenericToNVVM::runOnModule(Module &M) {
Value *Operand = II->getOperand(i);
if (isa<Constant>(Operand)) {
II->setOperand(
- i, remapConstant(&M, I, cast<Constant>(Operand), Builder));
+ i, remapConstant(&M, &*I, cast<Constant>(Operand), Builder));
}
}
}
@@ -132,10 +132,8 @@ bool GenericToNVVM::runOnModule(Module &M) {
// Walk through the metadata section and update the debug information
// associated with the global variables in the default address space.
- for (Module::named_metadata_iterator I = M.named_metadata_begin(),
- E = M.named_metadata_end();
- I != E; I++) {
- remapNamedMDNode(VM, I);
+ for (NamedMDNode &I : M.named_metadata()) {
+ remapNamedMDNode(VM, &I);
}
// Walk through the global variable initializers, and replace any use of
@@ -318,9 +316,8 @@ Value *GenericToNVVM::remapConstantExpr(Module *M, Function *F, ConstantExpr *C,
NewOperands[0], NewOperands[1]);
case Instruction::FCmp:
// CompareConstantExpr (fcmp)
- assert(false && "Address space conversion should have no effect "
- "on float point CompareConstantExpr (fcmp)!");
- return C;
+ llvm_unreachable("Address space conversion should have no effect "
+ "on float point CompareConstantExpr (fcmp)!");
case Instruction::ExtractElement:
// ExtractElementConstantExpr
return Builder.CreateExtractElement(NewOperands[0], NewOperands[1]);
@@ -364,8 +361,7 @@ Value *GenericToNVVM::remapConstantExpr(Module *M, Function *F, ConstantExpr *C,
return Builder.CreateCast(Instruction::CastOps(C->getOpcode()),
NewOperands[0], C->getType());
}
- assert(false && "GenericToNVVM encountered an unsupported ConstantExpr");
- return C;
+ llvm_unreachable("GenericToNVVM encountered an unsupported ConstantExpr");
}
}
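Both hunks above swap assert(false)-plus-return for llvm_unreachable. The difference shows in release builds: the assert compiles away and the fallback return silently executes, whereas llvm_unreachable keeps the diagnostic in asserts builds and marks the path as dead code otherwise. A minimal sketch (classify is a hypothetical function):

    #include "llvm/Support/ErrorHandling.h"

    int classify(int Kind) {
      switch (Kind) {
      case 0: return 10;
      case 1: return 20;
      }
      llvm_unreachable("unexpected Kind"); // no dummy fallback return needed
    }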
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 232a611d1760..2d0098b392f4 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -12,6 +12,8 @@
//===----------------------------------------------------------------------===//
#include "NVPTXISelDAGToDAG.h"
+#include "NVPTXUtilities.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/CommandLine.h"
@@ -530,7 +532,7 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
if (!Src)
return NVPTX::PTXLdStInstCode::GENERIC;
- if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) {
+ if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
switch (PT->getAddressSpace()) {
case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
@@ -544,6 +546,39 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
return NVPTX::PTXLdStInstCode::GENERIC;
}
+static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
+ unsigned CodeAddrSpace, MachineFunction *F) {
+ // To use non-coherent caching, the load has to be from global
+ // memory and we have to prove that the memory area is not written
+ // to anywhere for the duration of the kernel call, not even after
+ // the load.
+ //
+ // To ensure that there are no writes to the memory, we require the
+ // underlying pointer to be a noalias (__restrict) kernel parameter
+ // that is never used for a write. We can only do this for kernel
+ // functions since from within a device function, we cannot know if
+ // there were or will be writes to the memory from the caller - or we
+ // could, but then we would have to do inter-procedural analysis.
+ if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL ||
+ !isKernelFunction(*F->getFunction())) {
+ return false;
+ }
+
+ // We use GetUnderlyingObjects() here instead of
+ // GetUnderlyingObject() mainly because the former looks through phi
+ // nodes while the latter does not. We need to look through phi
+ // nodes to handle pointer induction variables.
+ SmallVector<Value *, 8> Objs;
+ GetUnderlyingObjects(const_cast<Value *>(N->getMemOperand()->getValue()),
+ Objs, F->getDataLayout());
+ for (Value *Obj : Objs) {
+ auto *A = dyn_cast<const Argument>(Obj);
+ if (!A || !A->onlyReadsMemory() || !A->hasNoAliasAttr()) return false;
+ }
+
+ return true;
+}
+
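A kernel that satisfies these conditions, sketched in CUDA C++ (scale is a hypothetical kernel; the assumption is that __restrict__ becomes the noalias attribute and that a pointer parameter only loaded from ends up satisfying onlyReadsMemory()). The grid-stride loop is included because it is exactly the pointer-induction pattern that makes the PHI look-through of GetUnderlyingObjects necessary:

    __global__ void scale(float *__restrict__ out, const float *__restrict__ in,
                          float k, int n) {
      // Loads through 'in' are eligible for ld.global.nc: it is a noalias
      // kernel parameter in global memory that is never written through.
      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
           i += blockDim.x * gridDim.x)
        out[i] = k * in[i];
    }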
SDNode *NVPTXDAGToDAGISel::SelectIntrinsicNoChain(SDNode *N) {
unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
switch (IID) {
@@ -638,6 +673,10 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
// Address Space Setting
unsigned int codeAddrSpace = getCodeAddrSpace(LD);
+ if (canLowerToLDG(LD, *Subtarget, codeAddrSpace, MF)) {
+ return SelectLDGLDU(N);
+ }
+
// Volatile Setting
  // - .volatile is only available for .global and .shared
bool isVolatile = LD->isVolatile();
@@ -872,6 +911,10 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
// Address Space Setting
unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
+ if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
+ return SelectLDGLDU(N);
+ }
+
// Volatile Setting
  // - .volatile is only available for .global and .shared
bool IsVolatile = MemSD->isVolatile();
@@ -1425,6 +1468,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
switch (N->getOpcode()) {
default:
return nullptr;
+ case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN:
if (IsLDG) {
switch (EltVT.getSimpleVT().SimpleTy) {
@@ -1474,6 +1518,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
}
}
break;
+ case NVPTXISD::LoadV2:
case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
@@ -1522,6 +1567,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
break;
}
break;
+ case NVPTXISD::LoadV4:
case NVPTXISD::LDGV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
@@ -1563,6 +1609,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
switch (N->getOpcode()) {
default:
return nullptr;
+ case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN:
if (IsLDG) {
switch (EltVT.getSimpleVT().SimpleTy) {
@@ -1612,6 +1659,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
}
}
break;
+ case NVPTXISD::LoadV2:
case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
@@ -1660,6 +1708,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
break;
}
break;
+ case NVPTXISD::LoadV4:
case NVPTXISD::LDGV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
@@ -1707,6 +1756,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
switch (N->getOpcode()) {
default:
return nullptr;
+ case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN:
if (IsLDG) {
switch (EltVT.getSimpleVT().SimpleTy) {
@@ -1756,6 +1806,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
}
}
break;
+ case NVPTXISD::LoadV2:
case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
@@ -1804,6 +1855,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
break;
}
break;
+ case NVPTXISD::LoadV4:
case NVPTXISD::LDGV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
@@ -1845,6 +1897,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
switch (N->getOpcode()) {
default:
return nullptr;
+ case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN:
if (IsLDG) {
switch (EltVT.getSimpleVT().SimpleTy) {
@@ -1894,6 +1947,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
}
}
break;
+ case NVPTXISD::LoadV2:
case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
@@ -1942,6 +1996,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
break;
}
break;
+ case NVPTXISD::LoadV4:
case NVPTXISD::LDGV4:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
@@ -5039,7 +5094,7 @@ bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
}
if (!Src)
return false;
- if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ if (auto *PT = dyn_cast<PointerType>(Src->getType()))
return (PT->getAddressSpace() == spN);
return false;
}
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index b75cf4040312..766369631e14 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -124,6 +124,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// condition branches.
setJumpIsExpensive(true);
+ // Wide divides are _very_ slow. Try to reduce the width of the divide if
+ // possible.
+ addBypassSlowDiv(64, 32);
+
// By default, use the Source scheduling
if (sched4reg)
setSchedulingPreference(Sched::RegPressure);
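The addBypassSlowDiv(64, 32) call above asks CodeGenPrepare to guard 64-bit divisions with a runtime width check. A rough C++ rendition of the transformed code (udiv64_bypassed is a hypothetical function, not the actual emitted IR):

    #include <cstdint>

    uint64_t udiv64_bypassed(uint64_t a, uint64_t b) {
      if (((a | b) >> 32) == 0)             // both operands fit in 32 bits
        return (uint32_t)a / (uint32_t)b;   // cheap 32-bit divide
      return a / b;                         // slow full-width divide
    }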
@@ -275,6 +279,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::SELECT);
// Now deduce the information based on the above mentioned
// actions
@@ -910,7 +915,7 @@ std::string NVPTXTargetLowering::getPrototype(
O << "(";
if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) {
unsigned size = 0;
- if (const IntegerType *ITy = dyn_cast<IntegerType>(retTy)) {
+ if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
size = ITy->getBitWidth();
if (size < 32)
size = 32;
@@ -981,7 +986,7 @@ std::string NVPTXTargetLowering::getPrototype(
O << "_";
continue;
}
- const PointerType *PTy = dyn_cast<PointerType>(Ty);
+ auto *PTy = dyn_cast<PointerType>(Ty);
assert(PTy && "Param with byval attribute should be a pointer type");
Type *ETy = PTy->getElementType();
@@ -1318,7 +1323,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// struct or vector
SmallVector<EVT, 16> vtparts;
SmallVector<uint64_t, 16> Offsets;
- const PointerType *PTy = dyn_cast<PointerType>(Args[i].Ty);
+ auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
assert(PTy && "Type of a byval parameter should be pointer");
ComputePTXValueVTs(*this, DAG.getDataLayout(), PTy->getElementType(),
vtparts, &Offsets, 0);
@@ -2007,15 +2012,6 @@ SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
return Result;
}
-SDValue NVPTXTargetLowering::getExtSymb(SelectionDAG &DAG, const char *inname,
- int idx, EVT v) const {
- std::string *name = nvTM->getManagedStrPool()->getManagedString(inname);
- std::stringstream suffix;
- suffix << idx;
- *name += suffix.str();
- return DAG.getTargetExternalSymbol(name->c_str(), v);
-}
-
SDValue
NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
std::string ParamSym;
@@ -2029,10 +2025,6 @@ NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
}
-SDValue NVPTXTargetLowering::getParamHelpSymbol(SelectionDAG &DAG, int idx) {
- return getExtSymb(DAG, ".HLPPARAM", idx);
-}
-
// Check to see if the kernel argument is image*_t or sampler_t
bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) {
@@ -2040,8 +2032,8 @@ bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) {
"struct._image3d_t",
"struct._sampler_t" };
- const Type *Ty = arg->getType();
- const PointerType *PTy = dyn_cast<PointerType>(Ty);
+ Type *Ty = arg->getType();
+ auto *PTy = dyn_cast<PointerType>(Ty);
if (!PTy)
return false;
@@ -2049,14 +2041,11 @@ bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) {
if (!context)
return false;
- const StructType *STy = dyn_cast<StructType>(PTy->getElementType());
+ auto *STy = dyn_cast<StructType>(PTy->getElementType());
const std::string TypeName = STy && !STy->isLiteral() ? STy->getName() : "";
- for (int i = 0, e = array_lengthof(specialTypes); i != e; ++i)
- if (TypeName == specialTypes[i])
- return true;
-
- return false;
+ return std::find(std::begin(specialTypes), std::end(specialTypes),
+ TypeName) != std::end(specialTypes);
}
SDValue NVPTXTargetLowering::LowerFormalArguments(
@@ -2082,10 +2071,9 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
std::vector<Type *> argTypes;
std::vector<const Argument *> theArgs;
- for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
- I != E; ++I) {
- theArgs.push_back(I);
- argTypes.push_back(I->getType());
+ for (const Argument &I : F->args()) {
+ theArgs.push_back(&I);
+ argTypes.push_back(I.getType());
}
// argTypes.size() (or theArgs.size()) and Ins.size() need not match.
// Ins.size() will be larger
@@ -2545,20 +2533,6 @@ void NVPTXTargetLowering::LowerAsmOperandForConstraint(
TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
-// NVPTX suuport vector of legal types of any length in Intrinsics because the
-// NVPTX specific type legalizer
-// will legalize them to the PTX supported length.
-bool NVPTXTargetLowering::isTypeSupportedInIntrinsic(MVT VT) const {
- if (isTypeLegal(VT))
- return true;
- if (VT.isVector()) {
- MVT eVT = VT.getVectorElementType();
- if (isTypeLegal(eVT))
- return true;
- }
- return false;
-}
-
static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
switch (Intrinsic) {
default:
@@ -3747,9 +3721,7 @@ bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
// - [immAddr]
if (AM.BaseGV) {
- if (AM.BaseOffs || AM.HasBaseReg || AM.Scale)
- return false;
- return true;
+ return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
}
switch (AM.Scale) {
@@ -3820,11 +3792,6 @@ NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const {
- return 4;
-}
-
//===----------------------------------------------------------------------===//
// NVPTX DAG Combining
//===----------------------------------------------------------------------===//
@@ -4057,6 +4024,67 @@ static SDValue PerformANDCombine(SDNode *N,
return SDValue();
}
+static SDValue PerformSELECTCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // Currently this detects patterns for integer min and max and
+ // lowers them to PTX-specific intrinsics that enable hardware
+ // support.
+
+ const SDValue Cond = N->getOperand(0);
+ if (Cond.getOpcode() != ISD::SETCC) return SDValue();
+
+ const SDValue LHS = Cond.getOperand(0);
+ const SDValue RHS = Cond.getOperand(1);
+ const SDValue True = N->getOperand(1);
+ const SDValue False = N->getOperand(2);
+ if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
+ return SDValue();
+
+ const EVT VT = N->getValueType(0);
+ if (VT != MVT::i32 && VT != MVT::i64) return SDValue();
+
+ const ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ SDValue Larger; // The larger of LHS and RHS when condition is true.
+ switch (CC) {
+ case ISD::SETULT:
+ case ISD::SETULE:
+ case ISD::SETLT:
+ case ISD::SETLE:
+ Larger = RHS;
+ break;
+
+ case ISD::SETGT:
+ case ISD::SETGE:
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ Larger = LHS;
+ break;
+
+ default:
+ return SDValue();
+ }
+ const bool IsMax = (Larger == True);
+ const bool IsSigned = ISD::isSignedIntSetCC(CC);
+
+ unsigned IntrinsicId;
+ if (VT == MVT::i32) {
+ if (IsSigned)
+ IntrinsicId = IsMax ? Intrinsic::nvvm_max_i : Intrinsic::nvvm_min_i;
+ else
+ IntrinsicId = IsMax ? Intrinsic::nvvm_max_ui : Intrinsic::nvvm_min_ui;
+ } else {
+ assert(VT == MVT::i64);
+ if (IsSigned)
+ IntrinsicId = IsMax ? Intrinsic::nvvm_max_ll : Intrinsic::nvvm_min_ll;
+ else
+ IntrinsicId = IsMax ? Intrinsic::nvvm_max_ull : Intrinsic::nvvm_min_ull;
+ }
+
+ SDLoc DL(N);
+ return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DCI.DAG.getConstant(IntrinsicId, DL, VT), LHS, RHS);
+}
+
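At the source level these are the usual conditional min/max idioms: when the values being compared are also the values being selected, the select collapses into the matching nvvm min/max intrinsic (smax, umin and smin64 below are hypothetical functions shown only to illustrate the mapping):

    int smax(int a, int b) { return a > b ? a : b; }                     // nvvm_max_i
    unsigned umin(unsigned a, unsigned b) { return a < b ? a : b; }      // nvvm_min_ui
    long long smin64(long long a, long long b) { return a < b ? a : b; } // nvvm_min_ll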
enum OperandSignedness {
Signed = 0,
Unsigned,
@@ -4113,25 +4141,16 @@ static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
APInt Val = CI->getAPIntValue();
if (LHSSign == Unsigned) {
- if (Val.isIntN(OptSize)) {
- return true;
- }
- return false;
+ return Val.isIntN(OptSize);
} else {
- if (Val.isSignedIntN(OptSize)) {
- return true;
- }
- return false;
+ return Val.isSignedIntN(OptSize);
}
} else {
OperandSignedness RHSSign;
if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
return false;
- if (LHSSign != RHSSign)
- return false;
-
- return true;
+ return LHSSign == RHSSign;
}
}
@@ -4247,6 +4266,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return PerformSHLCombine(N, DCI, OptLevel);
case ISD::AND:
return PerformANDCombine(N, DCI);
+ case ISD::SELECT:
+ return PerformSELECTCombine(N, DCI);
}
return SDValue();
}
@@ -4509,25 +4530,25 @@ void NVPTXTargetLowering::ReplaceNodeResults(
void NVPTXSection::anchor() {}
NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {
- delete TextSection;
- delete DataSection;
- delete BSSSection;
- delete ReadOnlySection;
-
- delete StaticCtorSection;
- delete StaticDtorSection;
- delete LSDASection;
- delete EHFrameSection;
- delete DwarfAbbrevSection;
- delete DwarfInfoSection;
- delete DwarfLineSection;
- delete DwarfFrameSection;
- delete DwarfPubTypesSection;
- delete DwarfDebugInlineSection;
- delete DwarfStrSection;
- delete DwarfLocSection;
- delete DwarfARangesSection;
- delete DwarfRangesSection;
+ delete static_cast<NVPTXSection *>(TextSection);
+ delete static_cast<NVPTXSection *>(DataSection);
+ delete static_cast<NVPTXSection *>(BSSSection);
+ delete static_cast<NVPTXSection *>(ReadOnlySection);
+
+ delete static_cast<NVPTXSection *>(StaticCtorSection);
+ delete static_cast<NVPTXSection *>(StaticDtorSection);
+ delete static_cast<NVPTXSection *>(LSDASection);
+ delete static_cast<NVPTXSection *>(EHFrameSection);
+ delete static_cast<NVPTXSection *>(DwarfAbbrevSection);
+ delete static_cast<NVPTXSection *>(DwarfInfoSection);
+ delete static_cast<NVPTXSection *>(DwarfLineSection);
+ delete static_cast<NVPTXSection *>(DwarfFrameSection);
+ delete static_cast<NVPTXSection *>(DwarfPubTypesSection);
+ delete static_cast<const NVPTXSection *>(DwarfDebugInlineSection);
+ delete static_cast<NVPTXSection *>(DwarfStrSection);
+ delete static_cast<NVPTXSection *>(DwarfLocSection);
+ delete static_cast<NVPTXSection *>(DwarfARangesSection);
+ delete static_cast<NVPTXSection *>(DwarfRangesSection);
}
MCSection *
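The casts in the destructor above exist so the sections are deleted through their concrete type. The sketch below shows the shape of the problem, under the assumption that the base class (MCSection here) keeps its destructor non-public, so deleting through a base pointer would not compile (Base, Derived and destroy are stand-ins, not LLVM types):

    struct Base {               // stand-in for MCSection
    protected:
      ~Base() = default;        // not destroyable through a Base*
    };
    struct Derived : Base {};   // stand-in for NVPTXSection

    void destroy(Base *S) {
      delete static_cast<Derived *>(S); // fine; plain 'delete S;' is ill-formed
    }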
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h
index e5c37321a33b..60914c1d09b4 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -441,13 +441,9 @@ public:
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerGlobalAddress(const GlobalValue *GV, int64_t Offset,
- SelectionDAG &DAG) const;
const char *getTargetNodeName(unsigned Opcode) const override;
- bool isTypeSupportedInIntrinsic(MVT VT) const;
-
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
unsigned Intrinsic) const override;
@@ -459,8 +455,13 @@ public:
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS) const override;
- /// getFunctionAlignment - Return the Log2 alignment of this function.
- unsigned getFunctionAlignment(const Function *F) const;
+ bool isTruncateFree(Type *SrcTy, Type *DstTy) const override {
+ // Truncating 64-bit to 32-bit is free in SASS.
+ if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
+ return false;
+ return SrcTy->getPrimitiveSizeInBits() == 64 &&
+ DstTy->getPrimitiveSizeInBits() == 32;
+ }
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
EVT VT) const override {
@@ -515,11 +516,7 @@ public:
private:
const NVPTXSubtarget &STI; // cache the subtarget here
-
- SDValue getExtSymb(SelectionDAG &DAG, const char *name, int idx,
- EVT = MVT::i32) const;
SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
- SDValue getParamHelpSymbol(SelectionDAG &DAG, int idx);
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index 76d6597c6e20..9f3cf4551955 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -37,30 +37,31 @@ void NVPTXInstrInfo::copyPhysReg(
const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg);
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
- if (DestRC != SrcRC)
- report_fatal_error("Attempted to created cross-class register copy");
-
- if (DestRC == &NVPTX::Int32RegsRegClass)
- BuildMI(MBB, I, DL, get(NVPTX::IMOV32rr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- else if (DestRC == &NVPTX::Int1RegsRegClass)
- BuildMI(MBB, I, DL, get(NVPTX::IMOV1rr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- else if (DestRC == &NVPTX::Float32RegsRegClass)
- BuildMI(MBB, I, DL, get(NVPTX::FMOV32rr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- else if (DestRC == &NVPTX::Int16RegsRegClass)
- BuildMI(MBB, I, DL, get(NVPTX::IMOV16rr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- else if (DestRC == &NVPTX::Int64RegsRegClass)
- BuildMI(MBB, I, DL, get(NVPTX::IMOV64rr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- else if (DestRC == &NVPTX::Float64RegsRegClass)
- BuildMI(MBB, I, DL, get(NVPTX::FMOV64rr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- else {
+ if (DestRC->getSize() != SrcRC->getSize())
+ report_fatal_error("Copy one register into another with a different width");
+
+ unsigned Op;
+ if (DestRC == &NVPTX::Int1RegsRegClass) {
+ Op = NVPTX::IMOV1rr;
+ } else if (DestRC == &NVPTX::Int16RegsRegClass) {
+ Op = NVPTX::IMOV16rr;
+ } else if (DestRC == &NVPTX::Int32RegsRegClass) {
+ Op = (SrcRC == &NVPTX::Int32RegsRegClass ? NVPTX::IMOV32rr
+ : NVPTX::BITCONVERT_32_F2I);
+ } else if (DestRC == &NVPTX::Int64RegsRegClass) {
+ Op = (SrcRC == &NVPTX::Int64RegsRegClass ? NVPTX::IMOV64rr
+ : NVPTX::BITCONVERT_64_F2I);
+ } else if (DestRC == &NVPTX::Float32RegsRegClass) {
+ Op = (SrcRC == &NVPTX::Float32RegsRegClass ? NVPTX::FMOV32rr
+ : NVPTX::BITCONVERT_32_I2F);
+ } else if (DestRC == &NVPTX::Float64RegsRegClass) {
+ Op = (SrcRC == &NVPTX::Float64RegsRegClass ? NVPTX::FMOV64rr
+ : NVPTX::BITCONVERT_64_I2F);
+ } else {
llvm_unreachable("Bad register copy");
}
+ BuildMI(MBB, I, DL, get(Op), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
}
bool NVPTXInstrInfo::isMoveInstr(const MachineInstr &MI, unsigned &SrcReg,
@@ -86,27 +87,6 @@ bool NVPTXInstrInfo::isMoveInstr(const MachineInstr &MI, unsigned &SrcReg,
return false;
}
-bool NVPTXInstrInfo::isReadSpecialReg(MachineInstr &MI) const {
- switch (MI.getOpcode()) {
- default:
- return false;
- case NVPTX::INT_PTX_SREG_NTID_X:
- case NVPTX::INT_PTX_SREG_NTID_Y:
- case NVPTX::INT_PTX_SREG_NTID_Z:
- case NVPTX::INT_PTX_SREG_TID_X:
- case NVPTX::INT_PTX_SREG_TID_Y:
- case NVPTX::INT_PTX_SREG_TID_Z:
- case NVPTX::INT_PTX_SREG_CTAID_X:
- case NVPTX::INT_PTX_SREG_CTAID_Y:
- case NVPTX::INT_PTX_SREG_CTAID_Z:
- case NVPTX::INT_PTX_SREG_NCTAID_X:
- case NVPTX::INT_PTX_SREG_NCTAID_Y:
- case NVPTX::INT_PTX_SREG_NCTAID_Z:
- case NVPTX::INT_PTX_SREG_WARPSIZE:
- return true;
- }
-}
-
bool NVPTXInstrInfo::isLoadInstr(const MachineInstr &MI,
unsigned &AddrSpace) const {
bool isLoad = false;
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.h b/lib/Target/NVPTX/NVPTXInstrInfo.h
index 179c06887198..3e407223f010 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.h
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -56,7 +56,6 @@ public:
unsigned &DestReg) const;
bool isLoadInstr(const MachineInstr &MI, unsigned &AddrSpace) const;
bool isStoreInstr(const MachineInstr &MI, unsigned &AddrSpace) const;
- bool isReadSpecialReg(MachineInstr &MI) const;
virtual bool CanTailMerge(const MachineInstr *MI) const;
// Branch analysis.
diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
index 0bf72febc4a0..f770c2acaab5 100644
--- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -6,6 +6,8 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
+//
+// \file
// Lower aggregate copies, memset, memcpy, memmove intrinsics into loops when
// the size is large or is not a compile-time constant.
//
@@ -18,19 +20,20 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#define DEBUG_TYPE "nvptx"
using namespace llvm;
namespace {
+
// actual analysis class, which is a functionpass
struct NVPTXLowerAggrCopies : public FunctionPass {
static char ID;
@@ -50,179 +53,299 @@ struct NVPTXLowerAggrCopies : public FunctionPass {
return "Lower aggregate copies/intrinsics into loops";
}
};
-} // namespace
char NVPTXLowerAggrCopies::ID = 0;
-// Lower MemTransferInst or load-store pair to loop
-static void convertTransferToLoop(
- Instruction *splitAt, Value *srcAddr, Value *dstAddr, Value *len,
- bool srcVolatile, bool dstVolatile, LLVMContext &Context, Function &F) {
- Type *indType = len->getType();
+// Lower memcpy to loop.
+void convertMemCpyToLoop(Instruction *ConvertedInst, Value *SrcAddr,
+ Value *DstAddr, Value *CopyLen, bool SrcIsVolatile,
+ bool DstIsVolatile, LLVMContext &Context,
+ Function &F) {
+ Type *TypeOfCopyLen = CopyLen->getType();
- BasicBlock *origBB = splitAt->getParent();
- BasicBlock *newBB = splitAt->getParent()->splitBasicBlock(splitAt, "split");
- BasicBlock *loopBB = BasicBlock::Create(Context, "loadstoreloop", &F, newBB);
+ BasicBlock *OrigBB = ConvertedInst->getParent();
+ BasicBlock *NewBB =
+ ConvertedInst->getParent()->splitBasicBlock(ConvertedInst, "split");
+ BasicBlock *LoopBB = BasicBlock::Create(Context, "loadstoreloop", &F, NewBB);
- origBB->getTerminator()->setSuccessor(0, loopBB);
- IRBuilder<> builder(origBB, origBB->getTerminator());
+ OrigBB->getTerminator()->setSuccessor(0, LoopBB);
+ IRBuilder<> Builder(OrigBB->getTerminator());
- // srcAddr and dstAddr are expected to be pointer types,
+ // SrcAddr and DstAddr are expected to be pointer types,
// so no check is made here.
- unsigned srcAS = cast<PointerType>(srcAddr->getType())->getAddressSpace();
- unsigned dstAS = cast<PointerType>(dstAddr->getType())->getAddressSpace();
+ unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+ unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
// Cast pointers to (char *)
- srcAddr = builder.CreateBitCast(srcAddr, Type::getInt8PtrTy(Context, srcAS));
- dstAddr = builder.CreateBitCast(dstAddr, Type::getInt8PtrTy(Context, dstAS));
+ SrcAddr = Builder.CreateBitCast(SrcAddr, Builder.getInt8PtrTy(SrcAS));
+ DstAddr = Builder.CreateBitCast(DstAddr, Builder.getInt8PtrTy(DstAS));
- IRBuilder<> loop(loopBB);
- // The loop index (ind) is a phi node.
- PHINode *ind = loop.CreatePHI(indType, 0);
- // Incoming value for ind is 0
- ind->addIncoming(ConstantInt::get(indType, 0), origBB);
+ IRBuilder<> LoopBuilder(LoopBB);
+ PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+ LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
- // load from srcAddr+ind
+ // load from SrcAddr+LoopIndex
// TODO: we can leverage the align parameter of llvm.memcpy for more efficient
// word-sized loads and stores.
- Value *val = loop.CreateLoad(loop.CreateGEP(loop.getInt8Ty(), srcAddr, ind),
- srcVolatile);
- // store at dstAddr+ind
- loop.CreateStore(val, loop.CreateGEP(loop.getInt8Ty(), dstAddr, ind),
- dstVolatile);
-
- // The value for ind coming from backedge is (ind + 1)
- Value *newind = loop.CreateAdd(ind, ConstantInt::get(indType, 1));
- ind->addIncoming(newind, loopBB);
-
- loop.CreateCondBr(loop.CreateICmpULT(newind, len), loopBB, newBB);
+ Value *Element =
+ LoopBuilder.CreateLoad(LoopBuilder.CreateInBoundsGEP(
+ LoopBuilder.getInt8Ty(), SrcAddr, LoopIndex),
+ SrcIsVolatile);
+ // store at DstAddr+LoopIndex
+ LoopBuilder.CreateStore(Element,
+ LoopBuilder.CreateInBoundsGEP(LoopBuilder.getInt8Ty(),
+ DstAddr, LoopIndex),
+ DstIsVolatile);
+
+ // The value for LoopIndex coming from backedge is (LoopIndex + 1)
+ Value *NewIndex =
+ LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
+ LoopIndex->addIncoming(NewIndex, LoopBB);
+
+ LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
+ NewBB);
}
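Written as ordinary C++, the control flow built by convertMemCpyToLoop is a byte-wise do/while copy; the entry block branches straight into the loop, so a non-zero length is assumed (memcpyAsLoop is a hypothetical rendition, not the generated IR):

    #include <cstddef>

    void memcpyAsLoop(unsigned char *Dst, const unsigned char *Src, size_t N) {
      size_t I = 0;
      do {                 // "loadstoreloop" block
        Dst[I] = Src[I];   // i8 load + i8 store through inbounds GEPs
        ++I;
      } while (I < N);     // branch back to the loop or on to the "split" block
    }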
-// Lower MemSetInst to loop
-static void convertMemSetToLoop(Instruction *splitAt, Value *dstAddr,
- Value *len, Value *val, LLVMContext &Context,
- Function &F) {
- BasicBlock *origBB = splitAt->getParent();
- BasicBlock *newBB = splitAt->getParent()->splitBasicBlock(splitAt, "split");
- BasicBlock *loopBB = BasicBlock::Create(Context, "loadstoreloop", &F, newBB);
+// Lower memmove to IR. memmove is required to correctly copy overlapping memory
+// regions; therefore, it has to check the relative positions of the source and
+// destination pointers and choose the copy direction accordingly.
+//
+// The code below is an IR rendition of this C function:
+//
+// void* memmove(void* dst, const void* src, size_t n) {
+// unsigned char* d = dst;
+// const unsigned char* s = src;
+// if (s < d) {
+// // copy backwards
+// while (n--) {
+// d[n] = s[n];
+// }
+// } else {
+// // copy forward
+// for (size_t i = 0; i < n; ++i) {
+// d[i] = s[i];
+// }
+// }
+// return dst;
+// }
+void convertMemMoveToLoop(Instruction *ConvertedInst, Value *SrcAddr,
+ Value *DstAddr, Value *CopyLen, bool SrcIsVolatile,
+ bool DstIsVolatile, LLVMContext &Context,
+ Function &F) {
+ Type *TypeOfCopyLen = CopyLen->getType();
+ BasicBlock *OrigBB = ConvertedInst->getParent();
+
+  // Create a comparison of src and dst, based on which we jump to either
+ // the forward-copy part of the function (if src >= dst) or the backwards-copy
+ // part (if src < dst).
+ // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
+ // structure. Its block terminators (unconditional branches) are replaced by
+ // the appropriate conditional branches when the loop is built.
+ ICmpInst *PtrCompare = new ICmpInst(ConvertedInst, ICmpInst::ICMP_ULT,
+ SrcAddr, DstAddr, "compare_src_dst");
+ TerminatorInst *ThenTerm, *ElseTerm;
+ SplitBlockAndInsertIfThenElse(PtrCompare, ConvertedInst, &ThenTerm,
+ &ElseTerm);
+
+ // Each part of the function consists of two blocks:
+ // copy_backwards: used to skip the loop when n == 0
+ // copy_backwards_loop: the actual backwards loop BB
+ // copy_forward: used to skip the loop when n == 0
+ // copy_forward_loop: the actual forward loop BB
+ BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
+ CopyBackwardsBB->setName("copy_backwards");
+ BasicBlock *CopyForwardBB = ElseTerm->getParent();
+ CopyForwardBB->setName("copy_forward");
+ BasicBlock *ExitBB = ConvertedInst->getParent();
+ ExitBB->setName("memmove_done");
+
+ // Initial comparison of n == 0 that lets us skip the loops altogether. Shared
+ // between both backwards and forward copy clauses.
+ ICmpInst *CompareN =
+ new ICmpInst(OrigBB->getTerminator(), ICmpInst::ICMP_EQ, CopyLen,
+ ConstantInt::get(TypeOfCopyLen, 0), "compare_n_to_0");
+
+ // Copying backwards.
+ BasicBlock *LoopBB =
+ BasicBlock::Create(Context, "copy_backwards_loop", &F, CopyForwardBB);
+ IRBuilder<> LoopBuilder(LoopBB);
+ PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+ Value *IndexPtr = LoopBuilder.CreateSub(
+ LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr");
+ Value *Element = LoopBuilder.CreateLoad(
+ LoopBuilder.CreateInBoundsGEP(SrcAddr, IndexPtr), "element");
+ LoopBuilder.CreateStore(Element,
+ LoopBuilder.CreateInBoundsGEP(DstAddr, IndexPtr));
+ LoopBuilder.CreateCondBr(
+ LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)),
+ ExitBB, LoopBB);
+ LoopPhi->addIncoming(IndexPtr, LoopBB);
+ LoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
+ BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm);
+ ThenTerm->eraseFromParent();
+
+ // Copying forward.
+ BasicBlock *FwdLoopBB =
+ BasicBlock::Create(Context, "copy_forward_loop", &F, ExitBB);
+ IRBuilder<> FwdLoopBuilder(FwdLoopBB);
+ PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr");
+ Value *FwdElement = FwdLoopBuilder.CreateLoad(
+ FwdLoopBuilder.CreateInBoundsGEP(SrcAddr, FwdCopyPhi), "element");
+ FwdLoopBuilder.CreateStore(
+ FwdElement, FwdLoopBuilder.CreateInBoundsGEP(DstAddr, FwdCopyPhi));
+ Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd(
+ FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment");
+ FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen),
+ ExitBB, FwdLoopBB);
+ FwdCopyPhi->addIncoming(FwdIndexPtr, FwdLoopBB);
+ FwdCopyPhi->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), CopyForwardBB);
+
+ BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm);
+ ElseTerm->eraseFromParent();
+}
- origBB->getTerminator()->setSuccessor(0, loopBB);
- IRBuilder<> builder(origBB, origBB->getTerminator());
+// Lower memset to loop.
+void convertMemSetToLoop(Instruction *ConvertedInst, Value *DstAddr,
+ Value *CopyLen, Value *SetValue, LLVMContext &Context,
+ Function &F) {
+ BasicBlock *OrigBB = ConvertedInst->getParent();
+ BasicBlock *NewBB =
+ ConvertedInst->getParent()->splitBasicBlock(ConvertedInst, "split");
+ BasicBlock *LoopBB = BasicBlock::Create(Context, "loadstoreloop", &F, NewBB);
- unsigned dstAS = cast<PointerType>(dstAddr->getType())->getAddressSpace();
+ OrigBB->getTerminator()->setSuccessor(0, LoopBB);
+ IRBuilder<> Builder(OrigBB->getTerminator());
// Cast pointer to the type of value getting stored
- dstAddr =
- builder.CreateBitCast(dstAddr, PointerType::get(val->getType(), dstAS));
+ unsigned dstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+ DstAddr = Builder.CreateBitCast(DstAddr,
+ PointerType::get(SetValue->getType(), dstAS));
- IRBuilder<> loop(loopBB);
- PHINode *ind = loop.CreatePHI(len->getType(), 0);
- ind->addIncoming(ConstantInt::get(len->getType(), 0), origBB);
+ IRBuilder<> LoopBuilder(LoopBB);
+ PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLen->getType(), 0);
+ LoopIndex->addIncoming(ConstantInt::get(CopyLen->getType(), 0), OrigBB);
- loop.CreateStore(val, loop.CreateGEP(val->getType(), dstAddr, ind), false);
+ LoopBuilder.CreateStore(
+ SetValue,
+ LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex),
+ false);
- Value *newind = loop.CreateAdd(ind, ConstantInt::get(len->getType(), 1));
- ind->addIncoming(newind, loopBB);
+ Value *NewIndex =
+ LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLen->getType(), 1));
+ LoopIndex->addIncoming(NewIndex, LoopBB);
- loop.CreateCondBr(loop.CreateICmpULT(newind, len), loopBB, newBB);
+ LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
+ NewBB);
}
bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
- SmallVector<LoadInst *, 4> aggrLoads;
- SmallVector<MemTransferInst *, 4> aggrMemcpys;
- SmallVector<MemSetInst *, 4> aggrMemsets;
+ SmallVector<LoadInst *, 4> AggrLoads;
+ SmallVector<MemIntrinsic *, 4> MemCalls;
const DataLayout &DL = F.getParent()->getDataLayout();
LLVMContext &Context = F.getParent()->getContext();
- //
- // Collect all the aggrLoads, aggrMemcpys and addrMemsets.
- //
+ // Collect all aggregate loads and mem* calls.
for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;
++II) {
- if (LoadInst *load = dyn_cast<LoadInst>(II)) {
- if (!load->hasOneUse())
+ if (LoadInst *LI = dyn_cast<LoadInst>(II)) {
+ if (!LI->hasOneUse())
continue;
- if (DL.getTypeStoreSize(load->getType()) < MaxAggrCopySize)
+ if (DL.getTypeStoreSize(LI->getType()) < MaxAggrCopySize)
continue;
- User *use = load->user_back();
- if (StoreInst *store = dyn_cast<StoreInst>(use)) {
- if (store->getOperand(0) != load)
+ if (StoreInst *SI = dyn_cast<StoreInst>(LI->user_back())) {
+ if (SI->getOperand(0) != LI)
continue;
- aggrLoads.push_back(load);
- }
- } else if (MemTransferInst *intr = dyn_cast<MemTransferInst>(II)) {
- Value *len = intr->getLength();
- // If the number of elements being copied is greater
- // than MaxAggrCopySize, lower it to a loop
- if (ConstantInt *len_int = dyn_cast<ConstantInt>(len)) {
- if (len_int->getZExtValue() >= MaxAggrCopySize) {
- aggrMemcpys.push_back(intr);
- }
- } else {
- // turn variable length memcpy/memmov into loop
- aggrMemcpys.push_back(intr);
+ AggrLoads.push_back(LI);
}
- } else if (MemSetInst *memsetintr = dyn_cast<MemSetInst>(II)) {
- Value *len = memsetintr->getLength();
- if (ConstantInt *len_int = dyn_cast<ConstantInt>(len)) {
- if (len_int->getZExtValue() >= MaxAggrCopySize) {
- aggrMemsets.push_back(memsetintr);
+ } else if (MemIntrinsic *IntrCall = dyn_cast<MemIntrinsic>(II)) {
+ // Convert intrinsic calls with variable size or with constant size
+ // larger than the MaxAggrCopySize threshold.
+ if (ConstantInt *LenCI = dyn_cast<ConstantInt>(IntrCall->getLength())) {
+ if (LenCI->getZExtValue() >= MaxAggrCopySize) {
+ MemCalls.push_back(IntrCall);
}
} else {
- // turn variable length memset into loop
- aggrMemsets.push_back(memsetintr);
+ MemCalls.push_back(IntrCall);
}
}
}
}
- if ((aggrLoads.size() == 0) && (aggrMemcpys.size() == 0) &&
- (aggrMemsets.size() == 0))
+
+ if (AggrLoads.size() == 0 && MemCalls.size() == 0) {
return false;
+ }
//
// Do the transformation of an aggr load/copy/set to a loop
//
- for (LoadInst *load : aggrLoads) {
- StoreInst *store = dyn_cast<StoreInst>(*load->user_begin());
- Value *srcAddr = load->getOperand(0);
- Value *dstAddr = store->getOperand(1);
- unsigned numLoads = DL.getTypeStoreSize(load->getType());
- Value *len = ConstantInt::get(Type::getInt32Ty(Context), numLoads);
-
- convertTransferToLoop(store, srcAddr, dstAddr, len, load->isVolatile(),
- store->isVolatile(), Context, F);
-
- store->eraseFromParent();
- load->eraseFromParent();
+ for (LoadInst *LI : AggrLoads) {
+ StoreInst *SI = dyn_cast<StoreInst>(*LI->user_begin());
+ Value *SrcAddr = LI->getOperand(0);
+ Value *DstAddr = SI->getOperand(1);
+ unsigned NumLoads = DL.getTypeStoreSize(LI->getType());
+ Value *CopyLen = ConstantInt::get(Type::getInt32Ty(Context), NumLoads);
+
+ convertMemCpyToLoop(/* ConvertedInst */ SI,
+ /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr,
+ /* CopyLen */ CopyLen,
+ /* SrcIsVolatile */ LI->isVolatile(),
+ /* DstIsVolatile */ SI->isVolatile(),
+ /* Context */ Context,
+ /* Function F */ F);
+
+ SI->eraseFromParent();
+ LI->eraseFromParent();
}
- for (MemTransferInst *cpy : aggrMemcpys) {
- convertTransferToLoop(/* splitAt */ cpy,
- /* srcAddr */ cpy->getSource(),
- /* dstAddr */ cpy->getDest(),
- /* len */ cpy->getLength(),
- /* srcVolatile */ cpy->isVolatile(),
- /* dstVolatile */ cpy->isVolatile(),
+ // Transform mem* intrinsic calls.
+ for (MemIntrinsic *MemCall : MemCalls) {
+ if (MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(MemCall)) {
+ convertMemCpyToLoop(/* ConvertedInst */ Memcpy,
+ /* SrcAddr */ Memcpy->getRawSource(),
+ /* DstAddr */ Memcpy->getRawDest(),
+ /* CopyLen */ Memcpy->getLength(),
+ /* SrcIsVolatile */ Memcpy->isVolatile(),
+ /* DstIsVolatile */ Memcpy->isVolatile(),
/* Context */ Context,
/* Function F */ F);
- cpy->eraseFromParent();
- }
-
- for (MemSetInst *memsetinst : aggrMemsets) {
- Value *len = memsetinst->getLength();
- Value *val = memsetinst->getValue();
- convertMemSetToLoop(memsetinst, memsetinst->getDest(), len, val, Context,
- F);
- memsetinst->eraseFromParent();
+ } else if (MemMoveInst *Memmove = dyn_cast<MemMoveInst>(MemCall)) {
+ convertMemMoveToLoop(/* ConvertedInst */ Memmove,
+ /* SrcAddr */ Memmove->getRawSource(),
+ /* DstAddr */ Memmove->getRawDest(),
+ /* CopyLen */ Memmove->getLength(),
+ /* SrcIsVolatile */ Memmove->isVolatile(),
+ /* DstIsVolatile */ Memmove->isVolatile(),
+ /* Context */ Context,
+ /* Function F */ F);
+
+ } else if (MemSetInst *Memset = dyn_cast<MemSetInst>(MemCall)) {
+ convertMemSetToLoop(/* ConvertedInst */ Memset,
+ /* DstAddr */ Memset->getRawDest(),
+ /* CopyLen */ Memset->getLength(),
+ /* SetValue */ Memset->getValue(),
+ /* Context */ Context,
+ /* Function F */ F);
+ }
+ MemCall->eraseFromParent();
}
return true;
}
+} // namespace
+
+namespace llvm {
+void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
+}
+
+INITIALIZE_PASS(NVPTXLowerAggrCopies, "nvptx-lower-aggr-copies",
+ "Lower aggregate copies, and llvm.mem* intrinsics into loops",
+ false, false)
+
FunctionPass *llvm::createLowerAggrCopies() {
return new NVPTXLowerAggrCopies();
}
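The kind of source construct the rewritten pass targets, sketched in C++ (Big and copyBig are hypothetical; the exact cutoff is the MaxAggrCopySize option): a large aggregate copied by value reaches the pass either as a single oversized load/store pair or as an llvm.memcpy/memmove/memset intrinsic, and both forms are now lowered to explicit loops.

    struct Big { int data[1024]; };   // comfortably above MaxAggrCopySize

    void copyBig(Big *Dst, const Big *Src) {
      *Dst = *Src;                    // load/store pair or llvm.memcpy, depending
    }                                 // on how the front end emits the copy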
diff --git a/lib/Target/NVPTX/NVPTXLowerAlloca.cpp b/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
index 93d0025d8f53..624052e9b981 100644
--- a/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
@@ -81,7 +81,7 @@ bool NVPTXLowerAlloca::runOnBasicBlock(BasicBlock &BB) {
// Check Load, Store, GEP, and BitCast Uses on alloca and make them
// use the converted generic address, in order to expose non-generic
// addrspacecast to NVPTXFavorNonGenericAddrSpace. For other types
- // of instructions this is unecessary and may introduce redudant
+ // of instructions this is unnecessary and may introduce redundant
// address cast.
const auto &AllocaUse = *UI++;
auto LI = dyn_cast<LoadInst>(AllocaUse.getUser());
diff --git a/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp b/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
index b533f316d8a9..6656077348a1 100644
--- a/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
@@ -47,6 +47,36 @@
// ...
// }
//
+// 3. Convert pointers in a byval kernel parameter to pointers in the global
+// address space. As in #2, it allows NVPTX to emit more ld/st.global. E.g.,
+//
+// struct S {
+// int *x;
+// int *y;
+// };
+// __global__ void foo(S s) {
+// int *b = s.y;
+// // use b
+// }
+//
+// "b" points to the global address space. In the IR level,
+//
+// define void @foo({i32*, i32*}* byval %input) {
+// %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1
+// %b = load i32*, i32** %b_ptr
+// ; use %b
+// }
+//
+// becomes
+//
+// define void @foo({i32*, i32*}* byval %input) {
+// %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1
+// %b = load i32*, i32** %b_ptr
+// %b_global = addrspacecast i32* %b to i32 addrspace(1)*
+// %b_generic = addrspacecast i32 addrspace(1)* %b_global to i32*
+// ; use %b_generic
+// }
+//
// TODO: merge this pass with NVPTXFavorNonGenericAddrSpace so that other passes
// don't cancel the addrspacecast pair this pass emits.
//===----------------------------------------------------------------------===//
@@ -54,6 +84,7 @@
#include "NVPTX.h"
#include "NVPTXUtilities.h"
#include "NVPTXTargetMachine.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
@@ -71,9 +102,12 @@ class NVPTXLowerKernelArgs : public FunctionPass {
bool runOnFunction(Function &F) override;
// handle byval parameters
- void handleByValParam(Argument *);
- // handle non-byval pointer parameters
- void handlePointerParam(Argument *);
+ void handleByValParam(Argument *Arg);
+ // Knowing Ptr must point to the global address space, this function
+ // addrspacecasts Ptr to global and then back to generic. This allows
+ // NVPTXFavorNonGenericAddrSpace to fold the global-to-generic cast into
+ // loads/stores that appear later.
+ void markPointerAsGlobal(Value *Ptr);
public:
static char ID; // Pass identification, replacement for typeid
@@ -104,7 +138,7 @@ INITIALIZE_PASS(NVPTXLowerKernelArgs, "nvptx-lower-kernel-args",
//
// The above code allocates some space in the stack and copies the incoming
// struct from param space to local space.
-// Then replace all occurences of %d by %temp.
+// Then replace all occurrences of %d by %temp.
// =============================================================================
void NVPTXLowerKernelArgs::handleByValParam(Argument *Arg) {
Function *Func = Arg->getParent();
@@ -128,27 +162,33 @@ void NVPTXLowerKernelArgs::handleByValParam(Argument *Arg) {
new StoreInst(LI, AllocA, FirstInst);
}
-void NVPTXLowerKernelArgs::handlePointerParam(Argument *Arg) {
- assert(!Arg->hasByValAttr() &&
- "byval params should be handled by handleByValParam");
-
- // Do nothing if the argument already points to the global address space.
- if (Arg->getType()->getPointerAddressSpace() == ADDRESS_SPACE_GLOBAL)
+void NVPTXLowerKernelArgs::markPointerAsGlobal(Value *Ptr) {
+ if (Ptr->getType()->getPointerAddressSpace() == ADDRESS_SPACE_GLOBAL)
return;
- Instruction *FirstInst = Arg->getParent()->getEntryBlock().begin();
- Instruction *ArgInGlobal = new AddrSpaceCastInst(
- Arg, PointerType::get(Arg->getType()->getPointerElementType(),
+ // Decide where to emit the addrspacecast pair.
+ BasicBlock::iterator InsertPt;
+ if (Argument *Arg = dyn_cast<Argument>(Ptr)) {
+ // Insert at the function entry if Ptr is an argument.
+ InsertPt = Arg->getParent()->getEntryBlock().begin();
+ } else {
+ // Insert right after Ptr if Ptr is an instruction.
+ InsertPt = ++cast<Instruction>(Ptr)->getIterator();
+ assert(InsertPt != InsertPt->getParent()->end() &&
+ "We don't call this function with Ptr being a terminator.");
+ }
+
+ Instruction *PtrInGlobal = new AddrSpaceCastInst(
+ Ptr, PointerType::get(Ptr->getType()->getPointerElementType(),
ADDRESS_SPACE_GLOBAL),
- Arg->getName(), FirstInst);
- Value *ArgInGeneric = new AddrSpaceCastInst(ArgInGlobal, Arg->getType(),
- Arg->getName(), FirstInst);
- // Replace with ArgInGeneric all uses of Args except ArgInGlobal.
- Arg->replaceAllUsesWith(ArgInGeneric);
- ArgInGlobal->setOperand(0, Arg);
+ Ptr->getName(), &*InsertPt);
+ Value *PtrInGeneric = new AddrSpaceCastInst(PtrInGlobal, Ptr->getType(),
+ Ptr->getName(), &*InsertPt);
+ // Replace with PtrInGeneric all uses of Ptr except PtrInGlobal.
+ Ptr->replaceAllUsesWith(PtrInGeneric);
+ PtrInGlobal->setOperand(0, Ptr);
}
-
// =============================================================================
// Main function for this pass.
// =============================================================================
@@ -157,12 +197,32 @@ bool NVPTXLowerKernelArgs::runOnFunction(Function &F) {
if (!isKernelFunction(F))
return false;
+ if (TM && TM->getDrvInterface() == NVPTX::CUDA) {
+ // Mark pointers in byval structs as global.
+ for (auto &B : F) {
+ for (auto &I : B) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+ if (LI->getType()->isPointerTy()) {
+ Value *UO = GetUnderlyingObject(LI->getPointerOperand(),
+ F.getParent()->getDataLayout());
+ if (Argument *Arg = dyn_cast<Argument>(UO)) {
+ if (Arg->hasByValAttr()) {
+ // LI is a load from a pointer within a byval kernel parameter.
+ markPointerAsGlobal(LI);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
for (Argument &Arg : F.args()) {
if (Arg.getType()->isPointerTy()) {
if (Arg.hasByValAttr())
handleByValParam(&Arg);
else if (TM && TM->getDrvInterface() == NVPTX::CUDA)
- handlePointerParam(&Arg);
+ markPointerAsGlobal(&Arg);
}
}
return true;
diff --git a/lib/Target/NVPTX/NVPTXMCExpr.h b/lib/Target/NVPTX/NVPTXMCExpr.h
index 46b4b33e7e40..81a606d7535c 100644
--- a/lib/Target/NVPTX/NVPTXMCExpr.h
+++ b/lib/Target/NVPTX/NVPTXMCExpr.h
@@ -68,7 +68,7 @@ public:
return false;
}
void visitUsedExpr(MCStreamer &Streamer) const override {};
- MCSection *findAssociatedSection() const override { return nullptr; }
+ MCFragment *findAssociatedFragment() const override { return nullptr; }
// There are no TLS NVPTXMCExprs at the moment.
void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
@@ -110,7 +110,7 @@ public:
return false;
}
void visitUsedExpr(MCStreamer &Streamer) const override {};
- MCSection *findAssociatedSection() const override { return nullptr; }
+ MCFragment *findAssociatedFragment() const override { return nullptr; }
// There are no TLS NVPTXMCExprs at the moment.
void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
index 5fd69a6815a8..17019d7b364d 100644
--- a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
+++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -72,7 +72,7 @@ bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) {
for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
// If last instruction is a return instruction, add an epilogue
- if (!I->empty() && I->back().isReturn())
+ if (I->isReturnBlock())
TFI.emitEpilogue(MF, *I);
}
diff --git a/lib/Target/NVPTX/NVPTXSection.h b/lib/Target/NVPTX/NVPTXSection.h
index 0d2627d62ebd..45a7309479ee 100644
--- a/lib/Target/NVPTX/NVPTXSection.h
+++ b/lib/Target/NVPTX/NVPTXSection.h
@@ -19,15 +19,14 @@
#include <vector>
namespace llvm {
-/// NVPTXSection - Represents a section in PTX
-/// PTX does not have sections. We create this class in order to use
-/// the ASMPrint interface.
+/// Represents a section in PTX. PTX does not have sections; we create this class
+/// in order to use the ASMPrint interface.
///
-class NVPTXSection : public MCSection {
+class NVPTXSection final : public MCSection {
virtual void anchor();
public:
NVPTXSection(SectionVariant V, SectionKind K) : MCSection(V, K, nullptr) {}
- virtual ~NVPTXSection() {}
+ ~NVPTXSection() {}
/// Override this as NVPTX has its own way of printing switching
/// to a section.
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 248f9e117d83..aa931b134da9 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -53,6 +53,7 @@ void initializeGenericToNVVMPass(PassRegistry&);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
+void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
void initializeNVPTXLowerKernelArgsPass(PassRegistry &);
void initializeNVPTXLowerAllocaPass(PassRegistry &);
}
@@ -64,14 +65,15 @@ extern "C" void LLVMInitializeNVPTXTarget() {
// FIXME: This pass is really intended to be invoked during IR optimization,
// but it's very NVPTX-specific.
- initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
- initializeGenericToNVVMPass(*PassRegistry::getPassRegistry());
- initializeNVPTXAllocaHoistingPass(*PassRegistry::getPassRegistry());
- initializeNVPTXAssignValidGlobalNamesPass(*PassRegistry::getPassRegistry());
- initializeNVPTXFavorNonGenericAddrSpacesPass(
- *PassRegistry::getPassRegistry());
- initializeNVPTXLowerKernelArgsPass(*PassRegistry::getPassRegistry());
- initializeNVPTXLowerAllocaPass(*PassRegistry::getPassRegistry());
+ PassRegistry &PR = *PassRegistry::getPassRegistry();
+ initializeNVVMReflectPass(PR);
+ initializeGenericToNVVMPass(PR);
+ initializeNVPTXAllocaHoistingPass(PR);
+ initializeNVPTXAssignValidGlobalNamesPass(PR);
+ initializeNVPTXFavorNonGenericAddrSpacesPass(PR);
+ initializeNVPTXLowerKernelArgsPass(PR);
+ initializeNVPTXLowerAllocaPass(PR);
+ initializeNVPTXLowerAggrCopiesPass(PR);
}
static std::string computeDataLayout(bool is64Bit) {
@@ -139,6 +141,10 @@ public:
FunctionPass *createTargetRegisterAllocator(bool) override;
void addFastRegAlloc(FunctionPass *RegAllocPass) override;
void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
+
+private:
+ // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE.
+ void addEarlyCSEOrGVNPass();
};
} // end anonymous namespace
@@ -148,11 +154,18 @@ TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
}
TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](Function &F) {
+ return TargetIRAnalysis([this](const Function &F) {
return TargetTransformInfo(NVPTXTTIImpl(this, F));
});
}
+void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
+ if (getOptLevel() == CodeGenOpt::Aggressive)
+ addPass(createGVNPass());
+ else
+ addPass(createEarlyCSEPass());
+}
+
void NVPTXPassConfig::addIRPasses() {
// The following passes are known to not play well with virtual regs hanging
// around after register allocation (which in our case, is *all* registers).
@@ -161,13 +174,14 @@ void NVPTXPassConfig::addIRPasses() {
// NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
disablePass(&PrologEpilogCodeInserterID);
disablePass(&MachineCopyPropagationID);
- disablePass(&BranchFolderPassID);
disablePass(&TailDuplicateID);
+ addPass(createNVVMReflectPass());
addPass(createNVPTXImageOptimizerPass());
- TargetPassConfig::addIRPasses();
addPass(createNVPTXAssignValidGlobalNamesPass());
addPass(createGenericToNVVMPass());
+
+ // === Propagate special address spaces ===
addPass(createNVPTXLowerKernelArgsPass(&getNVPTXTargetMachine()));
// NVPTXLowerKernelArgs emits alloca for byval parameters which can often
// be eliminated by SROA.
@@ -178,22 +192,38 @@ void NVPTXPassConfig::addIRPasses() {
// them unused. We could remove dead code in an ad-hoc manner, but that
// requires manual work and might be error-prone.
addPass(createDeadCodeEliminationPass());
+
+ // === Straight-line scalar optimizations ===
addPass(createSeparateConstOffsetFromGEPPass());
+ addPass(createSpeculativeExecutionPass());
// ReassociateGEPs exposes more opportunities for SLSR. See
// the example in reassociate-geps-and-slsr.ll.
addPass(createStraightLineStrengthReducePass());
// SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
// EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
// for some of our benchmarks.
- if (getOptLevel() == CodeGenOpt::Aggressive)
- addPass(createGVNPass());
- else
- addPass(createEarlyCSEPass());
+ addEarlyCSEOrGVNPass();
// Run NaryReassociate after EarlyCSE/GVN to be more effective.
addPass(createNaryReassociatePass());
// NaryReassociate on GEPs creates redundant common expressions, so run
// EarlyCSE after it.
addPass(createEarlyCSEPass());
+
+ // === LSR and other generic IR passes ===
+ TargetPassConfig::addIRPasses();
+ // EarlyCSE is not always strong enough to clean up what LSR produces. For
+ // example, GVN can combine
+ //
+ // %0 = add %a, %b
+ // %1 = add %b, %a
+ //
+ // and
+ //
+ // %0 = shl nsw %a, 2
+ // %1 = shl %a, 2
+ //
+ // but EarlyCSE can do neither of them.
+ addEarlyCSEOrGVNPass();
}
bool NVPTXPassConfig::addInstSelector() {
diff --git a/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
index 5ecdc8748830..0f88ddfaa934 100644
--- a/lib/Target/NVPTX/NVPTXTargetObjectFile.h
+++ b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
@@ -48,8 +48,7 @@ public:
void Initialize(MCContext &ctx, const TargetMachine &TM) override {
TargetLoweringObjectFile::Initialize(ctx, TM);
TextSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getText());
- DataSection =
- new NVPTXSection(MCSection::SV_ELF, SectionKind::getDataRel());
+ DataSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getData());
BSSSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getBSS());
ReadOnlySection =
new NVPTXSection(MCSection::SV_ELF, SectionKind::getReadOnly());
@@ -84,7 +83,7 @@ public:
new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
}
- MCSection *getSectionForConstant(SectionKind Kind,
+ MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
const Constant *C) const override {
return ReadOnlySection;
}
diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index e7250cdba5ac..6e679dd0257c 100644
--- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -89,12 +89,12 @@ bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) {
return false;
}
-unsigned NVPTXTTIImpl::getArithmeticInstrCost(
+int NVPTXTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo) {
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 5bcd1e27a558..0946a3293eec 100644
--- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -52,7 +52,7 @@ public:
bool isSourceOfDivergence(const Value *V);
- unsigned getArithmeticInstrCost(
+ int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
diff --git a/lib/Target/NVPTX/NVPTXUtilities.cpp b/lib/Target/NVPTX/NVPTXUtilities.cpp
index 1f178af41670..578b466568ae 100644
--- a/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -335,106 +335,7 @@ bool llvm::getAlign(const CallInst &I, unsigned index, unsigned &align) {
return false;
}
-bool llvm::isBarrierIntrinsic(Intrinsic::ID id) {
- if ((id == Intrinsic::nvvm_barrier0) ||
- (id == Intrinsic::nvvm_barrier0_popc) ||
- (id == Intrinsic::nvvm_barrier0_and) ||
- (id == Intrinsic::nvvm_barrier0_or) ||
- (id == Intrinsic::cuda_syncthreads))
- return true;
- return false;
-}
-
-// Interface for checking all memory space transfer related intrinsics
-bool llvm::isMemorySpaceTransferIntrinsic(Intrinsic::ID id) {
- if (id == Intrinsic::nvvm_ptr_local_to_gen ||
- id == Intrinsic::nvvm_ptr_shared_to_gen ||
- id == Intrinsic::nvvm_ptr_global_to_gen ||
- id == Intrinsic::nvvm_ptr_constant_to_gen ||
- id == Intrinsic::nvvm_ptr_gen_to_global ||
- id == Intrinsic::nvvm_ptr_gen_to_shared ||
- id == Intrinsic::nvvm_ptr_gen_to_local ||
- id == Intrinsic::nvvm_ptr_gen_to_constant ||
- id == Intrinsic::nvvm_ptr_gen_to_param) {
- return true;
- }
-
- return false;
-}
-
-// consider several special intrinsics in striping pointer casts, and
-// provide an option to ignore GEP indicies for find out the base address only
-// which could be used in simple alias disambigurate.
-const Value *
-llvm::skipPointerTransfer(const Value *V, bool ignore_GEP_indices) {
- V = V->stripPointerCasts();
- while (true) {
- if (const IntrinsicInst *IS = dyn_cast<IntrinsicInst>(V)) {
- if (isMemorySpaceTransferIntrinsic(IS->getIntrinsicID())) {
- V = IS->getArgOperand(0)->stripPointerCasts();
- continue;
- }
- } else if (ignore_GEP_indices)
- if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
- V = GEP->getPointerOperand()->stripPointerCasts();
- continue;
- }
- break;
- }
- return V;
-}
-
-// consider several special intrinsics in striping pointer casts, and
-// - ignore GEP indicies for find out the base address only, and
-// - tracking PHINode
-// which could be used in simple alias disambigurate.
-const Value *
-llvm::skipPointerTransfer(const Value *V, std::set<const Value *> &processed) {
- if (processed.find(V) != processed.end())
- return nullptr;
- processed.insert(V);
-
- const Value *V2 = V->stripPointerCasts();
- if (V2 != V && processed.find(V2) != processed.end())
- return nullptr;
- processed.insert(V2);
-
- V = V2;
-
- while (true) {
- if (const IntrinsicInst *IS = dyn_cast<IntrinsicInst>(V)) {
- if (isMemorySpaceTransferIntrinsic(IS->getIntrinsicID())) {
- V = IS->getArgOperand(0)->stripPointerCasts();
- continue;
- }
- } else if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
- V = GEP->getPointerOperand()->stripPointerCasts();
- continue;
- } else if (const PHINode *PN = dyn_cast<PHINode>(V)) {
- if (V != V2 && processed.find(V) != processed.end())
- return nullptr;
- processed.insert(PN);
- const Value *common = nullptr;
- for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
- const Value *pv = PN->getIncomingValue(i);
- const Value *base = skipPointerTransfer(pv, processed);
- if (base) {
- if (!common)
- common = base;
- else if (common != base)
- return PN;
- }
- }
- if (!common)
- return PN;
- V = common;
- }
- break;
- }
- return V;
-}
-
-// The following are some useful utilities for debuggung
+// The following are some useful utilities for debugging
BasicBlock *llvm::getParentBlock(Value *v) {
if (BasicBlock *B = dyn_cast<BasicBlock>(v))
@@ -466,7 +367,7 @@ void llvm::dumpBlock(Value *v, char *blockName) {
return;
for (Function::iterator it = F->begin(), ie = F->end(); it != ie; ++it) {
- BasicBlock *B = it;
+ BasicBlock *B = &*it;
if (strcmp(B->getName().data(), blockName) == 0) {
B->dump();
return;
@@ -490,7 +391,7 @@ Instruction *llvm::getInst(Value *base, char *instName) {
return nullptr;
}
-// Dump an instruction by nane
+// Dump an instruction by name
void llvm::dumpInst(Value *base, char *instName) {
Instruction *I = getInst(base, instName);
if (I)
diff --git a/lib/Target/NVPTX/NVPTXUtilities.h b/lib/Target/NVPTX/NVPTXUtilities.h
index 7e2ce73daaa3..a5262cb7412f 100644
--- a/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/lib/Target/NVPTX/NVPTXUtilities.h
@@ -61,27 +61,6 @@ bool isKernelFunction(const llvm::Function &);
bool getAlign(const llvm::Function &, unsigned index, unsigned &);
bool getAlign(const llvm::CallInst &, unsigned index, unsigned &);
-bool isBarrierIntrinsic(llvm::Intrinsic::ID);
-
-/// make_vector - Helper function which is useful for building temporary vectors
-/// to pass into type construction of CallInst ctors. This turns a null
-/// terminated list of pointers (or other value types) into a real live vector.
-///
-template <typename T> inline std::vector<T> make_vector(T A, ...) {
- va_list Args;
- va_start(Args, A);
- std::vector<T> Result;
- Result.push_back(A);
- while (T Val = va_arg(Args, T))
- Result.push_back(Val);
- va_end(Args);
- return Result;
-}
-
-bool isMemorySpaceTransferIntrinsic(Intrinsic::ID id);
-const Value *skipPointerTransfer(const Value *V, bool ignore_GEP_indices);
-const Value *
-skipPointerTransfer(const Value *V, std::set<const Value *> &processed);
BasicBlock *getParentBlock(Value *v);
Function *getParentFunction(Value *v);
void dumpBlock(Value *v, char *blockName);
diff --git a/lib/Target/NVPTX/NVPTXVector.td b/lib/Target/NVPTX/NVPTXVector.td
index a237247e4833..e69bbba9f193 100644
--- a/lib/Target/NVPTX/NVPTXVector.td
+++ b/lib/Target/NVPTX/NVPTXVector.td
@@ -26,7 +26,7 @@ let isAsCheapAsAMove=1, VecInstType=isVecExtract.Value in {
def V2i16Extract : NVPTXVecInst<(outs Int16Regs:$dst),
(ins V2I16Regs:$src, i8imm:$c),
"mov.u16 \t$dst, $src${c:vecelem};",
- [(set Int16Regs:$dst, (vector_extract
+ [(set Int16Regs:$dst, (extractelt
(v2i16 V2I16Regs:$src), imm:$c))],
IMOV16rr>;
@@ -34,7 +34,7 @@ def V2i16Extract : NVPTXVecInst<(outs Int16Regs:$dst),
def V4i16Extract : NVPTXVecInst<(outs Int16Regs:$dst),
(ins V4I16Regs:$src, i8imm:$c),
"mov.u16 \t$dst, $src${c:vecelem};",
- [(set Int16Regs:$dst, (vector_extract
+ [(set Int16Regs:$dst, (extractelt
(v4i16 V4I16Regs:$src), imm:$c))],
IMOV16rr>;
@@ -42,7 +42,7 @@ def V4i16Extract : NVPTXVecInst<(outs Int16Regs:$dst),
def V2i8Extract : NVPTXVecInst<(outs Int8Regs:$dst),
(ins V2I8Regs:$src, i8imm:$c),
"mov.u16 \t$dst, $src${c:vecelem};",
- [(set Int8Regs:$dst, (vector_extract
+ [(set Int8Regs:$dst, (extractelt
(v2i8 V2I8Regs:$src), imm:$c))],
IMOV8rr>;
@@ -50,7 +50,7 @@ def V2i8Extract : NVPTXVecInst<(outs Int8Regs:$dst),
def V4i8Extract : NVPTXVecInst<(outs Int8Regs:$dst),
(ins V4I8Regs:$src, i8imm:$c),
"mov.u16 \t$dst, $src${c:vecelem};",
- [(set Int8Regs:$dst, (vector_extract
+ [(set Int8Regs:$dst, (extractelt
(v4i8 V4I8Regs:$src), imm:$c))],
IMOV8rr>;
@@ -58,7 +58,7 @@ def V4i8Extract : NVPTXVecInst<(outs Int8Regs:$dst),
def V2i32Extract : NVPTXVecInst<(outs Int32Regs:$dst),
(ins V2I32Regs:$src, i8imm:$c),
"mov.u32 \t$dst, $src${c:vecelem};",
- [(set Int32Regs:$dst, (vector_extract
+ [(set Int32Regs:$dst, (extractelt
(v2i32 V2I32Regs:$src), imm:$c))],
IMOV32rr>;
@@ -66,7 +66,7 @@ def V2i32Extract : NVPTXVecInst<(outs Int32Regs:$dst),
def V2f32Extract : NVPTXVecInst<(outs Float32Regs:$dst),
(ins V2F32Regs:$src, i8imm:$c),
"mov.f32 \t$dst, $src${c:vecelem};",
- [(set Float32Regs:$dst, (vector_extract
+ [(set Float32Regs:$dst, (extractelt
(v2f32 V2F32Regs:$src), imm:$c))],
FMOV32rr>;
@@ -74,7 +74,7 @@ def V2f32Extract : NVPTXVecInst<(outs Float32Regs:$dst),
def V2i64Extract : NVPTXVecInst<(outs Int64Regs:$dst),
(ins V2I64Regs:$src, i8imm:$c),
"mov.u64 \t$dst, $src${c:vecelem};",
- [(set Int64Regs:$dst, (vector_extract
+ [(set Int64Regs:$dst, (extractelt
(v2i64 V2I64Regs:$src), imm:$c))],
IMOV64rr>;
@@ -82,7 +82,7 @@ def V2i64Extract : NVPTXVecInst<(outs Int64Regs:$dst),
def V2f64Extract : NVPTXVecInst<(outs Float64Regs:$dst),
(ins V2F64Regs:$src, i8imm:$c),
"mov.f64 \t$dst, $src${c:vecelem};",
- [(set Float64Regs:$dst, (vector_extract
+ [(set Float64Regs:$dst, (extractelt
(v2f64 V2F64Regs:$src), imm:$c))],
FMOV64rr>;
@@ -90,7 +90,7 @@ def V2f64Extract : NVPTXVecInst<(outs Float64Regs:$dst),
def V4i32Extract : NVPTXVecInst<(outs Int32Regs:$dst),
(ins V4I32Regs:$src, i8imm:$c),
"mov.u32 \t$dst, $src${c:vecelem};",
- [(set Int32Regs:$dst, (vector_extract
+ [(set Int32Regs:$dst, (extractelt
(v4i32 V4I32Regs:$src), imm:$c))],
IMOV32rr>;
@@ -98,7 +98,7 @@ def V4i32Extract : NVPTXVecInst<(outs Int32Regs:$dst),
def V4f32Extract : NVPTXVecInst<(outs Float32Regs:$dst),
(ins V4F32Regs:$src, i8imm:$c),
"mov.f32 \t$dst, $src${c:vecelem};",
- [(set Float32Regs:$dst, (vector_extract
+ [(set Float32Regs:$dst, (extractelt
(v4f32 V4F32Regs:$src), imm:$c))],
FMOV32rr>;
}
@@ -110,8 +110,7 @@ def V2i8Insert : NVPTXVecInst<(outs V2I8Regs:$dst),
"mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};"
"\n\tmov.u16 \t$dst${c:vecelem}, $val;",
[(set V2I8Regs:$dst,
- (vector_insert V2I8Regs:$src, Int8Regs:$val, imm:$c))],
- IMOV8rr>;
+ (insertelt V2I8Regs:$src, Int8Regs:$val, imm:$c))], IMOV8rr>;
// Insert v4i8
def V4i8Insert : NVPTXVecInst<(outs V4I8Regs:$dst),
@@ -119,8 +118,7 @@ def V4i8Insert : NVPTXVecInst<(outs V4I8Regs:$dst),
"mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};"
"\n\tmov.u16 \t$dst${c:vecelem}, $val;",
[(set V4I8Regs:$dst,
- (vector_insert V4I8Regs:$src, Int8Regs:$val, imm:$c))],
- IMOV8rr>;
+ (insertelt V4I8Regs:$src, Int8Regs:$val, imm:$c))], IMOV8rr>;
// Insert v2i16
def V2i16Insert : NVPTXVecInst<(outs V2I16Regs:$dst),
@@ -128,8 +126,8 @@ def V2i16Insert : NVPTXVecInst<(outs V2I16Regs:$dst),
"mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};"
"\n\tmov.u16 \t$dst${c:vecelem}, $val;",
[(set V2I16Regs:$dst,
- (vector_insert V2I16Regs:$src, Int16Regs:$val, imm:$c))],
- IMOV16rr>;
+ (insertelt V2I16Regs:$src, Int16Regs:$val, imm:$c))],
+ IMOV16rr>;
// Insert v4i16
def V4i16Insert : NVPTXVecInst<(outs V4I16Regs:$dst),
@@ -137,8 +135,8 @@ def V4i16Insert : NVPTXVecInst<(outs V4I16Regs:$dst),
"mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};"
"\n\tmov.u16 \t$dst${c:vecelem}, $val;",
[(set V4I16Regs:$dst,
- (vector_insert V4I16Regs:$src, Int16Regs:$val, imm:$c))],
- IMOV16rr>;
+ (insertelt V4I16Regs:$src, Int16Regs:$val, imm:$c))],
+ IMOV16rr>;
// Insert v2i32
def V2i32Insert : NVPTXVecInst<(outs V2I32Regs:$dst),
@@ -146,8 +144,8 @@ def V2i32Insert : NVPTXVecInst<(outs V2I32Regs:$dst),
"mov.v2.u32 \t${dst:vecfull}, ${src:vecfull};"
"\n\tmov.u32 \t$dst${c:vecelem}, $val;",
[(set V2I32Regs:$dst,
- (vector_insert V2I32Regs:$src, Int32Regs:$val, imm:$c))],
- IMOV32rr>;
+ (insertelt V2I32Regs:$src, Int32Regs:$val, imm:$c))],
+ IMOV32rr>;
// Insert v2f32
def V2f32Insert : NVPTXVecInst<(outs V2F32Regs:$dst),
@@ -155,8 +153,8 @@ def V2f32Insert : NVPTXVecInst<(outs V2F32Regs:$dst),
"mov.v2.f32 \t${dst:vecfull}, ${src:vecfull};"
"\n\tmov.f32 \t$dst${c:vecelem}, $val;",
[(set V2F32Regs:$dst,
- (vector_insert V2F32Regs:$src, Float32Regs:$val, imm:$c))],
- FMOV32rr>;
+ (insertelt V2F32Regs:$src, Float32Regs:$val, imm:$c))],
+ FMOV32rr>;
// Insert v2i64
def V2i64Insert : NVPTXVecInst<(outs V2I64Regs:$dst),
@@ -164,8 +162,8 @@ def V2i64Insert : NVPTXVecInst<(outs V2I64Regs:$dst),
"mov.v2.u64 \t${dst:vecfull}, ${src:vecfull};"
"\n\tmov.u64 \t$dst${c:vecelem}, $val;",
[(set V2I64Regs:$dst,
- (vector_insert V2I64Regs:$src, Int64Regs:$val, imm:$c))],
- IMOV64rr>;
+ (insertelt V2I64Regs:$src, Int64Regs:$val, imm:$c))],
+ IMOV64rr>;
// Insert v2f64
def V2f64Insert : NVPTXVecInst<(outs V2F64Regs:$dst),
@@ -173,8 +171,8 @@ def V2f64Insert : NVPTXVecInst<(outs V2F64Regs:$dst),
"mov.v2.f64 \t${dst:vecfull}, ${src:vecfull};"
"\n\tmov.f64 \t$dst${c:vecelem}, $val;",
[(set V2F64Regs:$dst,
- (vector_insert V2F64Regs:$src, Float64Regs:$val, imm:$c))],
- FMOV64rr>;
+ (insertelt V2F64Regs:$src, Float64Regs:$val, imm:$c))],
+ FMOV64rr>;
// Insert v4i32
def V4i32Insert : NVPTXVecInst<(outs V4I32Regs:$dst),
@@ -182,8 +180,8 @@ def V4i32Insert : NVPTXVecInst<(outs V4I32Regs:$dst),
"mov.v4.u32 \t${dst:vecfull}, ${src:vecfull};"
"\n\tmov.u32 \t$dst${c:vecelem}, $val;",
[(set V4I32Regs:$dst,
- (vector_insert V4I32Regs:$src, Int32Regs:$val, imm:$c))],
- IMOV32rr>;
+ (insertelt V4I32Regs:$src, Int32Regs:$val, imm:$c))],
+ IMOV32rr>;
// Insert v4f32
def V4f32Insert : NVPTXVecInst<(outs V4F32Regs:$dst),
@@ -191,8 +189,8 @@ def V4f32Insert : NVPTXVecInst<(outs V4F32Regs:$dst),
"mov.v4.f32 \t${dst:vecfull}, ${src:vecfull};"
"\n\tmov.f32 \t$dst${c:vecelem}, $val;",
[(set V4F32Regs:$dst,
- (vector_insert V4F32Regs:$src, Float32Regs:$val, imm:$c))],
- FMOV32rr>;
+ (insertelt V4F32Regs:$src, Float32Regs:$val, imm:$c))],
+ FMOV32rr>;
}
class BinOpAsmString<string c> {
diff --git a/lib/Target/NVPTX/NVVMReflect.cpp b/lib/Target/NVPTX/NVVMReflect.cpp
index 5e375b7852e4..20ab5db584d2 100644
--- a/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/lib/Target/NVPTX/NVVMReflect.cpp
@@ -109,10 +109,10 @@ void NVVMReflect::setVarMap() {
for (unsigned i = 0, e = ReflectList.size(); i != e; ++i) {
DEBUG(dbgs() << "Option : " << ReflectList[i] << "\n");
SmallVector<StringRef, 4> NameValList;
- StringRef(ReflectList[i]).split(NameValList, ",");
+ StringRef(ReflectList[i]).split(NameValList, ',');
for (unsigned j = 0, ej = NameValList.size(); j != ej; ++j) {
SmallVector<StringRef, 2> NameValPair;
- NameValList[j].split(NameValPair, "=");
+ NameValList[j].split(NameValPair, '=');
assert(NameValPair.size() == 2 && "name=val expected");
std::stringstream ValStream(NameValPair[1]);
int Val;
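
The hunk above only changes the split separators from one-character strings to plain chars; for readers unfamiliar with the option syntax that setVarMap consumes, here is a self-contained C++ sketch of the same "name1=val1,name2=val2" parsing written with standard-library strings. The function and variable names are illustrative and not part of NVVMReflect.

#include <map>
#include <sstream>
#include <string>

// Parse a comma-separated list of name=value pairs into a map, mirroring
// what NVVMReflect::setVarMap does with StringRef::split.
std::map<std::string, int> parseReflectOptions(const std::string &Options) {
  std::map<std::string, int> VarMap;
  std::istringstream List(Options);
  std::string Pair;
  while (std::getline(List, Pair, ',')) {        // split entries on ','
    std::string::size_type Eq = Pair.find('=');  // split each entry on '='
    if (Eq == std::string::npos)
      continue; // the real pass asserts "name=val expected" here
    VarMap[Pair.substr(0, Eq)] = std::stoi(Pair.substr(Eq + 1));
  }
  return VarMap;
}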
diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index a699a55d3cbf..220c70a48542 100644
--- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -243,7 +243,6 @@ namespace {
struct PPCOperand;
class PPCAsmParser : public MCTargetAsmParser {
- MCSubtargetInfo &STI;
const MCInstrInfo &MII;
bool IsPPC64;
bool IsDarwin;
@@ -291,9 +290,9 @@ class PPCAsmParser : public MCTargetAsmParser {
public:
- PPCAsmParser(MCSubtargetInfo &STI, MCAsmParser &, const MCInstrInfo &MII,
- const MCTargetOptions &Options)
- : MCTargetAsmParser(), STI(STI), MII(MII) {
+ PPCAsmParser(const MCSubtargetInfo &STI, MCAsmParser &,
+ const MCInstrInfo &MII, const MCTargetOptions &Options)
+ : MCTargetAsmParser(Options, STI), MII(MII) {
// Check for 64-bit vs. 32-bit pointer mode.
Triple TheTriple(STI.getTargetTriple());
IsPPC64 = (TheTriple.getArch() == Triple::ppc64 ||
@@ -1185,7 +1184,7 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst,
break;
}
case PPC::MFTB: {
- if (STI.getFeatureBits()[PPC::FeatureMFTB]) {
+ if (getSTI().getFeatureBits()[PPC::FeatureMFTB]) {
assert(Inst.getNumOperands() == 2 && "Expecting two operands");
Inst.setOpcode(PPC::MFSPR);
}
@@ -1205,7 +1204,7 @@ bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
// Post-process instructions (typically extended mnemonics)
ProcessInstruction(Inst, Operands);
Inst.setLoc(IDLoc);
- Out.EmitInstruction(Inst, STI);
+ Out.EmitInstruction(Inst, getSTI());
return false;
case Match_MissingFeature:
return Error(IDLoc, "instruction use requires an option to be enabled");
@@ -1690,7 +1689,7 @@ bool PPCAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// where th can be omitted when it is 0. dcbtst is the same. We take the
// server form to be the default, so swap the operands if we're parsing for
// an embedded core (they'll be swapped again upon printing).
- if (STI.getFeatureBits()[PPC::FeatureBookE] &&
+ if (getSTI().getFeatureBits()[PPC::FeatureBookE] &&
Operands.size() == 4 &&
(Name == "dcbt" || Name == "dcbtst")) {
std::swap(Operands[1], Operands[3]);
@@ -1730,10 +1729,19 @@ bool PPCAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
if (getLexer().isNot(AsmToken::EndOfStatement)) {
for (;;) {
const MCExpr *Value;
+ SMLoc ExprLoc = getLexer().getLoc();
if (getParser().parseExpression(Value))
return false;
- getParser().getStreamer().EmitValue(Value, Size);
+ if (const auto *MCE = dyn_cast<MCConstantExpr>(Value)) {
+ assert(Size <= 8 && "Invalid size");
+ uint64_t IntValue = MCE->getValue();
+ if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue))
+ return Error(ExprLoc, "literal value out of range for directive");
+ getStreamer().EmitIntValue(IntValue, Size);
+ } else {
+ getStreamer().EmitValue(Value, Size, ExprLoc);
+ }
if (getLexer().is(AsmToken::EndOfStatement))
break;
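
The new check above rejects literals that fit in neither the unsigned nor the signed N-byte range before emitting them for a data directive. Below is a minimal, self-contained C++ sketch of that acceptance test; it reimplements the bit-width checks inline instead of calling LLVM's isUIntN/isIntN, and the name fitsDirective is illustrative. Working with the raw bit patterns keeps the two cases (valid unsigned value, sign-extended negative value) easy to compare.

#include <cstdint>

// Accept a literal for an N-byte data directive if it fits as an unsigned
// N*8-bit value or as a signed N*8-bit value, mirroring
// isUIntN(8 * Size, V) || isIntN(8 * Size, V) in the patch above.
bool fitsDirective(uint64_t Value, unsigned SizeInBytes) {
  unsigned Bits = 8 * SizeInBytes; // the patch asserts Size <= 8
  if (Bits >= 64)
    return true;                   // every 64-bit pattern fits
  uint64_t UMax = (UINT64_C(1) << Bits) - 1;          // largest unsigned N-bit value
  uint64_t SMinPattern = ~UINT64_C(0) << (Bits - 1);  // sign-extended negative range start
  // Either a valid unsigned N-bit value, or the 64-bit sign extension of a
  // negative N-bit value.
  return Value <= UMax || Value >= SMinPattern;
}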
diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt
index c0c83cc258b8..c31ababafbe7 100644
--- a/lib/Target/PowerPC/CMakeLists.txt
+++ b/lib/Target/PowerPC/CMakeLists.txt
@@ -13,6 +13,7 @@ tablegen(LLVM PPCGenSubtargetInfo.inc -gen-subtarget)
add_public_tablegen_target(PowerPCCommonTableGen)
add_llvm_target(PowerPCCodeGen
+ PPCBoolRetToInt.cpp
PPCAsmPrinter.cpp
PPCBranchSelector.cpp
PPCCTRLoops.cpp
@@ -27,6 +28,7 @@ add_llvm_target(PowerPCCodeGen
PPCLoopPreIncPrep.cpp
PPCMCInstLower.cpp
PPCMachineFunctionInfo.cpp
+ PPCMIPeephole.cpp
PPCRegisterInfo.cpp
PPCSubtarget.cpp
PPCTargetMachine.cpp
diff --git a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
index 93a503c3758d..1fc84fb76551 100644
--- a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
+++ b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -401,8 +401,6 @@ DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
decodeInstruction(DecoderTableQPX32, MI, Inst, Address, this, STI);
if (result != MCDisassembler::Fail)
return result;
-
- MI.clear();
}
return decodeInstruction(DecoderTable32, MI, Inst, Address, this, STI);
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
index 8e1878344302..53eb727d0b07 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
@@ -18,8 +18,6 @@
namespace llvm {
-class MCOperand;
-
class PPCInstPrinter : public MCInstPrinter {
bool IsDarwin;
public:
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index 992be5b966c1..dd994956870f 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -113,6 +113,10 @@ unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target,
break;
}
break;
+ case PPC::fixup_ppc_half16ds:
+ Target.print(errs());
+ errs() << '\n';
+ report_fatal_error("Invalid PC-relative half16ds relocation");
case FK_Data_4:
case FK_PCRel_4:
Type = ELF::R_PPC_REL32;
@@ -305,13 +309,13 @@ unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target,
break;
case MCSymbolRefExpr::VK_GOT:
Type = ELF::R_PPC64_GOT16_DS;
- break;
+ break;
case MCSymbolRefExpr::VK_PPC_GOT_LO:
Type = ELF::R_PPC64_GOT16_LO_DS;
break;
case MCSymbolRefExpr::VK_PPC_TOC:
Type = ELF::R_PPC64_TOC16_DS;
- break;
+ break;
case MCSymbolRefExpr::VK_PPC_TOC_LO:
Type = ELF::R_PPC64_TOC16_LO_DS;
break;
@@ -372,16 +376,16 @@ unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target,
break;
case MCSymbolRefExpr::VK_None:
Type = ELF::R_PPC64_ADDR64;
- break;
+ break;
case MCSymbolRefExpr::VK_PPC_DTPMOD:
Type = ELF::R_PPC64_DTPMOD64;
- break;
+ break;
case MCSymbolRefExpr::VK_PPC_TPREL:
Type = ELF::R_PPC64_TPREL64;
- break;
+ break;
case MCSymbolRefExpr::VK_PPC_DTPREL:
Type = ELF::R_PPC64_DTPREL64;
- break;
+ break;
}
break;
case FK_Data_4:
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
index 86ad3859b72c..e252ac944d40 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
@@ -20,18 +20,19 @@
namespace llvm {
class Triple;
- class PPCMCAsmInfoDarwin : public MCAsmInfoDarwin {
- virtual void anchor();
-
- public:
- explicit PPCMCAsmInfoDarwin(bool is64Bit, const Triple&);
- };
-
- class PPCELFMCAsmInfo : public MCAsmInfoELF {
- void anchor() override;
- public:
- explicit PPCELFMCAsmInfo(bool is64Bit, const Triple&);
- };
+class PPCMCAsmInfoDarwin : public MCAsmInfoDarwin {
+ virtual void anchor();
+
+public:
+ explicit PPCMCAsmInfoDarwin(bool is64Bit, const Triple &);
+};
+
+class PPCELFMCAsmInfo : public MCAsmInfoELF {
+ void anchor() override;
+
+public:
+ explicit PPCELFMCAsmInfo(bool is64Bit, const Triple &);
+};
} // namespace llvm
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
index a641780516b3..d42a111cc43e 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
@@ -82,8 +82,8 @@ public:
const MCAsmLayout *Layout,
const MCFixup *Fixup) const override;
void visitUsedExpr(MCStreamer &Streamer) const override;
- MCSection *findAssociatedSection() const override {
- return getSubExpr()->findAssociatedSection();
+ MCFragment *findAssociatedFragment() const override {
+ return getSubExpr()->findAssociatedFragment();
}
// There are no TLS PPCMCExprs at the moment.
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
index 9d7289658f0f..b54a0e1b86b1 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
@@ -241,12 +241,12 @@ bool PPCMachObjectWriter::recordScatteredRelocation(
if (FixupOffset > 0xffffff) {
char Buffer[32];
format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer));
- Asm.getContext().reportFatalError(Fixup.getLoc(),
+ Asm.getContext().reportError(Fixup.getLoc(),
Twine("Section too large, can't encode "
"r_address (") +
Buffer + ") into 24 bits of scattered "
"relocation entry.");
- llvm_unreachable("fatal error returned?!");
+ return false;
}
// Is this supposed to follow MCTarget/PPCAsmBackend.cpp:adjustFixupValue()?
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
index 6075631a541f..acea600fbb0d 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
@@ -56,6 +56,14 @@ namespace PPC {
PRED_BIT_UNSET = 1025
};
+ // Bit for branch taken (plus) or not-taken (minus) hint
+ enum BranchHintBit {
+ BR_NO_HINT = 0x0,
+ BR_NONTAKEN_HINT = 0x2,
+ BR_TAKEN_HINT = 0x3,
+ BR_HINT_MASK = 0X3
+ };
+
/// Invert the specified predicate. != -> ==, < -> >=.
Predicate InvertPredicate(Predicate Opcode);
diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h
index ae8d8b4f5dfe..a259ed3fd327 100644
--- a/lib/Target/PowerPC/PPC.h
+++ b/lib/Target/PowerPC/PPC.h
@@ -41,13 +41,16 @@ namespace llvm {
FunctionPass *createPPCVSXCopyPass();
FunctionPass *createPPCVSXFMAMutatePass();
FunctionPass *createPPCVSXSwapRemovalPass();
+ FunctionPass *createPPCMIPeepholePass();
FunctionPass *createPPCBranchSelectionPass();
FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
FunctionPass *createPPCTLSDynamicCallPass();
+ FunctionPass *createPPCBoolRetToIntPass();
void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
AsmPrinter &AP, bool isDarwin);
void initializePPCVSXFMAMutatePass(PassRegistry&);
+ void initializePPCBoolRetToIntPass(PassRegistry&);
extern char &PPCVSXFMAMutateID;
namespace PPCII {
diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td
index 641b2377de40..b03be12cfd97 100644
--- a/lib/Target/PowerPC/PPC.td
+++ b/lib/Target/PowerPC/PPC.td
@@ -50,6 +50,8 @@ def DirectivePwr8: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR8", "">;
def Feature64Bit : SubtargetFeature<"64bit","Has64BitSupport", "true",
"Enable 64-bit instructions">;
+def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
+ "Use software emulation for floating point">;
def Feature64BitRegs : SubtargetFeature<"64bitregs","Use64BitRegs", "true",
"Enable 64-bit registers usage for ppc32 [beta]">;
def FeatureCRBits : SubtargetFeature<"crbits", "UseCRBits", "true",
@@ -137,6 +139,12 @@ def FeatureHTM : SubtargetFeature<"htm", "HasHTM", "true",
"Enable Hardware Transactional Memory instructions">;
def FeatureMFTB : SubtargetFeature<"", "FeatureMFTB", "true",
"Implement mftb using the mfspr instruction">;
+def FeatureFusion : SubtargetFeature<"fusion", "HasFusion", "true",
+ "Target supports add/load integer fusion.">;
+def FeatureFloat128 :
+ SubtargetFeature<"float128", "HasFloat128", "true",
+ "Enable the __float128 data type for IEEE-754R Binary128.",
+ [FeatureVSX]>;
def DeprecatedDST : SubtargetFeature<"", "DeprecatedDST", "true",
"Treat vector data stream cache control instructions as deprecated">;
@@ -168,7 +176,8 @@ def ProcessorFeatures {
FeatureMFTB, DeprecatedDST];
list<SubtargetFeature> Power8SpecificFeatures =
[DirectivePwr8, FeatureP8Altivec, FeatureP8Vector, FeatureP8Crypto,
- FeatureHTM, FeatureDirectMove, FeatureICBT, FeaturePartwordAtomic];
+ FeatureHTM, FeatureDirectMove, FeatureICBT, FeaturePartwordAtomic,
+ FeatureFusion];
list<SubtargetFeature> Power8FeatureList =
!listconcat(Power7FeatureList, Power8SpecificFeatures);
}
@@ -309,7 +318,7 @@ def : ProcessorModel<"g5", G5Model,
Feature64Bit /*, Feature64BitRegs */,
FeatureMFTB, DeprecatedDST]>;
def : ProcessorModel<"e500mc", PPCE500mcModel,
- [DirectiveE500mc, FeatureMFOCRF,
+ [DirectiveE500mc,
FeatureSTFIWX, FeatureICBT, FeatureBookE,
FeatureISEL, FeatureMFTB]>;
def : ProcessorModel<"e5500", PPCE5500Model,
@@ -403,6 +412,7 @@ def PPCAsmParserVariant : AsmParserVariant {
// InstAlias definitions use immediate literals. Set RegisterPrefix
// so that those are not misinterpreted as registers.
string RegisterPrefix = "%";
+ string BreakCharacters = ".";
}
def PPC : Target {
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 8e118ec27e67..9a63c14b5053 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -65,19 +65,20 @@ using namespace llvm;
#define DEBUG_TYPE "asmprinter"
namespace {
- class PPCAsmPrinter : public AsmPrinter {
- protected:
- MapVector<MCSymbol*, MCSymbol*> TOC;
- const PPCSubtarget *Subtarget;
- StackMaps SM;
- public:
- explicit PPCAsmPrinter(TargetMachine &TM,
- std::unique_ptr<MCStreamer> Streamer)
- : AsmPrinter(TM, std::move(Streamer)), SM(*this) {}
-
- const char *getPassName() const override {
- return "PowerPC Assembly Printer";
- }
+class PPCAsmPrinter : public AsmPrinter {
+protected:
+ MapVector<MCSymbol *, MCSymbol *> TOC;
+ const PPCSubtarget *Subtarget;
+ StackMaps SM;
+
+public:
+ explicit PPCAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)), SM(*this) {}
+
+ const char *getPassName() const override {
+ return "PowerPC Assembly Printer";
+ }
MCSymbol *lookUpOrCreateTOCEntry(MCSymbol *Sym);
@@ -94,10 +95,8 @@ namespace {
void EmitEndOfAsmFile(Module &M) override;
- void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
- const MachineInstr &MI);
- void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
- const MachineInstr &MI);
+ void LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI);
+ void LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI);
void EmitTlsCall(const MachineInstr *MI, MCSymbolRefExpr::VariantKind VK);
bool runOnMachineFunction(MachineFunction &MF) override {
Subtarget = &MF.getSubtarget<PPCSubtarget>();
@@ -157,15 +156,15 @@ static const char *stripRegisterPrefix(const char *RegName) {
return RegName + 1;
case 'c': if (RegName[1] == 'r') return RegName + 2;
}
-
+
return RegName;
}
void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
raw_ostream &O) {
- const DataLayout *DL = TM.getDataLayout();
+ const DataLayout &DL = getDataLayout();
const MachineOperand &MO = MI->getOperand(OpNo);
-
+
switch (MO.getType()) {
case MachineOperand::MO_Register: {
const char *RegName = PPCInstPrinter::getRegisterName(MO.getReg());
@@ -184,8 +183,8 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
MO.getMBB()->getSymbol()->print(O, MAI);
return;
case MachineOperand::MO_ConstantPoolIndex:
- O << DL->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
- << '_' << MO.getIndex();
+ O << DL.getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_'
+ << MO.getIndex();
return;
case MachineOperand::MO_BlockAddress:
GetBlockAddressSymbol(MO.getBlockAddress())->print(O, MAI);
@@ -200,19 +199,19 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
!GV->isStrongDefinitionForLinker()) {
if (!GV->hasHiddenVisibility()) {
SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
- MachineModuleInfoImpl::StubValueTy &StubSym =
- MMI->getObjFileInfo<MachineModuleInfoMachO>()
- .getGVStubEntry(SymToPrint);
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(
+ SymToPrint);
if (!StubSym.getPointer())
StubSym = MachineModuleInfoImpl::
StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
} else if (GV->isDeclaration() || GV->hasCommonLinkage() ||
GV->hasAvailableExternallyLinkage()) {
SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
-
- MachineModuleInfoImpl::StubValueTy &StubSym =
- MMI->getObjFileInfo<MachineModuleInfoMachO>().
- getHiddenGVStubEntry(SymToPrint);
+
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry(
+ SymToPrint);
if (!StubSym.getPointer())
StubSym = MachineModuleInfoImpl::
StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
@@ -295,16 +294,16 @@ bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
}
case 'U': // Print 'u' for update form.
case 'X': // Print 'x' for indexed form.
- {
- // FIXME: Currently for PowerPC memory operands are always loaded
- // into a register, so we never get an update or indexed form.
- // This is bad even for offset forms, since even if we know we
- // have a value in -16(r1), we will generate a load into r<n>
- // and then load from 0(r<n>). Until that issue is fixed,
- // tolerate 'U' and 'X' but don't output anything.
- assert(MI->getOperand(OpNo).isReg());
- return false;
- }
+ {
+ // FIXME: Currently for PowerPC memory operands are always loaded
+ // into a register, so we never get an update or indexed form.
+ // This is bad even for offset forms, since even if we know we
+ // have a value in -16(r1), we will generate a load into r<n>
+ // and then load from 0(r<n>). Until that issue is fixed,
+ // tolerate 'U' and 'X' but don't output anything.
+ assert(MI->getOperand(OpNo).isReg());
+ return false;
+ }
}
}
@@ -315,7 +314,6 @@ bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
return false;
}
-
/// lookUpOrCreateTOCEntry -- Given a symbol, look up whether a TOC entry
/// exists for it. If not, create one. Then return a symbol that references
/// the TOC entry.
@@ -330,8 +328,7 @@ void PPCAsmPrinter::EmitEndOfAsmFile(Module &M) {
SM.serializeToStackMapSection();
}
-void PPCAsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
- const MachineInstr &MI) {
+void PPCAsmPrinter::LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI) {
unsigned NumNOPBytes = MI.getOperand(1).getImm();
SM.recordStackMap(MI);
@@ -353,13 +350,12 @@ void PPCAsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
// Emit nops.
for (unsigned i = 0; i < NumNOPBytes; i += 4)
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::NOP));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP));
}
// Lower a patchpoint of the form:
// [<def>], <id>, <numBytes>, <target>, <numArgs>
-void PPCAsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
- const MachineInstr &MI) {
+void PPCAsmPrinter::LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI) {
SM.recordPatchPoint(MI);
PatchPointOpers Opers(&MI);
@@ -375,60 +371,59 @@ void PPCAsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
EncodedBytes = 0;
// Materialize the jump address:
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LI8)
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LI8)
.addReg(ScratchReg)
.addImm((CallTarget >> 32) & 0xFFFF));
++EncodedBytes;
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::RLDIC)
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::RLDIC)
.addReg(ScratchReg)
.addReg(ScratchReg)
.addImm(32).addImm(16));
++EncodedBytes;
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORIS8)
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ORIS8)
.addReg(ScratchReg)
.addReg(ScratchReg)
.addImm((CallTarget >> 16) & 0xFFFF));
++EncodedBytes;
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORI8)
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ORI8)
.addReg(ScratchReg)
.addReg(ScratchReg)
.addImm(CallTarget & 0xFFFF));
// Save the current TOC pointer before the remote call.
int TOCSaveOffset = Subtarget->isELFv2ABI() ? 24 : 40;
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::STD)
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::STD)
.addReg(PPC::X2)
.addImm(TOCSaveOffset)
.addReg(PPC::X1));
++EncodedBytes;
-
// If we're on ELFv1, then we need to load the actual function pointer
// from the function descriptor.
if (!Subtarget->isELFv2ABI()) {
- // Load the new TOC pointer and the function address, but not r11
- // (needing this is rare, and loading it here would prevent passing it
- // via a 'nest' parameter.
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD)
+ // Load the new TOC pointer and the function address, but not r11
+ // (needing this is rare, and loading it here would prevent passing it
+ // via a 'nest' parameter).
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD)
.addReg(PPC::X2)
.addImm(8)
.addReg(ScratchReg));
++EncodedBytes;
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD)
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD)
.addReg(ScratchReg)
.addImm(0)
.addReg(ScratchReg));
++EncodedBytes;
}
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTCTR8)
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTCTR8)
.addReg(ScratchReg));
++EncodedBytes;
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCTRL8));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BCTRL8));
++EncodedBytes;
// Restore the TOC pointer after the call.
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD)
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD)
.addReg(PPC::X2)
.addImm(TOCSaveOffset)
.addReg(PPC::X1));
@@ -439,7 +434,7 @@ void PPCAsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
MCSymbol *MOSymbol = getSymbol(GValue);
const MCExpr *SymVar = MCSymbolRefExpr::create(MOSymbol, OutContext);
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BL8_NOP)
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BL8_NOP)
.addExpr(SymVar));
EncodedBytes += 2;
}
@@ -454,7 +449,7 @@ void PPCAsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
assert((NumBytes - EncodedBytes) % 4 == 0 &&
"Invalid number of NOP bytes requested!");
for (unsigned i = EncodedBytes; i < NumBytes; i += 4)
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::NOP));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP));
}
/// EmitTlsCall -- Given a GETtls[ld]ADDR[32] instruction, print a
@@ -499,16 +494,16 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
bool isDarwin = TM.getTargetTriple().isOSDarwin();
const Module *M = MF->getFunction()->getParent();
PICLevel::Level PL = M->getPICLevel();
-
+
// Lower multi-instruction pseudo operations.
switch (MI->getOpcode()) {
default: break;
case TargetOpcode::DBG_VALUE:
llvm_unreachable("Should be handled target independently");
case TargetOpcode::STACKMAP:
- return LowerSTACKMAP(*OutStreamer, SM, *MI);
+ return LowerSTACKMAP(SM, *MI);
case TargetOpcode::PATCHPOINT:
- return LowerPATCHPOINT(*OutStreamer, SM, *MI);
+ return LowerPATCHPOINT(SM, *MI);
case PPC::MoveGOTtoLR: {
// Transform %LR = MoveGOTtoLR
@@ -533,17 +528,18 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
case PPC::MovePCtoLR:
case PPC::MovePCtoLR8: {
// Transform %LR = MovePCtoLR
- // Into this, where the label is the PIC base:
+ // Into this, where the label is the PIC base:
// bl L1$pb
// L1$pb:
MCSymbol *PICBase = MF->getPICBaseSymbol();
-
+
// Emit the 'bl'.
- EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BL)
- // FIXME: We would like an efficient form for this, so we don't have to do
- // a lot of extra uniquing.
- .addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
-
+ EmitToStreamer(*OutStreamer,
+ MCInstBuilder(PPC::BL)
+ // FIXME: We would like an efficient form for this, so we
+ // don't have to do a lot of extra uniquing.
+ .addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
+
// Emit the label.
OutStreamer->EmitLabel(PICBase);
return;
@@ -654,7 +650,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, TmpInst);
return;
}
-
+
case PPC::ADDIStocHA: {
// Transform %Xd = ADDIStocHA %X2, <ga:@sym>
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
@@ -669,28 +665,22 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MO.isBlockAddress()) &&
"Invalid operand for ADDIStocHA!");
MCSymbol *MOSymbol = nullptr;
- bool IsExternal = false;
- bool IsNonLocalFunction = false;
- bool IsCommon = false;
- bool IsAvailExt = false;
+ bool GlobalToc = false;
if (MO.isGlobal()) {
const GlobalValue *GV = MO.getGlobal();
MOSymbol = getSymbol(GV);
- IsExternal = GV->isDeclaration();
- IsCommon = GV->hasCommonLinkage();
- IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() &&
- !GV->isStrongDefinitionForLinker();
- IsAvailExt = GV->hasAvailableExternallyLinkage();
- } else if (MO.isCPI())
+ unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
+ GlobalToc = (GVFlags & PPCII::MO_NLP_FLAG);
+ } else if (MO.isCPI()) {
MOSymbol = GetCPISymbol(MO.getIndex());
- else if (MO.isJTI())
+ } else if (MO.isJTI()) {
MOSymbol = GetJTISymbol(MO.getIndex());
- else if (MO.isBlockAddress())
+ } else if (MO.isBlockAddress()) {
MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress());
+ }
- if (IsExternal || IsNonLocalFunction || IsCommon || IsAvailExt ||
- MO.isJTI() || MO.isBlockAddress() ||
+ if (GlobalToc || MO.isJTI() || MO.isBlockAddress() ||
TM.getCodeModel() == CodeModel::Large)
MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
@@ -727,13 +717,14 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
}
else if (MO.isGlobal()) {
- const GlobalValue *GValue = MO.getGlobal();
- MOSymbol = getSymbol(GValue);
- if (GValue->getType()->getElementType()->isFunctionTy() ||
- GValue->isDeclaration() || GValue->hasCommonLinkage() ||
- GValue->hasAvailableExternallyLinkage() ||
- TM.getCodeModel() == CodeModel::Large)
- MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
+ const GlobalValue *GV = MO.getGlobal();
+ MOSymbol = getSymbol(GV);
+ DEBUG(
+ unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
+ assert((GVFlags & PPCII::MO_NLP_FLAG) &&
+ "LDtocL used on symbol that could be accessed directly is "
+ "invalid. Must match ADDIStocHA."));
+ MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
}
const MCExpr *Exp =
@@ -754,21 +745,18 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MachineOperand &MO = MI->getOperand(2);
assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL");
MCSymbol *MOSymbol = nullptr;
- bool IsExternal = false;
- bool IsNonLocalFunction = false;
if (MO.isGlobal()) {
const GlobalValue *GV = MO.getGlobal();
+ DEBUG(
+ unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
+ assert (
+ !(GVFlags & PPCII::MO_NLP_FLAG) &&
+ "Interposable definitions must use indirect access."));
MOSymbol = getSymbol(GV);
- IsExternal = GV->isDeclaration();
- IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() &&
- !GV->isStrongDefinitionForLinker();
- } else if (MO.isCPI())
+ } else if (MO.isCPI()) {
MOSymbol = GetCPISymbol(MO.getIndex());
-
- if (IsNonLocalFunction || IsExternal ||
- TM.getCodeModel() == CodeModel::Large)
- MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
+ }
const MCExpr *Exp =
MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO,
@@ -840,13 +828,12 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
case PPC::PPC32GOT: {
- MCSymbol *GOTSymbol = OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
- const MCExpr *SymGotTlsL =
- MCSymbolRefExpr::create(GOTSymbol, MCSymbolRefExpr::VK_PPC_LO,
- OutContext);
- const MCExpr *SymGotTlsHA =
- MCSymbolRefExpr::create(GOTSymbol, MCSymbolRefExpr::VK_PPC_HA,
- OutContext);
+ MCSymbol *GOTSymbol =
+ OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
+ const MCExpr *SymGotTlsL = MCSymbolRefExpr::create(
+ GOTSymbol, MCSymbolRefExpr::VK_PPC_LO, OutContext);
+ const MCExpr *SymGotTlsHA = MCSymbolRefExpr::create(
+ GOTSymbol, MCSymbolRefExpr::VK_PPC_HA, OutContext);
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LI)
.addReg(MI->getOperand(0).getReg())
.addExpr(SymGotTlsL));
@@ -1079,14 +1066,14 @@ void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) {
void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
// linux/ppc32 - Normal entry label.
- if (!Subtarget->isPPC64() &&
- (TM.getRelocationModel() != Reloc::PIC_ ||
+ if (!Subtarget->isPPC64() &&
+ (TM.getRelocationModel() != Reloc::PIC_ ||
MF->getFunction()->getParent()->getPICLevel() == PICLevel::Small))
return AsmPrinter::EmitFunctionEntryLabel();
if (!Subtarget->isPPC64()) {
const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>();
- if (PPCFI->usesPICBase()) {
+ if (PPCFI->usesPICBase()) {
MCSymbol *RelocSymbol = PPCFI->getPICOffsetSymbol();
MCSymbol *PICBase = MF->getPICBaseSymbol();
OutStreamer->EmitLabel(RelocSymbol);
@@ -1130,11 +1117,10 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
OutStreamer->SwitchSection(Current.first, Current.second);
}
-
bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
- const DataLayout *TD = TM.getDataLayout();
+ const DataLayout &DL = getDataLayout();
- bool isPPC64 = TD->getPointerSizeInBits() == 64;
+ bool isPPC64 = DL.getPointerSizeInBits() == 64;
PPCTargetStreamer &TS =
static_cast<PPCTargetStreamer &>(*OutStreamer->getTargetStreamer());
@@ -1293,8 +1279,8 @@ void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) {
// Prime text sections so they are adjacent. This reduces the likelihood a
// large data or debug section causes a branch to exceed 16M limit.
- const TargetLoweringObjectFileMachO &TLOFMacho =
- static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
+ const TargetLoweringObjectFileMachO &TLOFMacho =
+ static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
OutStreamer->SwitchSection(TLOFMacho.getTextCoalSection());
if (TM.getRelocationModel() == Reloc::PIC_) {
OutStreamer->SwitchSection(
@@ -1325,7 +1311,7 @@ static MCSymbol *GetAnonSym(MCSymbol *Sym, MCContext &Ctx) {
void PPCDarwinAsmPrinter::
EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
- bool isPPC64 = TM.getDataLayout()->getPointerSizeInBits() == 64;
+ bool isPPC64 = getDataLayout().getPointerSizeInBits() == 64;
// Construct a local MCSubtargetInfo and shadow EmitToStreamer here.
// This is because the MachineFunction won't exist (but have not yet been
@@ -1338,8 +1324,8 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
S.EmitInstruction(Inst, *STI);
};
- const TargetLoweringObjectFileMachO &TLOFMacho =
- static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
+ const TargetLoweringObjectFileMachO &TLOFMacho =
+ static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
// .lazy_symbol_pointer
MCSection *LSPSection = TLOFMacho.getLazySymbolPointerSection();
@@ -1353,12 +1339,12 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
OutStreamer->SwitchSection(StubSection);
EmitAlignment(4);
-
+
MCSymbol *Stub = Stubs[i].first;
MCSymbol *RawSym = Stubs[i].second.getPointer();
MCSymbol *LazyPtr = GetLazyPtr(Stub, OutContext);
MCSymbol *AnonSymbol = GetAnonSym(Stub, OutContext);
-
+
OutStreamer->EmitLabel(Stub);
OutStreamer->EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol);
@@ -1463,20 +1449,19 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
OutStreamer->EmitSymbolValue(DyldStubBindingHelper, 4);
}
}
-
+
OutStreamer->AddBlankLine();
}
-
bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
- bool isPPC64 = TM.getDataLayout()->getPointerSizeInBits() == 64;
+ bool isPPC64 = getDataLayout().getPointerSizeInBits() == 64;
// Darwin/PPC always uses mach-o.
- const TargetLoweringObjectFileMachO &TLOFMacho =
- static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
+ const TargetLoweringObjectFileMachO &TLOFMacho =
+ static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
MachineModuleInfoMachO &MMIMacho =
- MMI->getObjFileInfo<MachineModuleInfoMachO>();
-
+ MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
MachineModuleInfoMachO::SymbolListTy Stubs = MMIMacho.GetFnStubList();
if (!Stubs.empty())
EmitFunctionStubs(Stubs);
@@ -1484,27 +1469,27 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
if (MAI->doesSupportExceptionHandling() && MMI) {
// Add the (possibly multiple) personalities to the set of global values.
// Only referenced functions get into the Personalities list.
- const std::vector<const Function*> &Personalities = MMI->getPersonalities();
- for (std::vector<const Function*>::const_iterator I = Personalities.begin(),
- E = Personalities.end(); I != E; ++I) {
- if (*I) {
- MCSymbol *NLPSym = getSymbolWithGlobalValueBase(*I, "$non_lazy_ptr");
+ for (const Function *Personality : MMI->getPersonalities()) {
+ if (Personality) {
+ MCSymbol *NLPSym =
+ getSymbolWithGlobalValueBase(Personality, "$non_lazy_ptr");
MachineModuleInfoImpl::StubValueTy &StubSym =
- MMIMacho.getGVStubEntry(NLPSym);
- StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(*I), true);
+ MMIMacho.getGVStubEntry(NLPSym);
+ StubSym =
+ MachineModuleInfoImpl::StubValueTy(getSymbol(Personality), true);
}
}
}
// Output stubs for dynamically-linked functions.
Stubs = MMIMacho.GetGVStubList();
-
+
// Output macho stubs for external and common global variables.
if (!Stubs.empty()) {
// Switch with ".non_lazy_symbol_pointer" directive.
OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
EmitAlignment(isPPC64 ? 3 : 2);
-
+
for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
// L_foo$stub:
OutStreamer->EmitLabel(Stubs[i].first);
@@ -1535,7 +1520,7 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
if (!Stubs.empty()) {
OutStreamer->SwitchSection(getObjFileLowering().getDataSection());
EmitAlignment(isPPC64 ? 3 : 2);
-
+
for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
// L_foo$stub:
OutStreamer->EmitLabel(Stubs[i].first);
@@ -1573,7 +1558,7 @@ createPPCAsmPrinterPass(TargetMachine &tm,
}
// Force static initialization.
-extern "C" void LLVMInitializePowerPCAsmPrinter() {
+extern "C" void LLVMInitializePowerPCAsmPrinter() {
TargetRegistry::RegisterAsmPrinter(ThePPC32Target, createPPCAsmPrinterPass);
TargetRegistry::RegisterAsmPrinter(ThePPC64Target, createPPCAsmPrinterPass);
TargetRegistry::RegisterAsmPrinter(ThePPC64LETarget, createPPCAsmPrinterPass);
diff --git a/lib/Target/PowerPC/PPCBoolRetToInt.cpp b/lib/Target/PowerPC/PPCBoolRetToInt.cpp
new file mode 100644
index 000000000000..7920240bc2b9
--- /dev/null
+++ b/lib/Target/PowerPC/PPCBoolRetToInt.cpp
@@ -0,0 +1,253 @@
+//===- PPCBoolRetToInt.cpp - Convert bool literals to i32 if they are returned ==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements converting i1 values to i32 if they could be more
+// profitably allocated as GPRs rather than CRs. This pass will become totally
+// unnecessary if Register Bank Allocation and Global Instruction Selection ever
+// go upstream.
+//
+// Presently, the pass converts i1 Constants and Arguments to i32 if the
+// transitive closure of their uses includes only PHINodes, CallInsts, and
+// ReturnInsts. The rationale is that arguments are generally passed and returned
+// in GPRs rather than CRs, so casting them to i32 at the LLVM IR level will
+// actually save casts at the Machine Instruction level.
+//
+// It might be useful to expand this pass to add bit-wise operations to the list
+// of safe transitive closure types. Also, we miss some opportunities when LLVM
+// represents logical AND and OR operations with control flow rather than data
+// flow. For example, by lowering the expression: return (A && B && C)
+//
+// as: return A ? B && C : false.
+//
+// There's code in SimplifyCFG that could be used to turn control flow into data
+// flow using SelectInsts. Selects are slow on some architectures (P7/P8), so
+// this probably isn't good in general, but for the special case of i1, the
+// Selects could be further lowered to bit operations that are fast everywhere.
+//
+//===----------------------------------------------------------------------===//
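+//
+// Purely illustrative sketch (editorial, not part of the pass itself): given
+//
+//   define zeroext i1 @f(i1 zeroext %a) { ret i1 %a }
+//
+// the pass conceptually keeps the bool in a GPR-sized value,
+//
+//   %t = zext i1 %a to i32        ; promoted copy lives in a GPR
+//   %r = trunc i32 %t to i1       ; truncated back only at the use
+//   ret i1 %r
+//
+// so the i1 never needs to round-trip through a condition register.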
+
+#include "PPC.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace {
+
+#define DEBUG_TYPE "bool-ret-to-int"
+
+STATISTIC(NumBoolRetPromotion,
+ "Number of times a bool feeding a RetInst was promoted to an int");
+STATISTIC(NumBoolCallPromotion,
+ "Number of times a bool feeding a CallInst was promoted to an int");
+STATISTIC(NumBoolToIntPromotion,
+ "Total number of times a bool was promoted to an int");
+
+class PPCBoolRetToInt : public FunctionPass {
+
+ static SmallPtrSet<Value *, 8> findAllDefs(Value *V) {
+ SmallPtrSet<Value *, 8> Defs;
+ SmallVector<Value *, 8> WorkList;
+ WorkList.push_back(V);
+ Defs.insert(V);
+ while (!WorkList.empty()) {
+ Value *Curr = WorkList.back();
+ WorkList.pop_back();
+ if (User *CurrUser = dyn_cast<User>(Curr))
+ for (auto &Op : CurrUser->operands())
+ if (Defs.insert(Op).second)
+ WorkList.push_back(Op);
+ }
+ return Defs;
+ }
+
+ // Translate an i1 value to an equivalent i32 value:
+ static Value *translate(Value *V) {
+ Type *Int32Ty = Type::getInt32Ty(V->getContext());
+ if (Constant *C = dyn_cast<Constant>(V))
+ return ConstantExpr::getZExt(C, Int32Ty);
+ if (PHINode *P = dyn_cast<PHINode>(V)) {
+ // Temporarily set the operands to 0. We'll fix this later in
+ // runOnUse.
+ Value *Zero = Constant::getNullValue(Int32Ty);
+ PHINode *Q =
+ PHINode::Create(Int32Ty, P->getNumIncomingValues(), P->getName(), P);
+ for (unsigned i = 0; i < P->getNumOperands(); ++i)
+ Q->addIncoming(Zero, P->getIncomingBlock(i));
+ return Q;
+ }
+
+ Argument *A = dyn_cast<Argument>(V);
+ Instruction *I = dyn_cast<Instruction>(V);
+ assert((A || I) && "Unknown value type");
+
+ auto InstPt =
+ A ? &*A->getParent()->getEntryBlock().begin() : I->getNextNode();
+ return new ZExtInst(V, Int32Ty, "", InstPt);
+ }
+
+ typedef SmallPtrSet<const PHINode *, 8> PHINodeSet;
+
+ // A PHINode is Promotable if:
+ // 1. Its type is i1 AND
+ // 2. All of its uses are ReturnInst, CallInst, PHINode, or DbgInfoIntrinsic
+ // AND
+ // 3. All of its operands are Constant or Argument or
+ // CallInst or PHINode AND
+ // 4. All of its PHINode uses are Promotable AND
+ // 5. All of its PHINode operands are Promotable
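+ // For instance (editorial example, not from the original source):
+ //   %b = phi i1 [ true, %entry ], [ %c, %body ]   ; %c defined by a call
+ // is Promotable when its only users are a ReturnInst and other Promotable
+ // PHINodes, whereas a phi whose value feeds a 'store i1' is not (condition 2).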
+ static PHINodeSet getPromotablePHINodes(const Function &F) {
+ PHINodeSet Promotable;
+ // Condition 1
+ for (auto &BB : F)
+ for (auto &I : BB)
+ if (const PHINode *P = dyn_cast<PHINode>(&I))
+ if (P->getType()->isIntegerTy(1))
+ Promotable.insert(P);
+
+ SmallVector<const PHINode *, 8> ToRemove;
+ for (const auto &P : Promotable) {
+ // Condition 2 and 3
+ auto IsValidUser = [] (const Value *V) -> bool {
+ return isa<ReturnInst>(V) || isa<CallInst>(V) || isa<PHINode>(V) ||
+ isa<DbgInfoIntrinsic>(V);
+ };
+ auto IsValidOperand = [] (const Value *V) -> bool {
+ return isa<Constant>(V) || isa<Argument>(V) || isa<CallInst>(V) ||
+ isa<PHINode>(V);
+ };
+ const auto &Users = P->users();
+ const auto &Operands = P->operands();
+ if (!std::all_of(Users.begin(), Users.end(), IsValidUser) ||
+ !std::all_of(Operands.begin(), Operands.end(), IsValidOperand))
+ ToRemove.push_back(P);
+ }
+
+ // Iterate to convergence
+ auto IsPromotable = [&Promotable] (const Value *V) -> bool {
+ const PHINode *Phi = dyn_cast<PHINode>(V);
+ return !Phi || Promotable.count(Phi);
+ };
+ while (!ToRemove.empty()) {
+ for (auto &User : ToRemove)
+ Promotable.erase(User);
+ ToRemove.clear();
+
+ for (const auto &P : Promotable) {
+ // Condition 4 and 5
+ const auto &Users = P->users();
+ const auto &Operands = P->operands();
+ if (!std::all_of(Users.begin(), Users.end(), IsPromotable) ||
+ !std::all_of(Operands.begin(), Operands.end(), IsPromotable))
+ ToRemove.push_back(P);
+ }
+ }
+
+ return Promotable;
+ }
+
+ typedef DenseMap<Value *, Value *> B2IMap;
+
+ public:
+ static char ID;
+ PPCBoolRetToInt() : FunctionPass(ID) {
+ initializePPCBoolRetToIntPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) {
+ PHINodeSet PromotablePHINodes = getPromotablePHINodes(F);
+ B2IMap Bool2IntMap;
+ bool Changed = false;
+ for (auto &BB : F) {
+ for (auto &I : BB) {
+ if (ReturnInst *R = dyn_cast<ReturnInst>(&I))
+ if (F.getReturnType()->isIntegerTy(1))
+ Changed |=
+ runOnUse(R->getOperandUse(0), PromotablePHINodes, Bool2IntMap);
+
+ if (CallInst *CI = dyn_cast<CallInst>(&I))
+ for (auto &U : CI->operands())
+ if (U->getType()->isIntegerTy(1))
+ Changed |= runOnUse(U, PromotablePHINodes, Bool2IntMap);
+ }
+ }
+
+ return Changed;
+ }
+
+ static bool runOnUse(Use &U, const PHINodeSet &PromotablePHINodes,
+ B2IMap &BoolToIntMap) {
+ auto Defs = findAllDefs(U);
+
+ // If the values are all Constants or Arguments, don't bother
+ if (!std::any_of(Defs.begin(), Defs.end(), isa<Instruction, Value *>))
+ return false;
+
+ // Presently, we only know how to handle PHINode, Constant, and Arguments.
+ // Potentially, bitwise operations (AND, OR, XOR, NOT) and sign extension
+ // could also be handled in the future.
+ for (const auto &V : Defs)
+ if (!isa<PHINode>(V) && !isa<Constant>(V) && !isa<Argument>(V))
+ return false;
+
+ for (const auto &V : Defs)
+ if (const PHINode *P = dyn_cast<PHINode>(V))
+ if (!PromotablePHINodes.count(P))
+ return false;
+
+ if (isa<ReturnInst>(U.getUser()))
+ ++NumBoolRetPromotion;
+ if (isa<CallInst>(U.getUser()))
+ ++NumBoolCallPromotion;
+ ++NumBoolToIntPromotion;
+
+ for (const auto &V : Defs)
+ if (!BoolToIntMap.count(V))
+ BoolToIntMap[V] = translate(V);
+
+ // Replace the operands of the translated instructions. They were set to
+ // zero in the translate function.
+ for (auto &Pair : BoolToIntMap) {
+ User *First = dyn_cast<User>(Pair.first);
+ User *Second = dyn_cast<User>(Pair.second);
+ assert((!First || Second) && "translated from user to non-user!?");
+ if (First)
+ for (unsigned i = 0; i < First->getNumOperands(); ++i)
+ Second->setOperand(i, BoolToIntMap[First->getOperand(i)]);
+ }
+
+ Value *IntRetVal = BoolToIntMap[U];
+ Type *Int1Ty = Type::getInt1Ty(U->getContext());
+ Instruction *I = cast<Instruction>(U.getUser());
+ Value *BackToBool = new TruncInst(IntRetVal, Int1Ty, "backToBool", I);
+ U.set(BackToBool);
+
+ return true;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+};
+}
+
+char PPCBoolRetToInt::ID = 0;
+INITIALIZE_PASS(PPCBoolRetToInt, "bool-ret-to-int",
+ "Convert i1 constants to i32 if they are returned",
+ false, false)
+
+FunctionPass *llvm::createPPCBoolRetToIntPass() { return new PPCBoolRetToInt(); }
diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp
index 940d55ac1f36..73a5305197ad 100644
--- a/lib/Target/PowerPC/PPCBranchSelector.cpp
+++ b/lib/Target/PowerPC/PPCBranchSelector.cpp
@@ -91,7 +91,7 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
unsigned FuncSize = 0;
for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
++MFI) {
- MachineBasicBlock *MBB = MFI;
+ MachineBasicBlock *MBB = &*MFI;
// The end of the previous block may have extra nops if this block has an
// alignment requirement.
diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
index fd150beeb5a9..b6ac4d54d4c7 100644
--- a/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -98,7 +98,7 @@ namespace {
AU.addPreserved<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<ScalarEvolution>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
}
private:
@@ -112,6 +112,7 @@ namespace {
const DataLayout *DL;
DominatorTree *DT;
const TargetLibraryInfo *LibInfo;
+ bool PreserveLCSSA;
};
char PPCCTRLoops::ID = 0;
@@ -147,7 +148,7 @@ INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
false, false)
@@ -169,11 +170,12 @@ FunctionPass *llvm::createPPCCTRLoopsVerify() {
bool PPCCTRLoops::runOnFunction(Function &F) {
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- SE = &getAnalysis<ScalarEvolution>();
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
DL = &F.getParent()->getDataLayout();
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
LibInfo = TLIP ? &TLIP->getTLI() : nullptr;
+ PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
bool MadeChange = false;
@@ -250,8 +252,8 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
// If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr
// we're definitely using CTR.
case Intrinsic::ppc_is_decremented_ctr_nonzero:
- case Intrinsic::ppc_mtctr:
- return true;
+ case Intrinsic::ppc_mtctr:
+ return true;
// VisualStudio defines setjmp as _setjmp
#if defined(_MSC_VER) && defined(setjmp) && \
@@ -369,7 +371,7 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
true);
if (VTy == MVT::Other)
return true;
-
+
if (TLI->isOperationLegalOrCustom(Opcode, VTy))
continue;
else if (VTy.isVector() &&
@@ -537,7 +539,7 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
// the CTR register because some such uses might be reordered by the
// selection DAG after the mtctr instruction).
if (!Preheader || mightUseCTR(TT, Preheader))
- Preheader = InsertPreheaderForLoop(L, this);
+ Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);
if (!Preheader)
return MadeChange;
@@ -554,10 +556,9 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
if (!ExitCount->getType()->isPointerTy() &&
ExitCount->getType() != CountType)
ExitCount = SE->getZeroExtendExpr(ExitCount, CountType);
- ExitCount = SE->getAddExpr(ExitCount,
- SE->getConstant(CountType, 1));
- Value *ECValue = SCEVE.expandCodeFor(ExitCount, CountType,
- Preheader->getTerminator());
+ ExitCount = SE->getAddExpr(ExitCount, SE->getOne(CountType));
+ Value *ECValue =
+ SCEVE.expandCodeFor(ExitCount, CountType, Preheader->getTerminator());
IRBuilder<> CountBuilder(Preheader->getTerminator());
Module *M = Preheader->getParent()->getParent();
@@ -677,7 +678,7 @@ bool PPCCTRLoopsVerify::runOnMachineFunction(MachineFunction &MF) {
// any other instructions that might clobber the ctr register.
for (MachineFunction::iterator I = MF.begin(), IE = MF.end();
I != IE; ++I) {
- MachineBasicBlock *MBB = I;
+ MachineBasicBlock *MBB = &*I;
if (!MDT->isReachableFromEntry(MBB))
continue;
@@ -694,4 +695,3 @@ bool PPCCTRLoopsVerify::runOnMachineFunction(MachineFunction &MF) {
return false;
}
#endif // NDEBUG
-
diff --git a/lib/Target/PowerPC/PPCEarlyReturn.cpp b/lib/Target/PowerPC/PPCEarlyReturn.cpp
index fc89753ed94e..7cb1bb54c725 100644
--- a/lib/Target/PowerPC/PPCEarlyReturn.cpp
+++ b/lib/Target/PowerPC/PPCEarlyReturn.cpp
@@ -71,15 +71,20 @@ protected:
for (MachineBasicBlock::pred_iterator PI = ReturnMBB.pred_begin(),
PIE = ReturnMBB.pred_end(); PI != PIE; ++PI) {
bool OtherReference = false, BlockChanged = false;
+
+ if ((*PI)->empty())
+ continue;
+
for (MachineBasicBlock::iterator J = (*PI)->getLastNonDebugInstr();;) {
- MachineInstrBuilder MIB;
+ if (J == (*PI)->end())
+ break;
+
if (J->getOpcode() == PPC::B) {
if (J->getOperand(0).getMBB() == &ReturnMBB) {
// This is an unconditional branch to the return. Replace the
// branch with a blr.
- MIB =
- BuildMI(**PI, J, J->getDebugLoc(), TII->get(I->getOpcode()));
- MIB.copyImplicitOps(I);
+ BuildMI(**PI, J, J->getDebugLoc(), TII->get(I->getOpcode()))
+ .copyImplicitOps(I);
MachineBasicBlock::iterator K = J--;
K->eraseFromParent();
BlockChanged = true;
@@ -90,10 +95,10 @@ protected:
if (J->getOperand(2).getMBB() == &ReturnMBB) {
// This is a conditional branch to the return. Replace the branch
// with a bclr.
- MIB = BuildMI(**PI, J, J->getDebugLoc(), TII->get(PPC::BCCLR))
- .addImm(J->getOperand(0).getImm())
- .addReg(J->getOperand(1).getReg());
- MIB.copyImplicitOps(I);
+ BuildMI(**PI, J, J->getDebugLoc(), TII->get(PPC::BCCLR))
+ .addImm(J->getOperand(0).getImm())
+ .addReg(J->getOperand(1).getReg())
+ .copyImplicitOps(I);
MachineBasicBlock::iterator K = J--;
K->eraseFromParent();
BlockChanged = true;
@@ -104,11 +109,11 @@ protected:
if (J->getOperand(1).getMBB() == &ReturnMBB) {
// This is a conditional branch to the return. Replace the branch
// with a bclr.
- MIB = BuildMI(**PI, J, J->getDebugLoc(),
- TII->get(J->getOpcode() == PPC::BC ?
- PPC::BCLR : PPC::BCLRn))
- .addReg(J->getOperand(0).getReg());
- MIB.copyImplicitOps(I);
+ BuildMI(
+ **PI, J, J->getDebugLoc(),
+ TII->get(J->getOpcode() == PPC::BC ? PPC::BCLR : PPC::BCLRn))
+ .addReg(J->getOperand(0).getReg())
+ .copyImplicitOps(I);
MachineBasicBlock::iterator K = J--;
K->eraseFromParent();
BlockChanged = true;
@@ -146,7 +151,7 @@ protected:
}
for (unsigned i = 0, ie = PredToRemove.size(); i != ie; ++i)
- PredToRemove[i]->removeSuccessor(&ReturnMBB);
+ PredToRemove[i]->removeSuccessor(&ReturnMBB, true);
if (Changed && !ReturnMBB.hasAddressTaken()) {
// We now might be able to merge this blr-only block into its
@@ -156,7 +161,7 @@ protected:
if (PrevMBB.isLayoutSuccessor(&ReturnMBB) && PrevMBB.canFallThrough()) {
// Move the blr into the preceding block.
PrevMBB.splice(PrevMBB.end(), &ReturnMBB, I);
- PrevMBB.removeSuccessor(&ReturnMBB);
+ PrevMBB.removeSuccessor(&ReturnMBB, true);
}
}
diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
index 5f236f744fc4..b451ebf7f27a 100644
--- a/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/lib/Target/PowerPC/PPCFastISel.cpp
@@ -164,7 +164,8 @@ class PPCFastISel final : public FastISel {
unsigned DestReg, bool IsZExt);
unsigned PPCMaterializeFP(const ConstantFP *CFP, MVT VT);
unsigned PPCMaterializeGV(const GlobalValue *GV, MVT VT);
- unsigned PPCMaterializeInt(const Constant *C, MVT VT, bool UseSExt = true);
+ unsigned PPCMaterializeInt(const ConstantInt *CI, MVT VT,
+ bool UseSExt = true);
unsigned PPCMaterialize32BitInt(int64_t Imm,
const TargetRegisterClass *RC);
unsigned PPCMaterialize64BitInt(int64_t Imm,
@@ -292,10 +293,7 @@ bool PPCFastISel::isValueAvailable(const Value *V) const {
return true;
const auto *I = cast<Instruction>(V);
- if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB)
- return true;
-
- return false;
+ return FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB;
}
// Given a value Obj, create an Address object Addr that represents its
@@ -527,9 +525,9 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
// VSX only provides an indexed load.
if (Is32VSXLoad || Is64VSXLoad) return false;
- MachineMemOperand *MMO =
- FuncInfo.MF->getMachineMemOperand(
- MachinePointerInfo::getFixedStack(Addr.Base.FI, Addr.Offset),
+ MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*FuncInfo.MF, Addr.Base.FI,
+ Addr.Offset),
MachineMemOperand::MOLoad, MFI.getObjectSize(Addr.Base.FI),
MFI.getObjectAlignment(Addr.Base.FI));
@@ -660,9 +658,9 @@ bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
// VSX only provides an indexed store.
if (Is32VSXStore || Is64VSXStore) return false;
- MachineMemOperand *MMO =
- FuncInfo.MF->getMachineMemOperand(
- MachinePointerInfo::getFixedStack(Addr.Base.FI, Addr.Offset),
+ MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*FuncInfo.MF, Addr.Base.FI,
+ Addr.Offset),
MachineMemOperand::MOStore, MFI.getObjectSize(Addr.Base.FI),
MFI.getObjectAlignment(Addr.Base.FI));
@@ -774,8 +772,7 @@ bool PPCFastISel::SelectBranch(const Instruction *I) {
BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCC))
.addImm(PPCPred).addReg(CondReg).addMBB(TBB);
- fastEmitBranch(FBB, DbgLoc);
- FuncInfo.MBB->addSuccessor(TBB);
+ finishCondBranch(BI->getParent(), TBB, FBB);
return true;
}
} else if (const ConstantInt *CI =
@@ -1607,21 +1604,18 @@ bool PPCFastISel::SelectRet(const Instruction *I) {
if (ValLocs.size() > 1)
return false;
- // Special case for returning a constant integer of any size.
- // Materialize the constant as an i64 and copy it to the return
- // register. We still need to worry about properly extending the sign. E.g:
- // If the constant has only one bit, it means it is a boolean. Therefore
- // we can't use PPCMaterializeInt because it extends the sign which will
- // cause negations of the returned value to be incorrect as they are
- // implemented as the flip of the least significant bit.
- if (isa<ConstantInt>(*RV)) {
- const Constant *C = cast<Constant>(RV);
-
+ // Special case for returning a constant integer of any size - materialize
+ // the constant as an i64 and copy it to the return register.
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(RV)) {
CCValAssign &VA = ValLocs[0];
unsigned RetReg = VA.getLocReg();
- unsigned SrcReg = PPCMaterializeInt(C, MVT::i64,
- VA.getLocInfo() == CCValAssign::SExt);
+ // We still need to worry about properly extending the sign. For example,
+ // we could have only a single bit or a constant that needs zero
+ // extension rather than sign extension. Make sure we pass the return
+ // value extension property to integer materialization.
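+ // Editorial example (not in the original change): for an i1 return value
+ // of 'true', sign extension materializes 0xFFFFFFFFFFFFFFFF in the i64
+ // register while zero extension materializes 1, so honoring the location
+ // info's extension kind here is what keeps the returned bits correct.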
+ unsigned SrcReg =
+ PPCMaterializeInt(CI, MVT::i64, VA.getLocInfo() == CCValAssign::SExt);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), RetReg).addReg(SrcReg);
@@ -1761,8 +1755,8 @@ bool PPCFastISel::SelectIndirectBr(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCTR8));
const IndirectBrInst *IB = cast<IndirectBrInst>(I);
- for (unsigned i = 0, e = IB->getNumSuccessors(); i != e; ++i)
- FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[IB->getSuccessor(i)]);
+ for (const BasicBlock *SuccBB : IB->successors())
+ FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[SuccBB]);
return true;
}
@@ -1898,10 +1892,9 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
CodeModel::Model CModel = TM.getCodeModel();
- MachineMemOperand *MMO =
- FuncInfo.MF->getMachineMemOperand(
- MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad,
- (VT == MVT::f32) ? 4 : 8, Align);
+ MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+ MachinePointerInfo::getConstantPool(*FuncInfo.MF),
+ MachineMemOperand::MOLoad, (VT == MVT::f32) ? 4 : 8, Align);
unsigned Opc = (VT == MVT::f32) ? PPC::LFS : PPC::LFD;
unsigned TmpReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
@@ -1976,19 +1969,15 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA),
HighPartReg).addReg(PPC::X2).addGlobalAddress(GV);
- // If/when switches are implemented, jump tables should be handled
- // on the "if" path here.
- if (CModel == CodeModel::Large ||
- (GV->getType()->getElementType()->isFunctionTy() &&
- !GV->isStrongDefinitionForLinker()) ||
- GV->isDeclaration() || GV->hasCommonLinkage() ||
- GV->hasAvailableExternallyLinkage())
+ unsigned char GVFlags = PPCSubTarget->classifyGlobalReference(GV);
+ if (GVFlags & PPCII::MO_NLP_FLAG) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL),
DestReg).addGlobalAddress(GV).addReg(HighPartReg);
- else
+ } else {
// Otherwise generate the ADDItocL.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDItocL),
DestReg).addReg(HighPartReg).addGlobalAddress(GV);
+ }
}
return DestReg;
@@ -2085,12 +2074,11 @@ unsigned PPCFastISel::PPCMaterialize64BitInt(int64_t Imm,
// Materialize an integer constant into a register, and return
// the register number (or zero if we failed to handle it).
-unsigned PPCFastISel::PPCMaterializeInt(const Constant *C, MVT VT,
- bool UseSExt) {
+unsigned PPCFastISel::PPCMaterializeInt(const ConstantInt *CI, MVT VT,
+ bool UseSExt) {
// If we're using CR bit registers for i1 values, handle that as a special
// case first.
if (VT == MVT::i1 && PPCSubTarget->useCRBits()) {
- const ConstantInt *CI = cast<ConstantInt>(C);
unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(CI->isZero() ? PPC::CRUNSET : PPC::CRSET), ImmReg);
@@ -2105,12 +2093,17 @@ unsigned PPCFastISel::PPCMaterializeInt(const Constant *C, MVT VT,
&PPC::GPRCRegClass);
// If the constant is in range, use a load-immediate.
- const ConstantInt *CI = cast<ConstantInt>(C);
- if (isInt<16>(CI->getSExtValue())) {
+ if (UseSExt && isInt<16>(CI->getSExtValue())) {
+ unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI;
+ unsigned ImmReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg)
+ .addImm(CI->getSExtValue());
+ return ImmReg;
+ } else if (!UseSExt && isUInt<16>(CI->getZExtValue())) {
unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI;
unsigned ImmReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg)
- .addImm( (UseSExt) ? CI->getSExtValue() : CI->getZExtValue() );
+ .addImm(CI->getZExtValue());
return ImmReg;
}
@@ -2138,8 +2131,8 @@ unsigned PPCFastISel::fastMaterializeConstant(const Constant *C) {
return PPCMaterializeFP(CFP, VT);
else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
return PPCMaterializeGV(GV, VT);
- else if (isa<ConstantInt>(C))
- return PPCMaterializeInt(C, VT, VT != MVT::i1);
+ else if (const ConstantInt *CI = dyn_cast<ConstantInt>(C))
+ return PPCMaterializeInt(CI, VT, VT != MVT::i1);
return 0;
}
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index 08ae7174244a..beab844c6025 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -30,7 +30,7 @@ using namespace llvm;
/// VRRegNo - Map from a numbered VR register to its enum value.
///
-static const uint16_t VRRegNo[] = {
+static const MCPhysReg VRRegNo[] = {
PPC::V0 , PPC::V1 , PPC::V2 , PPC::V3 , PPC::V4 , PPC::V5 , PPC::V6 , PPC::V7 ,
PPC::V8 , PPC::V9 , PPC::V10, PPC::V11, PPC::V12, PPC::V13, PPC::V14, PPC::V15,
PPC::V16, PPC::V17, PPC::V18, PPC::V19, PPC::V20, PPC::V21, PPC::V22, PPC::V23,
@@ -270,7 +270,7 @@ static void RemoveVRSaveCode(MachineInstr *MI) {
// epilog blocks.
for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) {
// If last instruction is a return instruction, add an epilogue
- if (!I->empty() && I->back().isReturn()) {
+ if (I->isReturnBlock()) {
bool FoundIt = false;
for (MBBI = I->end(); MBBI != I->begin(); ) {
--MBBI;
@@ -306,9 +306,10 @@ static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) {
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
DebugLoc dl = MI->getDebugLoc();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
unsigned UsedRegMask = 0;
for (unsigned i = 0; i != 32; ++i)
- if (MF->getRegInfo().isPhysRegUsed(VRRegNo[i]))
+ if (MRI.isPhysRegModified(VRRegNo[i]))
UsedRegMask |= 1 << (31-i);
// Live in and live out values already must be in the mask, so don't bother
@@ -325,7 +326,7 @@ static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) {
for (MachineFunction::const_iterator BI = MF->begin(), BE = MF->end();
UsedRegMask != 0 && BI != BE; ++BI) {
const MachineBasicBlock &MBB = *BI;
- if (MBB.empty() || !MBB.back().isReturn())
+ if (!MBB.isReturnBlock())
continue;
const MachineInstr &Ret = MBB.back();
for (unsigned I = 0, E = Ret.getNumOperands(); I != E; ++I) {
@@ -555,9 +556,67 @@ void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const {
}
}
+bool PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB,
+ bool UseAtEnd,
+ unsigned *ScratchRegister) const {
+ RegScavenger RS;
+ unsigned R0 = Subtarget.isPPC64() ? PPC::X0 : PPC::R0;
+
+ if (ScratchRegister)
+ *ScratchRegister = R0;
+
+ // If MBB is an entry or exit block, use R0 as the scratch register
+ if ((UseAtEnd && MBB->isReturnBlock()) ||
+ (!UseAtEnd && (&MBB->getParent()->front() == MBB)))
+ return true;
+
+ RS.enterBasicBlock(MBB);
+
+ if (UseAtEnd && !MBB->empty()) {
+ // The scratch register will be used at the end of the block, so we must
+ // consider all registers used within the block.
+
+ MachineBasicBlock::iterator MBBI = MBB->getFirstTerminator();
+ // If no terminator, back iterator up to previous instruction.
+ if (MBBI == MBB->end())
+ MBBI = std::prev(MBBI);
+
+ if (MBBI != MBB->begin())
+ RS.forward(MBBI);
+ }
+
+ if (!RS.isRegUsed(R0))
+ return true;
+
+ unsigned Reg = RS.FindUnusedReg(Subtarget.isPPC64() ? &PPC::G8RCRegClass
+ : &PPC::GPRCRegClass);
+
+ // Make sure the register scavenger was able to find an available register
+ // If not, use R0 but return false to indicate no register was available and
+ // R0 must be used (as recommended by the ABI)
+ if (Reg == 0)
+ return false;
+
+ if (ScratchRegister)
+ *ScratchRegister = Reg;
+
+ return true;
+}
+
+bool PPCFrameLowering::canUseAsPrologue(const MachineBasicBlock &MBB) const {
+ MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
+
+ return findScratchRegister(TmpMBB, false, nullptr);
+}
+
+bool PPCFrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
+ MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
+
+ return findScratchRegister(TmpMBB, true, nullptr);
+}
+
void PPCFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
MachineBasicBlock::iterator MBBI = MBB.begin();
MachineFrameInfo *MFI = MF.getFrameInfo();
const PPCInstrInfo &TII =
@@ -589,7 +648,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
}
}
- // Move MBBI back to the beginning of the function.
+ // Move MBBI back to the beginning of the prologue block.
MBBI = MBB.begin();
// Work out frame sizes.
@@ -613,7 +672,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
unsigned BPReg = RegInfo->getBaseRegister(MF);
unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31;
unsigned LRReg = isPPC64 ? PPC::LR8 : PPC::LR;
- unsigned ScratchReg = isPPC64 ? PPC::X0 : PPC::R0;
+ unsigned ScratchReg = 0;
unsigned TempReg = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg
// ...(R12/X12 is volatile in both Darwin & SVR4, & can't be a function arg.)
const MCInstrDesc& MFLRInst = TII.get(isPPC64 ? PPC::MFLR8
@@ -642,6 +701,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
assert((isPPC64 || !isSVR4ABI || !(!FrameSize && (MustSaveLR || HasFP))) &&
"FrameSize must be >0 to save/restore the FP or LR for 32-bit SVR4.");
+ findScratchRegister(&MBB, false, &ScratchReg);
+ assert(ScratchReg && "No scratch register!");
+
int LROffset = getReturnSaveOffset();
int FPOffset = 0;
@@ -916,27 +978,18 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
}
void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
- MachineBasicBlock &MBB) const {
- MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
- assert(MBBI != MBB.end() && "Returning block has no terminator");
+ MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+ DebugLoc dl;
+
+ if (MBBI != MBB.end())
+ dl = MBBI->getDebugLoc();
+
const PPCInstrInfo &TII =
*static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
const PPCRegisterInfo *RegInfo =
static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
- unsigned RetOpcode = MBBI->getOpcode();
- DebugLoc dl;
-
- assert((RetOpcode == PPC::BLR ||
- RetOpcode == PPC::BLR8 ||
- RetOpcode == PPC::TCRETURNri ||
- RetOpcode == PPC::TCRETURNdi ||
- RetOpcode == PPC::TCRETURNai ||
- RetOpcode == PPC::TCRETURNri8 ||
- RetOpcode == PPC::TCRETURNdi8 ||
- RetOpcode == PPC::TCRETURNai8) &&
- "Can only insert epilog into returning blocks");
-
// Get alignment info so we know how to restore the SP.
const MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -959,7 +1012,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
unsigned SPReg = isPPC64 ? PPC::X1 : PPC::R1;
unsigned BPReg = RegInfo->getBaseRegister(MF);
unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31;
- unsigned ScratchReg = isPPC64 ? PPC::X0 : PPC::R0;
+ unsigned ScratchReg = 0;
unsigned TempReg = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg
const MCInstrDesc& MTLRInst = TII.get( isPPC64 ? PPC::MTLR8
: PPC::MTLR );
@@ -973,10 +1026,14 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
: PPC::ADDI );
const MCInstrDesc& AddInst = TII.get( isPPC64 ? PPC::ADD8
: PPC::ADD4 );
-
+
int LROffset = getReturnSaveOffset();
int FPOffset = 0;
+
+ findScratchRegister(&MBB, true, &ScratchReg);
+ assert(ScratchReg && "No scratch register!");
+
if (HasFP) {
if (isSVR4ABI) {
MachineFrameInfo *FFI = MF.getFrameInfo();
@@ -1008,25 +1065,30 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
PBPOffset = FFI->getObjectOffset(PBPIndex);
}
- bool UsesTCRet = RetOpcode == PPC::TCRETURNri ||
- RetOpcode == PPC::TCRETURNdi ||
- RetOpcode == PPC::TCRETURNai ||
- RetOpcode == PPC::TCRETURNri8 ||
- RetOpcode == PPC::TCRETURNdi8 ||
- RetOpcode == PPC::TCRETURNai8;
-
- if (UsesTCRet) {
- int MaxTCRetDelta = FI->getTailCallSPDelta();
- MachineOperand &StackAdjust = MBBI->getOperand(1);
- assert(StackAdjust.isImm() && "Expecting immediate value.");
- // Adjust stack pointer.
- int StackAdj = StackAdjust.getImm();
- int Delta = StackAdj - MaxTCRetDelta;
- assert((Delta >= 0) && "Delta must be positive");
- if (MaxTCRetDelta>0)
- FrameSize += (StackAdj +Delta);
- else
- FrameSize += StackAdj;
+ bool IsReturnBlock = (MBBI != MBB.end() && MBBI->isReturn());
+
+ if (IsReturnBlock) {
+ unsigned RetOpcode = MBBI->getOpcode();
+ bool UsesTCRet = RetOpcode == PPC::TCRETURNri ||
+ RetOpcode == PPC::TCRETURNdi ||
+ RetOpcode == PPC::TCRETURNai ||
+ RetOpcode == PPC::TCRETURNri8 ||
+ RetOpcode == PPC::TCRETURNdi8 ||
+ RetOpcode == PPC::TCRETURNai8;
+
+ if (UsesTCRet) {
+ int MaxTCRetDelta = FI->getTailCallSPDelta();
+ MachineOperand &StackAdjust = MBBI->getOperand(1);
+ assert(StackAdjust.isImm() && "Expecting immediate value.");
+ // Adjust stack pointer.
+ int StackAdj = StackAdjust.getImm();
+ int Delta = StackAdj - MaxTCRetDelta;
+ assert((Delta >= 0) && "Delta must be positive");
+ if (MaxTCRetDelta>0)
+ FrameSize += (StackAdj +Delta);
+ else
+ FrameSize += StackAdj;
+ }
}
// Frames of 32KB & larger require special handling because they cannot be
@@ -1066,7 +1128,6 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
.addImm(0)
.addReg(SPReg);
}
-
}
if (MustSaveLR)
@@ -1109,52 +1170,55 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
// Callee pop calling convention. Pop parameter/linkage area. Used for tail
// call optimization
- if (MF.getTarget().Options.GuaranteedTailCallOpt &&
- (RetOpcode == PPC::BLR || RetOpcode == PPC::BLR8) &&
- MF.getFunction()->getCallingConv() == CallingConv::Fast) {
- PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
- unsigned CallerAllocatedAmt = FI->getMinReservedArea();
-
- if (CallerAllocatedAmt && isInt<16>(CallerAllocatedAmt)) {
- BuildMI(MBB, MBBI, dl, AddImmInst, SPReg)
- .addReg(SPReg).addImm(CallerAllocatedAmt);
- } else {
- BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg)
+ if (IsReturnBlock) {
+ unsigned RetOpcode = MBBI->getOpcode();
+ if (MF.getTarget().Options.GuaranteedTailCallOpt &&
+ (RetOpcode == PPC::BLR || RetOpcode == PPC::BLR8) &&
+ MF.getFunction()->getCallingConv() == CallingConv::Fast) {
+ PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+ unsigned CallerAllocatedAmt = FI->getMinReservedArea();
+
+ if (CallerAllocatedAmt && isInt<16>(CallerAllocatedAmt)) {
+ BuildMI(MBB, MBBI, dl, AddImmInst, SPReg)
+ .addReg(SPReg).addImm(CallerAllocatedAmt);
+ } else {
+ BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg)
.addImm(CallerAllocatedAmt >> 16);
- BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg)
+ BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg)
.addReg(ScratchReg, RegState::Kill)
.addImm(CallerAllocatedAmt & 0xFFFF);
- BuildMI(MBB, MBBI, dl, AddInst)
+ BuildMI(MBB, MBBI, dl, AddInst)
.addReg(SPReg)
.addReg(FPReg)
.addReg(ScratchReg);
- }
- } else if (RetOpcode == PPC::TCRETURNdi) {
- MBBI = MBB.getLastNonDebugInstr();
- MachineOperand &JumpTarget = MBBI->getOperand(0);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB)).
- addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
- } else if (RetOpcode == PPC::TCRETURNri) {
- MBBI = MBB.getLastNonDebugInstr();
- assert(MBBI->getOperand(0).isReg() && "Expecting register operand.");
- BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR));
- } else if (RetOpcode == PPC::TCRETURNai) {
- MBBI = MBB.getLastNonDebugInstr();
- MachineOperand &JumpTarget = MBBI->getOperand(0);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA)).addImm(JumpTarget.getImm());
- } else if (RetOpcode == PPC::TCRETURNdi8) {
- MBBI = MBB.getLastNonDebugInstr();
- MachineOperand &JumpTarget = MBBI->getOperand(0);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB8)).
- addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
- } else if (RetOpcode == PPC::TCRETURNri8) {
- MBBI = MBB.getLastNonDebugInstr();
- assert(MBBI->getOperand(0).isReg() && "Expecting register operand.");
- BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR8));
- } else if (RetOpcode == PPC::TCRETURNai8) {
- MBBI = MBB.getLastNonDebugInstr();
- MachineOperand &JumpTarget = MBBI->getOperand(0);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA8)).addImm(JumpTarget.getImm());
+ }
+ } else if (RetOpcode == PPC::TCRETURNdi) {
+ MBBI = MBB.getLastNonDebugInstr();
+ MachineOperand &JumpTarget = MBBI->getOperand(0);
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB)).
+ addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
+ } else if (RetOpcode == PPC::TCRETURNri) {
+ MBBI = MBB.getLastNonDebugInstr();
+ assert(MBBI->getOperand(0).isReg() && "Expecting register operand.");
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR));
+ } else if (RetOpcode == PPC::TCRETURNai) {
+ MBBI = MBB.getLastNonDebugInstr();
+ MachineOperand &JumpTarget = MBBI->getOperand(0);
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA)).addImm(JumpTarget.getImm());
+ } else if (RetOpcode == PPC::TCRETURNdi8) {
+ MBBI = MBB.getLastNonDebugInstr();
+ MachineOperand &JumpTarget = MBBI->getOperand(0);
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB8)).
+ addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
+ } else if (RetOpcode == PPC::TCRETURNri8) {
+ MBBI = MBB.getLastNonDebugInstr();
+ assert(MBBI->getOperand(0).isReg() && "Expecting register operand.");
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR8));
+ } else if (RetOpcode == PPC::TCRETURNai8) {
+ MBBI = MBB.getLastNonDebugInstr();
+ MachineOperand &JumpTarget = MBBI->getOperand(0);
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA8)).addImm(JumpTarget.getImm());
+ }
}
}
@@ -1200,8 +1264,7 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF,
// Reserve stack space for the PIC Base register (R30).
// Only used in SVR4 32-bit.
if (FI->usesPICBase()) {
- int PBPSI = FI->getPICBasePointerSaveIndex();
- PBPSI = MFI->CreateFixedObject(4, -8, true);
+ int PBPSI = MFI->CreateFixedObject(4, -8, true);
FI->setPICBasePointerSaveIndex(PBPSI);
}
@@ -1710,3 +1773,8 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
return true;
}
+
+bool PPCFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
+ return (MF.getSubtarget<PPCSubtarget>().isSVR4ABI() &&
+ MF.getSubtarget<PPCSubtarget>().isPPC64());
+}
diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h
index d6a389bfbf0d..bbe1329a5352 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/lib/Target/PowerPC/PPCFrameLowering.h
@@ -29,6 +29,30 @@ class PPCFrameLowering: public TargetFrameLowering {
const unsigned LinkageSize;
const unsigned BasePointerSaveOffset;
+ /**
+ * \brief Find a register that can be used in function prologue and epilogue
+ *
+ * Find a register that can be used as the scratch register in function
+ * prologue and epilogue to save various registers (Link Register, Base
+ * Pointer, etc.). Prefer R0, if it is available. If it is not available,
+ * then choose a different register.
+ *
+ * This method will return true if an available register was found (including
+ * R0). If no available registers are found, the method returns false and sets
+ * ScratchRegister to R0, as per the recommendation in the ABI.
+ *
+ * \param[in] MBB The machine basic block to find an available register for
+ * \param[in] UseAtEnd Specify whether the scratch register will be used at
+ * the end of the basic block (i.e., will the scratch
+ * register kill a register defined in the basic block)
+ * \param[out] ScratchRegister The scratch register to use
+ * \return true if a scratch register was found, false if a scratch register
+ * was not found and R0 is being used as the default.
+ */
+ bool findScratchRegister(MachineBasicBlock *MBB,
+ bool UseAtEnd,
+ unsigned *ScratchRegister) const;
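+
+ // Usage sketch (editorial note): the prologue and epilogue emitters in
+ // PPCFrameLowering.cpp call this as, e.g., findScratchRegister(&MBB, false,
+ // &ScratchReg), and accept the R0 fallback recommended by the ABI when no
+ // other register is free.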
+
public:
PPCFrameLowering(const PPCSubtarget &STI);
@@ -92,6 +116,13 @@ public:
const SpillSlot *
getCalleeSavedSpillSlots(unsigned &NumEntries) const override;
+
+ bool enableShrinkWrapping(const MachineFunction &MF) const override;
+
+ /// Methods used by shrink wrapping to determine if MBB can be used for the
+ /// function prologue/epilogue.
+ bool canUseAsPrologue(const MachineBasicBlock &MBB) const override;
+ bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;
};
} // End llvm namespace
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 932226842bb7..1eaa8118ba0a 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -16,6 +16,8 @@
#include "MCTargetDesc/PPCPredicates.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCTargetMachine.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -52,6 +54,11 @@ static cl::opt<bool> BPermRewriterNoMasking(
"bit permutations"),
cl::Hidden);
+static cl::opt<bool> EnableBranchHint(
+ "ppc-use-branch-hint", cl::init(true),
+ cl::desc("Enable static hinting of branches on ppc"),
+ cl::Hidden);
+
namespace llvm {
void initializePPCDAGToDAGISelPass(PassRegistry&);
}
@@ -286,7 +293,7 @@ void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) {
// Find all return blocks, outputting a restore in each epilog.
for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) {
- if (!BB->empty() && BB->back().isReturn()) {
+ if (BB->isReturnBlock()) {
IP = BB->end(); --IP;
// Skip over all terminator instructions, which are part of the return
@@ -393,6 +400,55 @@ static bool isInt32Immediate(SDValue N, unsigned &Imm) {
return isInt32Immediate(N.getNode(), Imm);
}
+static unsigned getBranchHint(unsigned PCC, FunctionLoweringInfo *FuncInfo,
+ const SDValue &DestMBB) {
+ assert(isa<BasicBlockSDNode>(DestMBB));
+
+ if (!FuncInfo->BPI) return PPC::BR_NO_HINT;
+
+ const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
+ const TerminatorInst *BBTerm = BB->getTerminator();
+
+ if (BBTerm->getNumSuccessors() != 2) return PPC::BR_NO_HINT;
+
+ const BasicBlock *TBB = BBTerm->getSuccessor(0);
+ const BasicBlock *FBB = BBTerm->getSuccessor(1);
+
+ auto TProb = FuncInfo->BPI->getEdgeProbability(BB, TBB);
+ auto FProb = FuncInfo->BPI->getEdgeProbability(BB, FBB);
+
+ // We only want to handle cases that are easy to predict statically, e.g. a
+ // C++ throw statement that is very likely not taken, or a call to a function
+ // that never returns, e.g. stdlib exit(). So we set Threshold to filter out
+ // the unwanted cases.
+ //
+ // Below is the LLVM branch weight table; we only want to handle cases 1 and 2.
+ //
+ // Case Taken:Nontaken Example
+ // 1. Unreachable 1048575:1 C++ throw, stdlib exit(),
+ // 2. Invoke-terminating 1:1048575
+ // 3. Coldblock 4:64 __builtin_expect
+ // 4. Loop Branch 124:4 For loop
+ // 5. PH/ZH/FPH 20:12
+ const uint32_t Threshold = 10000;
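+ // Worked example (editorial, illustrating the check below): for the
+ // "unreachable" weighting 1048575:1, max/Threshold is about 1/10000, still
+ // greater than min (about 1/1048576), so a hint is emitted; for the
+ // __builtin_expect weighting 4:64, max/Threshold (~1/10600) falls below
+ // min (1/17), so the branch is left unhinted.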
+
+ if (std::max(TProb, FProb) / Threshold < std::min(TProb, FProb))
+ return PPC::BR_NO_HINT;
+
+ DEBUG(dbgs() << "Use branch hint for '" << FuncInfo->Fn->getName() << "::"
+ << BB->getName() << "'\n"
+ << " -> " << TBB->getName() << ": " << TProb << "\n"
+ << " -> " << FBB->getName() << ": " << FProb << "\n");
+
+ const BasicBlockSDNode *BBDN = cast<BasicBlockSDNode>(DestMBB);
+
+ // If Dest BasicBlock is False-BasicBlock (FBB), swap branch probabilities,
+ // because we want 'TProb' to be the branch probability to the Dest BasicBlock.
+ if (BBDN->getBasicBlock()->getBasicBlock() != TBB)
+ std::swap(TProb, FProb);
+
+ return (TProb > FProb) ? PPC::BR_TAKEN_HINT : PPC::BR_NONTAKEN_HINT;
+}
// isOpcWithIntImmediate - This method tests to see if the node is a specific
// opcode and that it has a immediate integer right operand.
@@ -564,7 +620,6 @@ static unsigned SelectInt64CountDirect(int64_t Imm) {
// Handle first 32 bits.
unsigned Lo = Imm & 0xFFFF;
- unsigned Hi = (Imm >> 16) & 0xFFFF;
// Simple value.
if (isInt<16>(Imm)) {
@@ -586,9 +641,9 @@ static unsigned SelectInt64CountDirect(int64_t Imm) {
++Result;
// Add in the last bits as required.
- if ((Hi = (Remainder >> 16) & 0xFFFF))
+ if ((Remainder >> 16) & 0xFFFF)
++Result;
- if ((Lo = Remainder & 0xFFFF))
+ if (Remainder & 0xFFFF)
++Result;
return Result;
@@ -1028,7 +1083,7 @@ class BitPermutationSelector {
BitGroups[BitGroups.size()-1].EndIdx == Bits.size()-1 &&
BitGroups[0].V == BitGroups[BitGroups.size()-1].V &&
BitGroups[0].RLAmt == BitGroups[BitGroups.size()-1].RLAmt) {
- DEBUG(dbgs() << "\tcombining final bit group with inital one\n");
+ DEBUG(dbgs() << "\tcombining final bit group with initial one\n");
BitGroups[BitGroups.size()-1].EndIdx = BitGroups[0].EndIdx;
BitGroups.erase(BitGroups.begin());
}
@@ -1557,10 +1612,7 @@ class BitPermutationSelector {
return false;
}
- if (VRI.RLAmt != EffRLAmt)
- return false;
-
- return true;
+ return VRI.RLAmt == EffRLAmt;
};
for (auto &BG : BitGroups) {
@@ -2781,7 +2833,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
if (PPCSubTarget->hasVSX() && (N->getValueType(0) == MVT::v2f64 ||
N->getValueType(0) == MVT::v2i64)) {
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
-
+
SDValue Op1 = N->getOperand(SVN->getMaskElt(0) < 2 ? 0 : 1),
Op2 = N->getOperand(SVN->getMaskElt(1) < 2 ? 0 : 1);
unsigned DM[2];
@@ -2798,7 +2850,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
LoadSDNode *LD = cast<LoadSDNode>(Op1.getOperand(0));
SDValue Base, Offset;
- if (LD->isUnindexed() &&
+ if (LD->isUnindexed() && LD->hasOneUse() && Op1.hasOneUse() &&
(LD->getMemoryVT() == MVT::f64 ||
LD->getMemoryVT() == MVT::i64) &&
SelectAddrIdxOnly(LD->getBasePtr(), Base, Offset)) {
@@ -2841,8 +2893,11 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
// Op #3 is the Dest MBB
// Op #4 is the Flag.
// Prevent PPC::PRED_* from being selected into LI.
- SDValue Pred =
- getI32Imm(cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(), dl);
+ unsigned PCC = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ if (EnableBranchHint)
+ PCC |= getBranchHint(PCC, FuncInfo, N->getOperand(3));
+
+ SDValue Pred = getI32Imm(PCC, dl);
SDValue Ops[] = { Pred, N->getOperand(2), N->getOperand(3),
N->getOperand(0), N->getOperand(4) };
return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops);
@@ -2871,6 +2926,9 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
BitComp, N->getOperand(4), N->getOperand(0));
}
+ if (EnableBranchHint)
+ PCC |= getBranchHint(PCC, FuncInfo, N->getOperand(4));
+
SDValue CondCode = SelectCC(N->getOperand(2), N->getOperand(3), CC, dl);
SDValue Ops[] = { getI32Imm(PCC, dl), CondCode,
N->getOperand(4), N->getOperand(0) };
@@ -2903,9 +2961,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
break;
// The first source operand is a TargetGlobalAddress or a TargetJumpTable.
- // If it is an externally defined symbol, a symbol with common linkage,
- // a non-local function address, or a jump table address, or if we are
- // generating code for large code model, we generate:
+ // If it must be toc-referenced according to PPCSubTarget, we generate:
// LDtocL(<ga:@sym>, ADDIStocHA(%X2, <ga:@sym>))
// Otherwise we generate:
// ADDItocL(ADDIStocHA(%X2, <ga:@sym>), <ga:@sym>)
@@ -2920,13 +2976,12 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
MVT::i64, GA, SDValue(Tmp, 0)));
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) {
- const GlobalValue *GValue = G->getGlobal();
- if ((GValue->getType()->getElementType()->isFunctionTy() &&
- !GValue->isStrongDefinitionForLinker()) ||
- GValue->isDeclaration() || GValue->hasCommonLinkage() ||
- GValue->hasAvailableExternallyLinkage())
+ const GlobalValue *GV = G->getGlobal();
+ unsigned char GVFlags = PPCSubTarget->classifyGlobalReference(GV);
+ if (GVFlags & PPCII::MO_NLP_FLAG) {
return transferMemOperands(N, CurDAG->getMachineNode(PPC::LDtocL, dl,
MVT::i64, GA, SDValue(Tmp, 0)));
+ }
}
return CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64,
@@ -3110,7 +3165,7 @@ SDValue PPCDAGToDAGISel::combineToCMPB(SDNode *N) {
if (!CurDAG->MaskedValueIsZero(Op0,
APInt::getHighBitsSet(Bits, Bits - (b+1)*8)))
return false;
-
+
LHS = Op0.getOperand(0);
RHS = Op0.getOperand(1);
return true;
@@ -3305,7 +3360,7 @@ void PPCDAGToDAGISel::PreprocessISelDAG() {
bool MadeChange = false;
while (Position != CurDAG->allnodes_begin()) {
- SDNode *N = --Position;
+ SDNode *N = &*--Position;
if (N->use_empty())
continue;
@@ -3989,7 +4044,7 @@ void PPCDAGToDAGISel::PeepholePPC64ZExt() {
bool MadeChange = false;
while (Position != CurDAG->allnodes_begin()) {
- SDNode *N = --Position;
+ SDNode *N = &*--Position;
// Skip dead nodes and any non-machine opcodes.
if (N->use_empty() || !N->isMachineOpcode())
continue;
@@ -4145,7 +4200,7 @@ void PPCDAGToDAGISel::PeepholePPC64() {
++Position;
while (Position != CurDAG->allnodes_begin()) {
- SDNode *N = --Position;
+ SDNode *N = &*--Position;
// Skip dead nodes and any non-machine opcodes.
if (N->use_empty() || !N->isMachineOpcode())
continue;
@@ -4184,16 +4239,24 @@ void PPCDAGToDAGISel::PeepholePPC64() {
break;
}
- // If this is a load or store with a zero offset, we may be able to
- // fold an add-immediate into the memory operation.
- if (!isa<ConstantSDNode>(N->getOperand(FirstOp)) ||
- N->getConstantOperandVal(FirstOp) != 0)
+ // If this is a load or store with a zero offset, or within the alignment,
+ // we may be able to fold an add-immediate into the memory operation.
+ // The check against alignment is below, as it can't occur until we check
+ // the arguments to N
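+ // Editorial sketch of the rewrite (not part of the original comment):
+ //   (LD 0,   (ADDI8 %Xb, @sym@toc@l))  ->  (LD @sym@toc@l,       %Xb)
+ //   (LD off, (ADDI8 %Xb, @sym@toc@l))  ->  (LD @sym@toc@l + off, %Xb)
+ // where the second, new form is only used for 0 <= off < align(@sym).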
+ if (!isa<ConstantSDNode>(N->getOperand(FirstOp)))
continue;
SDValue Base = N->getOperand(FirstOp + 1);
if (!Base.isMachineOpcode())
continue;
+ // On targets with fusion, we don't want this to fire and remove a fusion
+ // opportunity, unless a) it results in another fusion opportunity or
+ // b) optimizing for size.
+ if (PPCSubTarget->hasFusion() &&
+ (!MF->getFunction()->optForSize() && !Base.hasOneUse()))
+ continue;
+
unsigned Flags = 0;
bool ReplaceFlags = true;
@@ -4237,6 +4300,17 @@ void PPCDAGToDAGISel::PeepholePPC64() {
break;
}
+ SDValue ImmOpnd = Base.getOperand(1);
+ int MaxDisplacement = 0;
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) {
+ const GlobalValue *GV = GA->getGlobal();
+ MaxDisplacement = GV->getAlignment() - 1;
+ }
+
+ int Offset = N->getConstantOperandVal(FirstOp);
+ if (Offset < 0 || Offset > MaxDisplacement)
+ continue;
+
// We found an opportunity. Reverse the operands from the add
// immediate and substitute them into the load or store. If
// needed, update the target flags for the immediate operand to
@@ -4247,8 +4321,6 @@ void PPCDAGToDAGISel::PeepholePPC64() {
DEBUG(N->dump(CurDAG));
DEBUG(dbgs() << "\n");
- SDValue ImmOpnd = Base.getOperand(1);
-
// If the relocation information isn't already present on the
// immediate operand, add it now.
if (ReplaceFlags) {
@@ -4259,17 +4331,17 @@ void PPCDAGToDAGISel::PeepholePPC64() {
// is insufficient for the instruction encoding.
if (GV->getAlignment() < 4 &&
(StorageOpcode == PPC::LD || StorageOpcode == PPC::STD ||
- StorageOpcode == PPC::LWA)) {
+ StorageOpcode == PPC::LWA || (Offset % 4) != 0)) {
DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n");
continue;
}
- ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, 0, Flags);
+ ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, Offset, Flags);
} else if (ConstantPoolSDNode *CP =
dyn_cast<ConstantPoolSDNode>(ImmOpnd)) {
const Constant *C = CP->getConstVal();
ImmOpnd = CurDAG->getTargetConstantPool(C, MVT::i64,
CP->getAlignment(),
- 0, Flags);
+ Offset, Flags);
}
}
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 1b8f8fb2f45b..af9ad077a7ce 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -42,10 +42,6 @@
using namespace llvm;
-// FIXME: Remove this once soft-float is supported.
-static cl::opt<bool> DisablePPCFloatInVariadic("disable-ppc-float-in-variadic",
-cl::desc("disable saving float registers for va_start on PPC"), cl::Hidden);
-
static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
@@ -72,8 +68,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// Set up the register classes.
addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
- addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
- addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
+ if (!Subtarget.useSoftFloat()) {
+ addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
+ addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
+ }
// PowerPC has an i16 but no i8 (or i1) SEXTLOAD
for (MVT VT : MVT::integer_valuetypes()) {
@@ -107,8 +105,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
AddPromotedToType (ISD::SINT_TO_FP, MVT::i1,
isPPC64 ? MVT::i64 : MVT::i32);
setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
- AddPromotedToType (ISD::UINT_TO_FP, MVT::i1,
- isPPC64 ? MVT::i64 : MVT::i32);
+ AddPromotedToType(ISD::UINT_TO_FP, MVT::i1,
+ isPPC64 ? MVT::i64 : MVT::i32);
} else {
setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom);
@@ -257,10 +255,17 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
- setOperationAction(ISD::BITCAST, MVT::f32, Expand);
- setOperationAction(ISD::BITCAST, MVT::i32, Expand);
- setOperationAction(ISD::BITCAST, MVT::i64, Expand);
- setOperationAction(ISD::BITCAST, MVT::f64, Expand);
+ if (Subtarget.hasDirectMove()) {
+ setOperationAction(ISD::BITCAST, MVT::f32, Legal);
+ setOperationAction(ISD::BITCAST, MVT::i32, Legal);
+ setOperationAction(ISD::BITCAST, MVT::i64, Legal);
+ setOperationAction(ISD::BITCAST, MVT::f64, Legal);
+ } else {
+ setOperationAction(ISD::BITCAST, MVT::f32, Expand);
+ setOperationAction(ISD::BITCAST, MVT::i32, Expand);
+ setOperationAction(ISD::BITCAST, MVT::i64, Expand);
+ setOperationAction(ISD::BITCAST, MVT::f64, Expand);
+ }
// We cannot sextinreg(i1). Expand to shifts.
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
@@ -329,6 +334,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Custom);
+ setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom);
+ setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom);
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
@@ -403,9 +410,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// will selectively turn on ones that can be effectively codegen'd.
for (MVT VT : MVT::vector_valuetypes()) {
// add/sub are legal for all supported vector VT's.
- setOperationAction(ISD::ADD , VT, Legal);
- setOperationAction(ISD::SUB , VT, Legal);
-
+ setOperationAction(ISD::ADD, VT, Legal);
+ setOperationAction(ISD::SUB, VT, Legal);
+
// Vector instructions introduced in P8
if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) {
setOperationAction(ISD::CTPOP, VT, Legal);
@@ -477,6 +484,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
+ setOperationAction(ISD::ROTL, VT, Expand);
+ setOperationAction(ISD::ROTR, VT, Expand);
for (MVT InnerVT : MVT::vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
@@ -519,12 +528,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
}
-
- if (Subtarget.hasP8Altivec())
+ if (Subtarget.hasP8Altivec())
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
else
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
-
+
setOperationAction(ISD::MUL, MVT::v8i16, Custom);
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
@@ -545,6 +553,21 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
if (Subtarget.hasVSX()) {
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
+ if (Subtarget.hasP8Vector()) {
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal);
+ }
+ if (Subtarget.hasDirectMove()) {
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Legal);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Legal);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Legal);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
+ }
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
@@ -813,15 +836,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setLibcallName(RTLIB::SRA_I128, nullptr);
}
- if (isPPC64) {
- setStackPointerRegisterToSaveRestore(PPC::X1);
- setExceptionPointerRegister(PPC::X3);
- setExceptionSelectorRegister(PPC::X4);
- } else {
- setStackPointerRegisterToSaveRestore(PPC::R1);
- setExceptionPointerRegister(PPC::R3);
- setExceptionSelectorRegister(PPC::R4);
- }
+ setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1);
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::SINT_TO_FP);
@@ -942,9 +957,9 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign,
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
} else if (StructType *STy = dyn_cast<StructType>(Ty)) {
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ for (auto *EltTy : STy->elements()) {
unsigned EltAlign = 0;
- getMaxByValAlign(STy->getElementType(i), EltAlign, MaxMaxAlign);
+ getMaxByValAlign(EltTy, EltAlign, MaxMaxAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
if (MaxAlign == MaxMaxAlign)
@@ -969,6 +984,10 @@ unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty,
return Align;
}
+bool PPCTargetLowering::useSoftFloat() const {
+ return Subtarget.useSoftFloat();
+}
+
const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((PPCISD::NodeType)Opcode) {
case PPCISD::FIRST_NUMBER: break;
@@ -992,6 +1011,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::Lo: return "PPCISD::Lo";
case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY";
case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC";
+ case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET";
case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg";
case PPCISD::SRL: return "PPCISD::SRL";
case PPCISD::SRA: return "PPCISD::SRA";
@@ -1236,7 +1256,7 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
/// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).
-/// The ShuffleKind distinguishes between big-endian merges with two
+/// The ShuffleKind distinguishes between big-endian merges with two
/// different inputs (0), either-endian merges with two identical inputs (1),
/// and little-endian merges with two different inputs (2). For the latter,
/// the input operands are swapped (see PPCInstrAltivec.td).
@@ -1261,7 +1281,7 @@ bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
/// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes).
-/// The ShuffleKind distinguishes between big-endian merges with two
+/// The ShuffleKind distinguishes between big-endian merges with two
/// different inputs (0), either-endian merges with two identical inputs (1),
/// and little-endian merges with two different inputs (2). For the latter,
/// the input operands are swapped (see PPCInstrAltivec.td).
@@ -1353,7 +1373,7 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset,
* - 2 = little-endian merge with two different inputs (inputs are swapped for
* little-endian merges).
* \param[in] DAG The current SelectionDAG
- * \return true iff this shuffle mask
+ * \return true iff this shuffle mask is suitable for a VMRGEW or VMRGOW instruction.
*/
bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
unsigned ShuffleKind, SelectionDAG &DAG) {
@@ -1380,7 +1400,7 @@ bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
/// amount, otherwise return -1.
-/// The ShuffleKind distinguishes between big-endian operations with two
+/// The ShuffleKind distinguishes between big-endian operations with two
/// different inputs (0), either-endian operations with two identical inputs
/// (1), and little-endian operations with two different inputs (2). For the
/// latter, the input operands are swapped (see PPCInstrAltivec.td).
@@ -1513,8 +1533,8 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
for (unsigned i = 0; i != Multiple-1; ++i) {
if (!UniquedVals[i].getNode()) continue; // Must have been undefs.
- LeadingZero &= cast<ConstantSDNode>(UniquedVals[i])->isNullValue();
- LeadingOnes &= cast<ConstantSDNode>(UniquedVals[i])->isAllOnesValue();
+ LeadingZero &= isNullConstant(UniquedVals[i]);
+ LeadingOnes &= isAllOnesConstant(UniquedVals[i]);
}
// Finally, check the least significant entry.
if (LeadingZero) {
@@ -1629,7 +1649,6 @@ static bool isIntS16Immediate(SDValue Op, short &Imm) {
return isIntS16Immediate(Op.getNode(), Imm);
}
-
/// SelectAddressRegReg - Given the specified address, check to see if it
/// can be represented as an indexed [r+r] operation. Returns false if it
/// can be more efficiently represented with [r+imm].
@@ -1998,10 +2017,10 @@ static SDValue getTOCEntry(SelectionDAG &DAG, SDLoc dl, bool Is64Bit,
DAG.getNode(PPCISD::GlobalBaseReg, dl, VT);
SDValue Ops[] = { GA, Reg };
- return DAG.getMemIntrinsicNode(PPCISD::TOC_ENTRY, dl,
- DAG.getVTList(VT, MVT::Other), Ops, VT,
- MachinePointerInfo::getGOT(), 0, false, true,
- false, 0);
+ return DAG.getMemIntrinsicNode(
+ PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0, false, true,
+ false, 0);
}
SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
@@ -2092,6 +2111,9 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
// large models could be added if users need it, at the cost of
// additional complexity.
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+ if (DAG.getTarget().Options.EmulatedTLS)
+ return LowerToTLSEmulatedModel(GA, DAG);
+
SDLoc dl(GA);
const GlobalValue *GV = GA->getGlobal();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
@@ -2480,7 +2502,6 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
// */
// } va_list[1];
-
SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32);
SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32);
@@ -2536,7 +2557,7 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
#include "PPCGenCallingConv.inc"
-// Function whose sole purpose is to kill compiler warnings
+// Function whose sole purpose is to kill compiler warnings
// stemming from unused functions included from PPCGenCallingConv.inc.
CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const {
return Flag ? CC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS;
@@ -2933,8 +2954,9 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
PPC::F8
};
unsigned NumFPArgRegs = array_lengthof(FPArgRegs);
- if (DisablePPCFloatInVariadic)
- NumFPArgRegs = 0;
+
+ if (Subtarget.useSoftFloat())
+ NumFPArgRegs = 0;
FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs));
FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs));
@@ -3177,15 +3199,15 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
EVT ObjType = (ObjSize == 1 ? MVT::i8 :
(ObjSize == 2 ? MVT::i16 : MVT::i32));
Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg,
- MachinePointerInfo(FuncArg),
- ObjType, false, false, 0);
+ MachinePointerInfo(&*FuncArg), ObjType,
+ false, false, 0);
} else {
// For sizes that don't fit a truncating store (3, 5, 6, 7),
// store the whole register as-is to the parameter save area
// slot.
- Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo(FuncArg),
- false, false, 0);
+ Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN,
+ MachinePointerInfo(&*FuncArg), false, false, 0);
}
MemOps.push_back(Store);
@@ -3212,9 +3234,9 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
SDValue Off = DAG.getConstant(j, dl, PtrVT);
Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off);
}
- SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr,
- MachinePointerInfo(FuncArg, j),
- false, false, 0);
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, Addr,
+ MachinePointerInfo(&*FuncArg, j), false, false, 0);
MemOps.push_back(Store);
++GPR_idx;
}
@@ -3592,7 +3614,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16;
SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo(FuncArg),
+ MachinePointerInfo(&*FuncArg),
ObjType, false, false, 0);
MemOps.push_back(Store);
++GPR_idx;
@@ -3615,9 +3637,9 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
- SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo(FuncArg, j),
- false, false, 0);
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN,
+ MachinePointerInfo(&*FuncArg, j), false, false, 0);
MemOps.push_back(Store);
++GPR_idx;
ArgOffset += PtrByteSize;
@@ -3880,7 +3902,6 @@ struct TailCallArgumentInfo {
TailCallArgumentInfo() : FrameIdx(0) {}
};
-
}
/// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot.
@@ -3895,9 +3916,10 @@ StoreTailCallArgumentsToStackSlot(SelectionDAG &DAG,
SDValue FIN = TailCallArgs[i].FrameIdxOp;
int FI = TailCallArgs[i].FrameIdx;
// Store relative to framepointer.
- MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, FIN,
- MachinePointerInfo::getFixedStack(FI),
- false, false, 0));
+ MemOpChains.push_back(DAG.getStore(
+ Chain, dl, Arg, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false,
+ false, 0));
}
}
@@ -3922,9 +3944,10 @@ static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG,
NewRetAddrLoc, true);
EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT);
- Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx,
- MachinePointerInfo::getFixedStack(NewRetAddr),
- false, false, 0);
+ Chain = DAG.getStore(
+ Chain, dl, OldRetAddr, NewRetAddrFrIdx,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), NewRetAddr),
+ false, false, 0);
// When using the 32/64-bit SVR4 ABI there is no need to move the FP stack
// slot as the FP is never overwritten.
@@ -3933,9 +3956,10 @@ static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG,
int NewFPIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize, NewFPLoc,
true);
SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT);
- Chain = DAG.getStore(Chain, dl, OldFP, NewFramePtrIdx,
- MachinePointerInfo::getFixedStack(NewFPIdx),
- false, false, 0);
+ Chain = DAG.getStore(
+ Chain, dl, OldFP, NewFramePtrIdx,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), NewFPIdx),
+ false, false, 0);
}
}
return Chain;
@@ -4812,8 +4836,8 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
continue;
break;
case MVT::v4f32:
- // When using QPX, this is handled like a FP register, otherwise, it
- // is an Altivec register.
+ // When using QPX, this is handled like an FP register; otherwise, it
+ // is an Altivec register.
if (Subtarget.hasQPX()) {
if (++NumFPRsUsed <= NumFPRs)
continue;
@@ -5318,9 +5342,10 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
- Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr,
- MachinePointerInfo::getStack(TOCSaveOffset),
- false, false, 0);
+ Chain = DAG.getStore(
+ Val.getValue(1), dl, Val, AddPtr,
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset),
+ false, false, 0);
// In the ELFv2 ABI, R12 must contain the address of an indirect callee.
// This does not mean the MTCTR instruction must use R12; it's easier
// to model this as an extra parameter, so do that.
@@ -5341,9 +5366,9 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
PrepareTailCall(DAG, InFlag, Chain, dl, true, SPDiff, NumBytes, LROp,
FPOp, true, TailCallArguments);
- return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint,
- hasNest, DAG, RegsToPass, InFlag, Chain, CallSeqStart,
- Callee, SPDiff, NumBytes, Ins, InVals, CS);
+ return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, hasNest,
+ DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee,
+ SPDiff, NumBytes, Ins, InVals, CS);
}
SDValue
@@ -5798,6 +5823,22 @@ PPCTargetLowering::LowerReturn(SDValue Chain,
return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps);
}
+SDValue PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(
+ SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const {
+ SDLoc dl(Op);
+
+ // Get the correct type for integers.
+ EVT IntVT = Op.getValueType();
+
+ // Get the inputs.
+ SDValue Chain = Op.getOperand(0);
+ SDValue FPSIdx = getFramePointerFrameIndex(DAG);
+ // Build a DYNAREAOFFSET node.
+ SDValue Ops[2] = {Chain, FPSIdx};
+ SDVTList VTs = DAG.getVTList(IntVT);
+ return DAG.getNode(PPCISD::DYNAREAOFFSET, dl, VTs, Ops);
+}
+
SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
const PPCSubtarget &Subtarget) const {
// When we pop the dynamic allocation we need to restore the SP link.
@@ -5828,10 +5869,7 @@ SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
false, false, 0);
}
-
-
-SDValue
-PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const {
+SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool isPPC64 = Subtarget.isPPC64();
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout());
@@ -5983,6 +6021,10 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
if (!DAG.getTarget().Options.NoInfsFPMath ||
!DAG.getTarget().Options.NoNaNsFPMath)
return Op;
+ // TODO: Propagate flags from the select rather than global settings.
+ SDNodeFlags Flags;
+ Flags.setNoInfs(true);
+ Flags.setNoNaNs(true);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
@@ -6033,7 +6075,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
case ISD::SETNE:
std::swap(TV, FV);
case ISD::SETEQ:
- Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS);
+ Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
@@ -6043,25 +6085,25 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV);
case ISD::SETULT:
case ISD::SETLT:
- Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS);
+ Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
case ISD::SETOGE:
case ISD::SETGE:
- Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS);
+ Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
case ISD::SETUGT:
case ISD::SETGT:
- Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS);
+ Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, &Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
case ISD::SETOLE:
case ISD::SETLE:
- Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS);
+ Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, &Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
@@ -6101,7 +6143,8 @@ void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
(Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT());
SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64);
int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex();
- MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(FI);
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
// Emit a store to the stack slot.
SDValue Chain;
@@ -6291,11 +6334,11 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
// into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
// This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
-
+
SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::f64);
- FPHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64,
- FPHalfs, FPHalfs, FPHalfs, FPHalfs);
-
+ FPHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64, FPHalfs, FPHalfs,
+ FPHalfs, FPHalfs);
+
Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
if (Op.getValueType() != MVT::v4f64)
@@ -6421,17 +6464,18 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
int FrameIdx = FrameInfo->CreateStackObject(4, 4, false);
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
- SDValue Store =
- DAG.getStore(DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx,
- MachinePointerInfo::getFixedStack(FrameIdx),
- false, false, 0);
+ SDValue Store = DAG.getStore(
+ DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx),
+ false, false, 0);
assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
"Expected an i32 store");
RLI.Ptr = FIdx;
RLI.Chain = Store;
- RLI.MPI = MachinePointerInfo::getFixedStack(FrameIdx);
+ RLI.MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
RLI.Alignment = 4;
MachineMemOperand *MMO =
@@ -6472,16 +6516,18 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
int FrameIdx = FrameInfo->CreateStackObject(4, 4, false);
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
- MachinePointerInfo::getFixedStack(FrameIdx),
- false, false, 0);
+ SDValue Store = DAG.getStore(
+ DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx),
+ false, false, 0);
assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
"Expected an i32 store");
RLI.Ptr = FIdx;
RLI.Chain = Store;
- RLI.MPI = MachinePointerInfo::getFixedStack(FrameIdx);
+ RLI.MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
RLI.Alignment = 4;
}
@@ -6506,14 +6552,16 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
Op.getOperand(0));
// STD the extended value into the stack slot.
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Ext64, FIdx,
- MachinePointerInfo::getFixedStack(FrameIdx),
- false, false, 0);
+ SDValue Store = DAG.getStore(
+ DAG.getEntryNode(), dl, Ext64, FIdx,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx),
+ false, false, 0);
// Load the value as a double.
- Ld = DAG.getLoad(MVT::f64, dl, Store, FIdx,
- MachinePointerInfo::getFixedStack(FrameIdx),
- false, false, false, 0);
+ Ld = DAG.getLoad(
+ MVT::f64, dl, Store, FIdx,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx),
+ false, false, false, 0);
}
// FCFID it and return it.
@@ -6735,7 +6783,6 @@ static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
DAG.getConstant(IID, dl, MVT::i32), Op0, Op1, Op2);
}
-
/// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified
/// amount. The result has the specified value type.
static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt,
@@ -6768,7 +6815,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// to a zero vector to get the boolean result.
MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
int FrameIdx = FrameInfo->CreateStackObject(16, 16, false);
- MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FrameIdx);
+ MachinePointerInfo PtrInfo =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
@@ -6794,8 +6842,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
for (unsigned i = 0; i < 4; ++i) {
if (BVN->getOperand(i).getOpcode() == ISD::UNDEF)
CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext()));
- else if (cast<ConstantSDNode>(BVN->getOperand(i))->
- getConstantIntValue()->isZero())
+ else if (isNullConstant(BVN->getOperand(i)))
continue;
else
CV[i] = One;
@@ -6814,9 +6861,9 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
ValueVTs.push_back(MVT::Other); // chain
SDVTList VTs = DAG.getVTList(ValueVTs);
- return DAG.getMemIntrinsicNode(PPCISD::QVLFSb,
- dl, VTs, Ops, MVT::v4f32,
- MachinePointerInfo::getConstantPool());
+ return DAG.getMemIntrinsicNode(
+ PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
}
SmallVector<SDValue, 4> Stores;
@@ -6915,7 +6962,6 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
if (SextVal >= -16 && SextVal <= 15)
return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl);
-
// Two instruction sequences.
// If this value is in the range [-32,30] and is even, use:
@@ -7304,11 +7350,11 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
V1, V2, VPermMask);
}
-/// getAltivecCompareInfo - Given an intrinsic, return false if it is not an
-/// altivec comparison. If it is, return true and fill in Opc/isDot with
+/// getVectorCompareInfo - Given an intrinsic, return false if it is not a
+/// vector comparison. If it is, return true and fill in Opc/isDot with
/// information about the intrinsic.
-static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc,
- bool &isDot, const PPCSubtarget &Subtarget) {
+static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
+ bool &isDot, const PPCSubtarget &Subtarget) {
unsigned IntrinsicID =
cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue();
CompareOpc = -1;
@@ -7321,12 +7367,11 @@ static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc,
case Intrinsic::ppc_altivec_vcmpequb_p: CompareOpc = 6; isDot = 1; break;
case Intrinsic::ppc_altivec_vcmpequh_p: CompareOpc = 70; isDot = 1; break;
case Intrinsic::ppc_altivec_vcmpequw_p: CompareOpc = 134; isDot = 1; break;
- case Intrinsic::ppc_altivec_vcmpequd_p:
+ case Intrinsic::ppc_altivec_vcmpequd_p:
if (Subtarget.hasP8Altivec()) {
- CompareOpc = 199;
- isDot = 1;
- }
- else
+ CompareOpc = 199;
+ isDot = 1;
+ } else
return false;
break;
@@ -7335,28 +7380,48 @@ static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc,
case Intrinsic::ppc_altivec_vcmpgtsb_p: CompareOpc = 774; isDot = 1; break;
case Intrinsic::ppc_altivec_vcmpgtsh_p: CompareOpc = 838; isDot = 1; break;
case Intrinsic::ppc_altivec_vcmpgtsw_p: CompareOpc = 902; isDot = 1; break;
- case Intrinsic::ppc_altivec_vcmpgtsd_p:
+ case Intrinsic::ppc_altivec_vcmpgtsd_p:
if (Subtarget.hasP8Altivec()) {
- CompareOpc = 967;
- isDot = 1;
- }
- else
+ CompareOpc = 967;
+ isDot = 1;
+ } else
return false;
break;
case Intrinsic::ppc_altivec_vcmpgtub_p: CompareOpc = 518; isDot = 1; break;
case Intrinsic::ppc_altivec_vcmpgtuh_p: CompareOpc = 582; isDot = 1; break;
case Intrinsic::ppc_altivec_vcmpgtuw_p: CompareOpc = 646; isDot = 1; break;
- case Intrinsic::ppc_altivec_vcmpgtud_p:
+ case Intrinsic::ppc_altivec_vcmpgtud_p:
if (Subtarget.hasP8Altivec()) {
- CompareOpc = 711;
- isDot = 1;
+ CompareOpc = 711;
+ isDot = 1;
+ } else
+ return false;
+
+ break;
+ // VSX predicate comparisons use the same infrastructure
+ case Intrinsic::ppc_vsx_xvcmpeqdp_p:
+ case Intrinsic::ppc_vsx_xvcmpgedp_p:
+ case Intrinsic::ppc_vsx_xvcmpgtdp_p:
+ case Intrinsic::ppc_vsx_xvcmpeqsp_p:
+ case Intrinsic::ppc_vsx_xvcmpgesp_p:
+ case Intrinsic::ppc_vsx_xvcmpgtsp_p:
+ if (Subtarget.hasVSX()) {
+ switch (IntrinsicID) {
+ case Intrinsic::ppc_vsx_xvcmpeqdp_p: CompareOpc = 99; break;
+ case Intrinsic::ppc_vsx_xvcmpgedp_p: CompareOpc = 115; break;
+ case Intrinsic::ppc_vsx_xvcmpgtdp_p: CompareOpc = 107; break;
+ case Intrinsic::ppc_vsx_xvcmpeqsp_p: CompareOpc = 67; break;
+ case Intrinsic::ppc_vsx_xvcmpgesp_p: CompareOpc = 83; break;
+ case Intrinsic::ppc_vsx_xvcmpgtsp_p: CompareOpc = 75; break;
+ }
+ isDot = 1;
}
- else
+ else
return false;
break;
-
+
// Normal Comparisons.
case Intrinsic::ppc_altivec_vcmpbfp: CompareOpc = 966; isDot = 0; break;
case Intrinsic::ppc_altivec_vcmpeqfp: CompareOpc = 198; isDot = 0; break;
@@ -7365,10 +7430,9 @@ static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc,
case Intrinsic::ppc_altivec_vcmpequw: CompareOpc = 134; isDot = 0; break;
case Intrinsic::ppc_altivec_vcmpequd:
if (Subtarget.hasP8Altivec()) {
- CompareOpc = 199;
- isDot = 0;
- }
- else
+ CompareOpc = 199;
+ isDot = 0;
+ } else
return false;
break;
@@ -7377,24 +7441,22 @@ static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc,
case Intrinsic::ppc_altivec_vcmpgtsb: CompareOpc = 774; isDot = 0; break;
case Intrinsic::ppc_altivec_vcmpgtsh: CompareOpc = 838; isDot = 0; break;
case Intrinsic::ppc_altivec_vcmpgtsw: CompareOpc = 902; isDot = 0; break;
- case Intrinsic::ppc_altivec_vcmpgtsd:
+ case Intrinsic::ppc_altivec_vcmpgtsd:
if (Subtarget.hasP8Altivec()) {
- CompareOpc = 967;
- isDot = 0;
- }
- else
+ CompareOpc = 967;
+ isDot = 0;
+ } else
return false;
break;
case Intrinsic::ppc_altivec_vcmpgtub: CompareOpc = 518; isDot = 0; break;
case Intrinsic::ppc_altivec_vcmpgtuh: CompareOpc = 582; isDot = 0; break;
case Intrinsic::ppc_altivec_vcmpgtuw: CompareOpc = 646; isDot = 0; break;
- case Intrinsic::ppc_altivec_vcmpgtud:
+ case Intrinsic::ppc_altivec_vcmpgtud:
if (Subtarget.hasP8Altivec()) {
- CompareOpc = 711;
- isDot = 0;
- }
- else
+ CompareOpc = 711;
+ isDot = 0;
+ } else
return false;
break;
@@ -7411,7 +7473,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDLoc dl(Op);
int CompareOpc;
bool isDot;
- if (!getAltivecCompareInfo(Op, CompareOpc, isDot, Subtarget))
+ if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget))
return SDValue(); // Don't custom lower most intrinsics.
// If this is a non-dot comparison, make the VCMP node and we are done.
@@ -7536,7 +7598,7 @@ SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
FPHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64,
FPHalfs, FPHalfs, FPHalfs, FPHalfs);
- Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
+ Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
// Now convert to an integer and store.
Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
@@ -7545,7 +7607,8 @@ SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
int FrameIdx = FrameInfo->CreateStackObject(16, 16, false);
- MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FrameIdx);
+ MachinePointerInfo PtrInfo =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
@@ -7752,7 +7815,7 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
FPHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64,
FPHalfs, FPHalfs, FPHalfs, FPHalfs);
- Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
+ Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
// Now convert to an integer and store.
Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
@@ -7761,7 +7824,8 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
int FrameIdx = FrameInfo->CreateStackObject(16, 16, false);
- MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FrameIdx);
+ MachinePointerInfo PtrInfo =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
@@ -7798,11 +7862,10 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType());
Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx);
- Stores.push_back(DAG.getTruncStore(StoreChain, dl, Loads[i], Idx,
- SN->getPointerInfo().getWithOffset(i),
- MVT::i8 /* memory type */,
- SN->isNonTemporal(), SN->isVolatile(),
- 1 /* alignment */, SN->getAAInfo()));
+ Stores.push_back(DAG.getTruncStore(
+ StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i),
+ MVT::i8 /* memory type */, SN->isNonTemporal(), SN->isVolatile(),
+ 1 /* alignment */, SN->getAAInfo()));
}
StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
@@ -7906,6 +7969,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG, Subtarget);
case ISD::DYNAMIC_STACKALLOC:
return LowerDYNAMIC_STACKALLOC(Op, DAG, Subtarget);
+ case ISD::GET_DYNAMIC_AREA_OFFSET: return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG, Subtarget);
case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
@@ -7971,7 +8035,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
N->getValueType(0));
SDVTList VTs = DAG.getVTList(SVT, MVT::Other);
SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0),
- N->getOperand(1));
+ N->getOperand(1));
Results.push_back(NewInt);
Results.push_back(NewInt.getValue(1));
@@ -8020,7 +8084,6 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
}
}
-
//===----------------------------------------------------------------------===//
// Other Lowering Code
//===----------------------------------------------------------------------===//
@@ -8089,8 +8152,7 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineFunction *F = BB->getParent();
- MachineFunction::iterator It = BB;
- ++It;
+ MachineFunction::iterator It = ++BB->getIterator();
unsigned dest = MI->getOperand(0).getReg();
unsigned ptrA = MI->getOperand(1).getReg();
@@ -8160,8 +8222,7 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI,
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineFunction *F = BB->getParent();
- MachineFunction::iterator It = BB;
- ++It;
+ MachineFunction::iterator It = ++BB->getIterator();
unsigned dest = MI->getOperand(0).getReg();
unsigned ptrA = MI->getOperand(1).getReg();
@@ -8283,8 +8344,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
MachineRegisterInfo &MRI = MF->getRegInfo();
const BasicBlock *BB = MBB->getBasicBlock();
- MachineFunction::iterator I = MBB;
- ++I;
+ MachineFunction::iterator I = ++MBB->getIterator();
// Memory Reference
MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
@@ -8384,8 +8444,8 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
.addMBB(mainMBB);
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::B)).addMBB(sinkMBB);
- thisMBB->addSuccessor(mainMBB, /* weight */ 0);
- thisMBB->addSuccessor(sinkMBB, /* weight */ 1);
+ thisMBB->addSuccessor(mainMBB, BranchProbability::getZero());
+ thisMBB->addSuccessor(sinkMBB, BranchProbability::getOne());
// mainMBB:
// mainDstReg = 0
@@ -8562,8 +8622,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// To "insert" these instructions we actually have to insert their
// control-flow patterns.
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator It = BB;
- ++It;
+ MachineFunction::iterator It = ++BB->getIterator();
MachineFunction *F = BB->getParent();
@@ -8675,7 +8734,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// mfspr Rx,TBU # load from TBU
// mfspr Ry,TB # load from TB
// mfspr Rz,TBU # load from TBU
- // cmpw crX,Rx,Rz # check if ‘old’=’new’
+ // cmpw crX,Rx,Rz # check if 'old'='new'
// bne readLoop # branch if they're not equal
// ...
@@ -9137,7 +9196,7 @@ SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand,
return SDValue();
}
-bool PPCTargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const {
+unsigned PPCTargetLowering::combineRepeatedFPDivisors() const {
// Note: This functionality is used only when unsafe-fp-math is enabled, and
// on cores with reciprocal estimates (which are used when unsafe-fp-math is
// enabled for division), this functionality is redundant with the default
@@ -9150,12 +9209,26 @@ bool PPCTargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const {
// one FP pipeline) for three or more FDIVs (for generic OOO cores).
switch (Subtarget.getDarwinDirective()) {
default:
- return NumUsers > 2;
+ return 3;
case PPC::DIR_440:
case PPC::DIR_A2:
case PPC::DIR_E500mc:
case PPC::DIR_E5500:
- return NumUsers > 1;
+ return 2;
+ }
+}
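The thresholds returned here feed the generic repeated-FP-divisor combine, which only runs under unsafe-fp-math: once a single divisor has at least that many FDIV users, the divisions are rewritten as one reciprocal plus multiplies. A minimal source-level sketch of the pattern the default threshold of 3 is tuned for (the struct and function names are made up for illustration):

  struct Vec3 { double X, Y, Z; };

  // Three FDIVs share the divisor Len. With fast-math enabled and the
  // threshold of 3 returned above, the combine computes T = 1.0 / Len once
  // and multiplies by T, so three divides become one divide and three
  // multiplies.
  Vec3 scaleBy(Vec3 V, double Len) {
    return { V.X / Len, V.Y / Len, V.Z / Len };
  }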
+
+// isConsecutiveLSLoc needs to work even if all adds have not yet been
+// collapsed, and so we need to look through chains of them.
+static void getBaseWithConstantOffset(SDValue Loc, SDValue &Base,
+ int64_t& Offset, SelectionDAG &DAG) {
+ if (DAG.isBaseWithConstantOffset(Loc)) {
+ Base = Loc.getOperand(0);
+ Offset += cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue();
+
+ // The base might itself be a base plus an offset, and if so, accumulate
+ // that as well.
+ getBaseWithConstantOffset(Loc.getOperand(0), Base, Offset, DAG);
}
}
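To see what the recursion buys isConsecutiveLSLoc below, here is a small self-contained model of the accumulation, using made-up node shapes rather than real SelectionDAG types:

  #include <cassert>

  struct ToyNode {
    const ToyNode *Base; // non-null means this node is Base + Imm
    long Imm;
  };

  // Mirrors getBaseWithConstantOffset: walk through nested add-immediates,
  // summing the offsets and remembering the innermost base.
  static void flatten(const ToyNode *Loc, const ToyNode *&Base, long &Offset) {
    if (Loc->Base) {
      Base = Loc->Base;
      Offset += Loc->Imm;
      flatten(Loc->Base, Base, Offset);
    }
  }

  int main() {
    ToyNode X{nullptr, 0};   // the true base
    ToyNode A{&X, 8};        // X + 8
    ToyNode B{&A, 16};       // (X + 8) + 16
    const ToyNode *Base = &B;
    long Offset = 0;
    flatten(&B, Base, Offset);
    assert(Base == &X && Offset == 24); // both adds were looked through
    return 0;
  }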
@@ -9178,16 +9251,18 @@ static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base,
return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes);
}
- // Handle X+C
- if (DAG.isBaseWithConstantOffset(Loc) && Loc.getOperand(0) == BaseLoc &&
- cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue() == Dist*Bytes)
+ SDValue Base1 = Loc, Base2 = BaseLoc;
+ int64_t Offset1 = 0, Offset2 = 0;
+ getBaseWithConstantOffset(Loc, Base1, Offset1, DAG);
+ getBaseWithConstantOffset(BaseLoc, Base2, Offset2, DAG);
+ if (Base1 == Base2 && Offset1 == (Offset2 + Dist * Bytes))
return true;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
const GlobalValue *GV1 = nullptr;
const GlobalValue *GV2 = nullptr;
- int64_t Offset1 = 0;
- int64_t Offset2 = 0;
+ Offset1 = 0;
+ Offset2 = 0;
bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1);
bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
if (isGA1 && isGA2 && GV1 == GV2)
@@ -9343,7 +9418,7 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
for (SmallSet<SDNode *, 16>::iterator I = LoadRoots.begin(),
IE = LoadRoots.end(); I != IE; ++I) {
Queue.push_back(*I);
-
+
while (!Queue.empty()) {
SDNode *LoadRoot = Queue.pop_back_val();
if (!Visited.insert(LoadRoot).second)
@@ -9470,7 +9545,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
}
// Visit all inputs, collect all binary operations (and, or, xor and
- // select) that are all fed by extensions.
+ // select) that are all fed by extensions.
while (!BinOps.empty()) {
SDValue BinOp = BinOps.back();
BinOps.pop_back();
@@ -9492,7 +9567,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) ||
isa<ConstantSDNode>(BinOp.getOperand(i))) {
- Inputs.push_back(BinOp.getOperand(i));
+ Inputs.push_back(BinOp.getOperand(i));
} else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
BinOp.getOperand(i).getOpcode() == ISD::OR ||
BinOp.getOperand(i).getOpcode() == ISD::XOR ||
@@ -9572,7 +9647,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
if (isa<ConstantSDNode>(Inputs[i]))
continue;
else
- DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0));
+ DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0));
}
// Replace all operations (these are all the same, but have a different
@@ -9682,7 +9757,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
SmallPtrSet<SDNode *, 16> Visited;
// Visit all inputs, collect all binary operations (and, or, xor and
- // select) that are all fed by truncations.
+ // select) that are all fed by truncations.
while (!BinOps.empty()) {
SDValue BinOp = BinOps.back();
BinOps.pop_back();
@@ -9701,7 +9776,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
isa<ConstantSDNode>(BinOp.getOperand(i))) {
- Inputs.push_back(BinOp.getOperand(i));
+ Inputs.push_back(BinOp.getOperand(i));
} else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
BinOp.getOperand(i).getOpcode() == ISD::OR ||
BinOp.getOperand(i).getOpcode() == ISD::XOR ||
@@ -9915,10 +9990,11 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
"Invalid extension type");
EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0), DAG.getDataLayout());
SDValue ShiftCst =
- DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy);
- return DAG.getNode(ISD::SRA, dl, N->getValueType(0),
- DAG.getNode(ISD::SHL, dl, N->getValueType(0),
- N->getOperand(0), ShiftCst), ShiftCst);
+ DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy);
+ return DAG.getNode(
+ ISD::SRA, dl, N->getValueType(0),
+ DAG.getNode(ISD::SHL, dl, N->getValueType(0), N->getOperand(0), ShiftCst),
+ ShiftCst);
}
SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
@@ -10102,16 +10178,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
switch (N->getOpcode()) {
default: break;
case PPCISD::SHL:
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
- if (C->isNullValue()) // 0 << V -> 0.
+ if (isNullConstant(N->getOperand(0))) // 0 << V -> 0.
return N->getOperand(0);
- }
break;
case PPCISD::SRL:
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
- if (C->isNullValue()) // 0 >>u V -> 0.
+ if (isNullConstant(N->getOperand(0))) // 0 >>u V -> 0.
return N->getOperand(0);
- }
break;
case PPCISD::SRA:
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
@@ -10122,7 +10194,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
break;
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
- case ISD::ANY_EXTEND:
+ case ISD::ANY_EXTEND:
return DAGCombineExtBoolTrunc(N, DCI);
case ISD::TRUNCATE:
case ISD::SETCC:
@@ -10277,7 +10349,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
// original unaligned load.
MachineFunction &MF = DAG.getMachineFunction();
MachineMemOperand *BaseMMO =
- MF.getMachineMemOperand(LD->getMemOperand(), -MemVT.getStoreSize()+1,
+ MF.getMachineMemOperand(LD->getMemOperand(),
+ -(long)MemVT.getStoreSize()+1,
2*MemVT.getStoreSize()-1);
// Create the new base load.
@@ -10527,7 +10600,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::BRCOND: {
SDValue Cond = N->getOperand(1);
SDValue Target = N->getOperand(2);
-
+
if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() ==
Intrinsic::ppc_is_decremented_ctr_nonzero) {
@@ -10558,8 +10631,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() ==
Intrinsic::ppc_is_decremented_ctr_nonzero &&
isa<ConstantSDNode>(LHS.getOperand(1)) &&
- !cast<ConstantSDNode>(LHS.getOperand(1))->getConstantIntValue()->
- isZero())
+ !isNullConstant(LHS.getOperand(1)))
LHS = LHS.getOperand(0);
if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
@@ -10588,7 +10660,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) &&
- getAltivecCompareInfo(LHS, CompareOpc, isDot, Subtarget)) {
+ getVectorCompareInfo(LHS, CompareOpc, isDot, Subtarget)) {
assert(isDot && "Can't compare against a vector result!");
// If this is a comparison against something other than 0/1, then we know
@@ -10739,8 +10811,11 @@ unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
// boundary so that the entire loop fits in one instruction-cache line.
uint64_t LoopSize = 0;
for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I)
- for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J)
+ for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) {
LoopSize += TII->GetInstSizeInBytes(J);
+ if (LoopSize > 32)
+ break;
+ }
if (LoopSize > 16 && LoopSize <= 32)
return 5;
@@ -10868,17 +10943,19 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &PPC::QFRCRegClass);
if (VT == MVT::v4f32 && Subtarget.hasQPX())
return std::make_pair(0U, &PPC::QSRCRegClass);
- return std::make_pair(0U, &PPC::VRRCRegClass);
+ if (Subtarget.hasAltivec())
+ return std::make_pair(0U, &PPC::VRRCRegClass);
case 'y': // crrc
return std::make_pair(0U, &PPC::CRRCRegClass);
}
- } else if (Constraint == "wc") { // an individual CR bit.
+ } else if (Constraint == "wc" && Subtarget.useCRBits()) {
+ // An individual CR bit.
return std::make_pair(0U, &PPC::CRBITRCRegClass);
- } else if (Constraint == "wa" || Constraint == "wd" ||
- Constraint == "wf") {
+ } else if ((Constraint == "wa" || Constraint == "wd" ||
+ Constraint == "wf") && Subtarget.hasVSX()) {
return std::make_pair(0U, &PPC::VSRCRegClass);
- } else if (Constraint == "ws") {
- if (VT == MVT::f32)
+ } else if (Constraint == "ws" && Subtarget.hasVSX()) {
+ if (VT == MVT::f32 && Subtarget.hasP8Vector())
return std::make_pair(0U, &PPC::VSSRCRegClass);
else
return std::make_pair(0U, &PPC::VSFRCRegClass);
@@ -10908,7 +10985,6 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return R;
}
-
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
@@ -11358,9 +11434,7 @@ bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
- if (BitSize == 0 || BitSize > 64)
- return false;
- return true;
+ return !(BitSize == 0 || BitSize > 64);
}
bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
@@ -11477,11 +11551,21 @@ PPCTargetLowering::getScratchRegisters(CallingConv::ID) const {
return ScratchRegs;
}
+unsigned PPCTargetLowering::getExceptionPointerRegister(
+ const Constant *PersonalityFn) const {
+ return Subtarget.isPPC64() ? PPC::X3 : PPC::R3;
+}
+
+unsigned PPCTargetLowering::getExceptionSelectorRegister(
+ const Constant *PersonalityFn) const {
+ return Subtarget.isPPC64() ? PPC::X4 : PPC::R4;
+}
+
bool
PPCTargetLowering::shouldExpandBuildVectorWithShuffles(
EVT VT , unsigned DefinedValues) const {
if (VT == MVT::v2i64)
- return false;
+ return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves
if (Subtarget.hasQPX()) {
if (VT == MVT::v4f32 || VT == MVT::v4f64 || VT == MVT::v4i1)
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 6e13533cfdb3..44bcb8942cfc 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -79,6 +79,11 @@ namespace llvm {
/// compute an allocation on the stack.
DYNALLOC,
+ /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to
+ /// compute an offset from native SP to the address of the most recent
+ /// dynamic alloca.
+ DYNAREAOFFSET,
+
/// GlobalBaseReg - On Darwin, this node represents the result of the mflr
/// at function entry, used for PIC code.
GlobalBaseReg,
@@ -423,6 +428,8 @@ namespace llvm {
/// DAG node.
const char *getTargetNodeName(unsigned Opcode) const override;
+ bool useSoftFloat() const override;
+
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
return MVT::i32;
}
@@ -655,8 +662,17 @@ namespace llvm {
return Ty->isArrayTy();
}
- private:
+ /// If a physical register, this returns the register that receives the
+ /// exception address on entry to an EH pad.
+ unsigned
+ getExceptionPointerRegister(const Constant *PersonalityFn) const override;
+
+ /// If a physical register, this returns the register that receives the
+ /// exception typeid on entry to a landing pad.
+ unsigned
+ getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
+ private:
struct ReuseLoadInfo {
SDValue Ptr;
SDValue Chain;
@@ -719,6 +735,8 @@ namespace llvm {
const PPCSubtarget &Subtarget) const;
SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
const PPCSubtarget &Subtarget) const;
+ SDValue LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, SelectionDAG &DAG,
+ const PPCSubtarget &Subtarget) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG,
const PPCSubtarget &Subtarget) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
@@ -853,7 +871,7 @@ namespace llvm {
bool &UseOneConstNR) const override;
SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI,
unsigned &RefinementSteps) const override;
- bool combineRepeatedFPDivisors(unsigned NumUsers) const override;
+ unsigned combineRepeatedFPDivisors() const override;
CCAssignFn *useFastISelCCs(unsigned Flag) const;
};
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index d62833037db5..075e093e41a1 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -369,6 +369,8 @@ let Defs = [X1], Uses = [X1] in
def DYNALLOC8 : Pseudo<(outs g8rc:$result), (ins g8rc:$negsize, memri:$fpsi),"#DYNALLOC8",
[(set i64:$result,
(PPCdynalloc i64:$negsize, iaddr:$fpsi))]>;
+def DYNAREAOFFSET8 : Pseudo<(outs i64imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET8",
+ [(set i64:$result, (PPCdynareaoffset iaddr:$fpsi))]>;
let Defs = [LR8] in {
def MTLR8 : XFXForm_7_ext<31, 467, 8, (outs), (ins g8rc:$rS),
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index d4e666cc1f3e..c17603a7718a 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -144,6 +144,9 @@ int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
int Latency = PPCGenInstrInfo::getOperandLatency(ItinData, DefMI, DefIdx,
UseMI, UseIdx);
+ if (!DefMI->getParent())
+ return Latency;
+
const MachineOperand &DefMO = DefMI->getOperand(DefIdx);
unsigned Reg = DefMO.getReg();
@@ -186,6 +189,60 @@ int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
return Latency;
}
+// This function does not list all associative and commutative operations, but
+// only those worth feeding through the machine combiner in an attempt to
+// reduce the critical path. Mostly, this means floating-point operations,
+// because they have high latencies (compared to other operations, such as
+// and/or, which are also associative and commutative, but have low latencies).
+bool PPCInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
+ switch (Inst.getOpcode()) {
+ // FP Add:
+ case PPC::FADD:
+ case PPC::FADDS:
+ // FP Multiply:
+ case PPC::FMUL:
+ case PPC::FMULS:
+ // Altivec Add:
+ case PPC::VADDFP:
+ // VSX Add:
+ case PPC::XSADDDP:
+ case PPC::XVADDDP:
+ case PPC::XVADDSP:
+ case PPC::XSADDSP:
+ // VSX Multiply:
+ case PPC::XSMULDP:
+ case PPC::XVMULDP:
+ case PPC::XVMULSP:
+ case PPC::XSMULSP:
+ // QPX Add:
+ case PPC::QVFADD:
+ case PPC::QVFADDS:
+ case PPC::QVFADDSs:
+ // QPX Multiply:
+ case PPC::QVFMUL:
+ case PPC::QVFMULS:
+ case PPC::QVFMULSs:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool PPCInstrInfo::getMachineCombinerPatterns(
+ MachineInstr &Root,
+ SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+ // Using the machine combiner in this way is potentially expensive, so
+ // restrict to when aggressive optimizations are desired.
+ if (Subtarget.getTargetMachine().getOptLevel() != CodeGenOpt::Aggressive)
+ return false;
+
+ // FP reassociation is only legal when we don't need strict IEEE semantics.
+ if (!Root.getParent()->getParent()->getTarget().Options.UnsafeFPMath)
+ return false;
+
+ return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
+}
+
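What the combiner is after with the opcodes listed above is a shorter dependence chain, not fewer instructions. A source-level sketch (plain C++, only valid to reassociate under unsafe/fast math, which is exactly what the check above enforces):

  // Left-leaning form: three dependent FADDs, so the critical path is
  // three FADD latencies long.
  double sumSerial(double A, double B, double C, double D) {
    return ((A + B) + C) + D;
  }

  // Reassociated form the machine combiner can produce: the two inner adds
  // are independent and can issue in parallel, cutting the path to two
  // FADD latencies for the same number of instructions.
  double sumBalanced(double A, double B, double C, double D) {
    return (A + B) + (C + D);
  }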
// Detect 32 -> 64-bit extensions where we may reuse the low sub-register.
bool PPCInstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
unsigned &SrcReg, unsigned &DstReg,
@@ -259,16 +316,16 @@ unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
return 0;
}
-// commuteInstruction - We can commute rlwimi instructions, but only if the
-// rotate amt is zero. We also have to munge the immediates a bit.
-MachineInstr *
-PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
+MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr *MI,
+ bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const {
MachineFunction &MF = *MI->getParent()->getParent();
// Normal instructions can be commuted the obvious way.
if (MI->getOpcode() != PPC::RLWIMI &&
MI->getOpcode() != PPC::RLWIMIo)
- return TargetInstrInfo::commuteInstruction(MI, NewMI);
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
// Note that RLWIMI can be commuted as a 32-bit instruction, but not as a
// 64-bit instruction (so we don't handle PPC::RLWIMI8 here), because
// changing the relative order of the mask operands might change what happens
@@ -286,6 +343,8 @@ PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
// Op0 = (Op2 & ~M) | (Op1 & M)
// Swap op1/op2
+ assert(((OpIdx1 == 1 && OpIdx2 == 2) || (OpIdx1 == 2 && OpIdx2 == 1)) &&
+ "Only the operands 1 and 2 can be swapped in RLSIMI/RLWIMIo.");
unsigned Reg0 = MI->getOperand(0).getReg();
unsigned Reg1 = MI->getOperand(1).getReg();
unsigned Reg2 = MI->getOperand(2).getReg();
@@ -353,9 +412,9 @@ bool PPCInstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
if (AltOpc == -1)
return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
- SrcOpIdx1 = 2;
- SrcOpIdx2 = 3;
- return true;
+ // The commutable operand indices are 2 and 3. Return them in SrcOpIdx1
+ // and SrcOpIdx2.
+ return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3);
}
void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB,
@@ -996,11 +1055,10 @@ PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MBB.insert(MI, NewMIs[i]);
const MachineFrameInfo &MFI = *MF.getFrameInfo();
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
- MachineMemOperand::MOStore,
- MFI.getObjectSize(FrameIdx),
- MFI.getObjectAlignment(FrameIdx));
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FrameIdx),
+ MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx),
+ MFI.getObjectAlignment(FrameIdx));
NewMIs.back()->addMemOperand(MF, MMO);
}
@@ -1109,11 +1167,10 @@ PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MBB.insert(MI, NewMIs[i]);
const MachineFrameInfo &MFI = *MF.getFrameInfo();
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
- MachineMemOperand::MOLoad,
- MFI.getObjectSize(FrameIdx),
- MFI.getObjectAlignment(FrameIdx));
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FrameIdx),
+ MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
+ MFI.getObjectAlignment(FrameIdx));
NewMIs.back()->addMemOperand(MF, MMO);
}
@@ -1214,7 +1271,7 @@ bool PPCInstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
unsigned NumT, unsigned ExtraT,
MachineBasicBlock &FMBB,
unsigned NumF, unsigned ExtraF,
- const BranchProbability &Probability) const {
+ BranchProbability Probability) const {
return !(MBBDefinesCTR(TMBB) && MBBDefinesCTR(FMBB));
}
@@ -1691,13 +1748,13 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
MI->setDesc(NewDesc);
if (NewDesc.ImplicitDefs)
- for (const uint16_t *ImpDefs = NewDesc.getImplicitDefs();
+ for (const MCPhysReg *ImpDefs = NewDesc.getImplicitDefs();
*ImpDefs; ++ImpDefs)
if (!MI->definesRegister(*ImpDefs))
MI->addOperand(*MI->getParent()->getParent(),
MachineOperand::CreateReg(*ImpDefs, true, true));
if (NewDesc.ImplicitUses)
- for (const uint16_t *ImpUses = NewDesc.getImplicitUses();
+ for (const MCPhysReg *ImpUses = NewDesc.getImplicitUses();
*ImpUses; ++ImpUses)
if (!MI->readsRegister(*ImpUses))
MI->addOperand(*MI->getParent()->getParent(),
@@ -1737,3 +1794,35 @@ unsigned PPCInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
}
}
+std::pair<unsigned, unsigned>
+PPCInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+ const unsigned Mask = PPCII::MO_ACCESS_MASK;
+ return std::make_pair(TF & Mask, TF & ~Mask);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+PPCInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+ using namespace PPCII;
+ static const std::pair<unsigned, const char *> TargetFlags[] = {
+ {MO_LO, "ppc-lo"},
+ {MO_HA, "ppc-ha"},
+ {MO_TPREL_LO, "ppc-tprel-lo"},
+ {MO_TPREL_HA, "ppc-tprel-ha"},
+ {MO_DTPREL_LO, "ppc-dtprel-lo"},
+ {MO_TLSLD_LO, "ppc-tlsld-lo"},
+ {MO_TOC_LO, "ppc-toc-lo"},
+ {MO_TLS, "ppc-tls"}};
+ return makeArrayRef(TargetFlags);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+PPCInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
+ using namespace PPCII;
+ static const std::pair<unsigned, const char *> TargetFlags[] = {
+ {MO_PLT_OR_STUB, "ppc-plt-or-stub"},
+ {MO_PIC_FLAG, "ppc-pic"},
+ {MO_NLP_FLAG, "ppc-nlp"},
+ {MO_NLP_HIDDEN_FLAG, "ppc-nlp-hidden"}};
+ return makeArrayRef(TargetFlags);
+}
+
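The decompose hook added above splits an operand's target flags into one "direct" access kind (selected by MO_ACCESS_MASK) and the remaining independent bits, which is what lets MIR print them as a single named flag plus a set of bitmask flags. A standalone sketch of the same idea, using made-up flag values purely for illustration (the real PPCII constants live in PPC.h and may differ):

    #include <cassert>
    #include <utility>

    // Hypothetical flag layout, for illustration only: the low nibble holds
    // independent bitmask flags, the next nibble holds one mutually exclusive
    // "access kind", mirroring how MO_ACCESS_MASK is used above.
    enum : unsigned {
      FLAG_PIC    = 0x1,   // bitmask flag
      FLAG_NLP    = 0x2,   // bitmask flag
      ACCESS_LO   = 0x10,  // direct flag
      ACCESS_HA   = 0x20,  // direct flag
      ACCESS_MASK = 0xF0
    };

    static std::pair<unsigned, unsigned> decomposeFlags(unsigned TF) {
      return {TF & ACCESS_MASK, TF & ~ACCESS_MASK}; // (direct, bitmask) parts
    }

    int main() {
      auto Parts = decomposeFlags(ACCESS_HA | FLAG_PIC);
      assert(Parts.first == ACCESS_HA);  // one named access kind
      assert(Parts.second == FLAG_PIC);  // remaining independent bits
      return 0;
    }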
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index 40badae644d6..c3c3a480a6aa 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -79,6 +79,23 @@ class PPCInstrInfo : public PPCGenInstrInfo {
SmallVectorImpl<MachineInstr*> &NewMIs,
bool &NonRI, bool &SpillsVRS) const;
virtual void anchor();
+
+protected:
+ /// Commutes the operands in the given instruction.
+ /// The commutable operands are specified by their indices OpIdx1 and OpIdx2.
+ ///
+  /// Do not call this method for a non-commutable instruction or for a
+  /// non-commutable pair of operand indices OpIdx1 and OpIdx2.
+  /// Even though the instruction is commutable, the method may still
+  /// fail to commute the operands; a null pointer is returned in such cases.
+ ///
+ /// For example, we can commute rlwimi instructions, but only if the
+ /// rotate amt is zero. We also have to munge the immediates a bit.
+ MachineInstr *commuteInstructionImpl(MachineInstr *MI,
+ bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const override;
+
public:
explicit PPCInstrInfo(PPCSubtarget &STI);
@@ -119,6 +136,19 @@ public:
return false;
}
+ bool useMachineCombiner() const override {
+ return true;
+ }
+
+ /// Return true when there is potentially a faster code sequence
+ /// for an instruction chain ending in <Root>. All potential patterns are
+ /// output in the <Pattern> array.
+ bool getMachineCombinerPatterns(
+ MachineInstr &Root,
+ SmallVectorImpl<MachineCombinerPattern> &P) const override;
+
+ bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
+
bool isCoalescableExtInstr(const MachineInstr &MI,
unsigned &SrcReg, unsigned &DstReg,
unsigned &SubIdx) const override;
@@ -127,10 +157,6 @@ public:
unsigned isStoreToStackSlot(const MachineInstr *MI,
int &FrameIndex) const override;
- // commuteInstruction - We can commute rlwimi instructions, but only if the
- // rotate amt is zero. We also have to munge the immediates a bit.
- MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const override;
-
bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const override;
@@ -183,7 +209,7 @@ public:
// profitable to use the predicated branches.
bool isProfitableToIfCvt(MachineBasicBlock &MBB,
unsigned NumCycles, unsigned ExtraPredCycles,
- const BranchProbability &Probability) const override {
+ BranchProbability Probability) const override {
return true;
}
@@ -191,12 +217,10 @@ public:
unsigned NumT, unsigned ExtraT,
MachineBasicBlock &FMBB,
unsigned NumF, unsigned ExtraF,
- const BranchProbability &Probability) const override;
+ BranchProbability Probability) const override;
- bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
- unsigned NumCycles,
- const BranchProbability
- &Probability) const override {
+ bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+ BranchProbability Probability) const override {
return true;
}
@@ -239,6 +263,15 @@ public:
unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
void getNoopForMachoTarget(MCInst &NopInst) const override;
+
+ std::pair<unsigned, unsigned>
+ decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableDirectMachineOperandTargetFlags() const override;
+
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableBitmaskMachineOperandTargetFlags() const override;
};
}
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index 24fd9bd5c1f7..6c4364aad331 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -226,7 +226,9 @@ def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone,
// Instructions to support dynamic alloca.
def SDTDynOp : SDTypeProfile<1, 2, []>;
+def SDTDynAreaOp : SDTypeProfile<1, 1, []>;
def PPCdynalloc : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>;
+def PPCdynareaoffset : SDNode<"PPCISD::DYNAREAOFFSET", SDTDynAreaOp, [SDNPHasChain]>;
//===----------------------------------------------------------------------===//
// PowerPC specific transformation functions and pattern fragments.
@@ -1029,6 +1031,8 @@ let Defs = [R1], Uses = [R1] in
def DYNALLOC : Pseudo<(outs gprc:$result), (ins gprc:$negsize, memri:$fpsi), "#DYNALLOC",
[(set i32:$result,
(PPCdynalloc i32:$negsize, iaddr:$fpsi))]>;
+def DYNAREAOFFSET : Pseudo<(outs i32imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET",
+ [(set i32:$result, (PPCdynareaoffset iaddr:$fpsi))]>;
// SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after
// instruction selection into a branch sequence.
@@ -3883,8 +3887,11 @@ def : InstAlias<"rotlw. $rA, $rS, $rB", (RLWNMo gprc:$rA, gprc:$rS, gprc:$rB, 0,
def : InstAlias<"clrlwi $rA, $rS, $n", (RLWINM gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>;
def : InstAlias<"clrlwi. $rA, $rS, $n", (RLWINMo gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>;
-def : InstAlias<"cntlz $rA, $rS", (CNTLZW gprc:$rA, gprc:$rS)>;
-def : InstAlias<"cntlz. $rA, $rS", (CNTLZWo gprc:$rA, gprc:$rS)>;
+def : InstAlias<"cntlzw $rA, $rS", (CNTLZW gprc:$rA, gprc:$rS)>;
+def : InstAlias<"cntlzw. $rA, $rS", (CNTLZWo gprc:$rA, gprc:$rS)>;
+// The POWER variant
+def : MnemonicAlias<"cntlz", "cntlzw">;
+def : MnemonicAlias<"cntlz.", "cntlzw.">;
def EXTLDI : PPCAsmPseudo<"extldi $rA, $rS, $n, $b",
(ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
diff --git a/lib/Target/PowerPC/PPCInstrQPX.td b/lib/Target/PowerPC/PPCInstrQPX.td
index 0a044c5c6ea4..43120070d799 100644
--- a/lib/Target/PowerPC/PPCInstrQPX.td
+++ b/lib/Target/PowerPC/PPCInstrQPX.td
@@ -839,31 +839,31 @@ def : Pat<(v4f64 (scalar_to_vector f64:$A)),
def : Pat<(v4f32 (scalar_to_vector f32:$A)),
(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $A, sub_64)>;
-def : Pat<(f64 (vector_extract v4f64:$S, 0)),
+def : Pat<(f64 (extractelt v4f64:$S, 0)),
(EXTRACT_SUBREG $S, sub_64)>;
-def : Pat<(f32 (vector_extract v4f32:$S, 0)),
+def : Pat<(f32 (extractelt v4f32:$S, 0)),
(EXTRACT_SUBREG $S, sub_64)>;
-def : Pat<(f64 (vector_extract v4f64:$S, 1)),
+def : Pat<(f64 (extractelt v4f64:$S, 1)),
(EXTRACT_SUBREG (QVESPLATI $S, 1), sub_64)>;
-def : Pat<(f64 (vector_extract v4f64:$S, 2)),
+def : Pat<(f64 (extractelt v4f64:$S, 2)),
(EXTRACT_SUBREG (QVESPLATI $S, 2), sub_64)>;
-def : Pat<(f64 (vector_extract v4f64:$S, 3)),
+def : Pat<(f64 (extractelt v4f64:$S, 3)),
(EXTRACT_SUBREG (QVESPLATI $S, 3), sub_64)>;
-def : Pat<(f32 (vector_extract v4f32:$S, 1)),
+def : Pat<(f32 (extractelt v4f32:$S, 1)),
(EXTRACT_SUBREG (QVESPLATIs $S, 1), sub_64)>;
-def : Pat<(f32 (vector_extract v4f32:$S, 2)),
+def : Pat<(f32 (extractelt v4f32:$S, 2)),
(EXTRACT_SUBREG (QVESPLATIs $S, 2), sub_64)>;
-def : Pat<(f32 (vector_extract v4f32:$S, 3)),
+def : Pat<(f32 (extractelt v4f32:$S, 3)),
(EXTRACT_SUBREG (QVESPLATIs $S, 3), sub_64)>;
-def : Pat<(f64 (vector_extract v4f64:$S, i64:$F)),
+def : Pat<(f64 (extractelt v4f64:$S, i64:$F)),
(EXTRACT_SUBREG (QVFPERM $S, $S,
(QVLPCLSXint (RLDICR $F, 2,
/* 63-2 = */ 61))),
sub_64)>;
-def : Pat<(f32 (vector_extract v4f32:$S, i64:$F)),
+def : Pat<(f32 (extractelt v4f32:$S, i64:$F)),
(EXTRACT_SUBREG (QVFPERMs $S, $S,
(QVLPCLSXint (RLDICR $F, 2,
/* 63-2 = */ 61))),
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index ce63c22992e8..df1142cb42f3 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -67,17 +67,19 @@ def PPCmfvsr : SDNode<"PPCISD::MFVSR", SDTUnaryOp, []>;
def PPCmtvsra : SDNode<"PPCISD::MTVSRA", SDTUnaryOp, []>;
def PPCmtvsrz : SDNode<"PPCISD::MTVSRZ", SDTUnaryOp, []>;
-multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, dag OOL, dag IOL,
- string asmbase, string asmstr, InstrItinClass itin,
- list<dag> pattern> {
+multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase,
+ string asmstr, InstrItinClass itin, Intrinsic Int,
+ ValueType OutTy, ValueType InTy> {
let BaseName = asmbase in {
- def NAME : XX3Form_Rc<opcode, xo, OOL, IOL,
+ def NAME : XX3Form_Rc<opcode, xo, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
!strconcat(asmbase, !strconcat(" ", asmstr)), itin,
- pattern>;
+ [(set OutTy:$XT, (Int InTy:$XA, InTy:$XB))]>;
let Defs = [CR6] in
- def o : XX3Form_Rc<opcode, xo, OOL, IOL,
+ def o : XX3Form_Rc<opcode, xo, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
- []>, isDOT;
+ [(set InTy:$XT,
+ (InTy (PPCvcmp_o InTy:$XA, InTy:$XB, xo)))]>,
+ isDOT;
}
}
@@ -456,35 +458,23 @@ let Uses = [RM] in {
"xscmpudp $crD, $XA, $XB", IIC_FPCompare, []>;
defm XVCMPEQDP : XX3Form_Rcr<60, 99,
- (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvcmpeqdp", "$XT, $XA, $XB", IIC_VecFPCompare,
- [(set v2i64:$XT,
- (int_ppc_vsx_xvcmpeqdp v2f64:$XA, v2f64:$XB))]>;
+ int_ppc_vsx_xvcmpeqdp, v2i64, v2f64>;
defm XVCMPEQSP : XX3Form_Rcr<60, 67,
- (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvcmpeqsp", "$XT, $XA, $XB", IIC_VecFPCompare,
- [(set v4i32:$XT,
- (int_ppc_vsx_xvcmpeqsp v4f32:$XA, v4f32:$XB))]>;
+ int_ppc_vsx_xvcmpeqsp, v4i32, v4f32>;
defm XVCMPGEDP : XX3Form_Rcr<60, 115,
- (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvcmpgedp", "$XT, $XA, $XB", IIC_VecFPCompare,
- [(set v2i64:$XT,
- (int_ppc_vsx_xvcmpgedp v2f64:$XA, v2f64:$XB))]>;
+ int_ppc_vsx_xvcmpgedp, v2i64, v2f64>;
defm XVCMPGESP : XX3Form_Rcr<60, 83,
- (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvcmpgesp", "$XT, $XA, $XB", IIC_VecFPCompare,
- [(set v4i32:$XT,
- (int_ppc_vsx_xvcmpgesp v4f32:$XA, v4f32:$XB))]>;
+ int_ppc_vsx_xvcmpgesp, v4i32, v4f32>;
defm XVCMPGTDP : XX3Form_Rcr<60, 107,
- (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvcmpgtdp", "$XT, $XA, $XB", IIC_VecFPCompare,
- [(set v2i64:$XT,
- (int_ppc_vsx_xvcmpgtdp v2f64:$XA, v2f64:$XB))]>;
+ int_ppc_vsx_xvcmpgtdp, v2i64, v2f64>;
defm XVCMPGTSP : XX3Form_Rcr<60, 75,
- (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvcmpgtsp", "$XT, $XA, $XB", IIC_VecFPCompare,
- [(set v4i32:$XT,
- (int_ppc_vsx_xvcmpgtsp v4f32:$XA, v4f32:$XB))]>;
+ int_ppc_vsx_xvcmpgtsp, v4i32, v4f32>;
// Move Instructions
def XSABSDP : XX2Form<60, 345,
@@ -845,9 +835,9 @@ let Predicates = [IsBigEndian] in {
def : Pat<(v2f64 (scalar_to_vector f64:$A)),
(v2f64 (SUBREG_TO_REG (i64 1), $A, sub_64))>;
-def : Pat<(f64 (vector_extract v2f64:$S, 0)),
+def : Pat<(f64 (extractelt v2f64:$S, 0)),
(f64 (EXTRACT_SUBREG $S, sub_64))>;
-def : Pat<(f64 (vector_extract v2f64:$S, 1)),
+def : Pat<(f64 (extractelt v2f64:$S, 1)),
(f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>;
}
@@ -856,9 +846,9 @@ def : Pat<(v2f64 (scalar_to_vector f64:$A)),
(v2f64 (XXPERMDI (SUBREG_TO_REG (i64 1), $A, sub_64),
(SUBREG_TO_REG (i64 1), $A, sub_64), 0))>;
-def : Pat<(f64 (vector_extract v2f64:$S, 0)),
+def : Pat<(f64 (extractelt v2f64:$S, 0)),
(f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>;
-def : Pat<(f64 (vector_extract v2f64:$S, 1)),
+def : Pat<(f64 (extractelt v2f64:$S, 1)),
(f64 (EXTRACT_SUBREG $S, sub_64))>;
}
@@ -1206,6 +1196,23 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
}
+
+ // Single Precision Conversions (FP <-> INT)
+ def XSCVSXDSP : XX2Form<60, 312,
+ (outs vssrc:$XT), (ins vsfrc:$XB),
+ "xscvsxdsp $XT, $XB", IIC_VecFP,
+ [(set f32:$XT, (PPCfcfids f64:$XB))]>;
+ def XSCVUXDSP : XX2Form<60, 296,
+ (outs vssrc:$XT), (ins vsfrc:$XB),
+ "xscvuxdsp $XT, $XB", IIC_VecFP,
+ [(set f32:$XT, (PPCfcfidus f64:$XB))]>;
+
+ // Conversions between vector and scalar single precision
+ def XSCVDPSPN : XX2Form<60, 267, (outs vsrc:$XT), (ins vssrc:$XB),
+ "xscvdpspn $XT, $XB", IIC_VecFP, []>;
+ def XSCVSPDPN : XX2Form<60, 331, (outs vssrc:$XT), (ins vsrc:$XB),
+ "xscvspdpn $XT, $XB", IIC_VecFP, []>;
+
} // AddedComplexity = 400
} // HasP8Vector
@@ -1229,3 +1236,550 @@ let Predicates = [HasDirectMove, HasVSX] in {
"mtvsrwz $XT, $rA", IIC_VecGeneral,
[(set f64:$XT, (PPCmtvsrz i32:$rA))]>;
} // HasDirectMove, HasVSX
+
+/* Direct moves of various widths from GPRs into VSRs. Each move lines
+ the value up into element 0 (both BE and LE). Namely, entities smaller than
+ a doubleword are shifted left and moved for BE. For LE, they're moved, then
+ swapped to go into the least significant element of the VSR.
+*/
+def MovesToVSR {
+ dag BE_BYTE_0 =
+ (MTVSRD
+ (RLDICR
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 56, 7));
+ dag BE_HALF_0 =
+ (MTVSRD
+ (RLDICR
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 48, 15));
+ dag BE_WORD_0 =
+ (MTVSRD
+ (RLDICR
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 32, 31));
+ dag BE_DWORD_0 = (MTVSRD $A);
+
+ dag LE_MTVSRW = (MTVSRD (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32));
+ dag LE_WORD_1 = (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+ LE_MTVSRW, sub_64));
+ dag LE_WORD_0 = (XXPERMDI LE_WORD_1, LE_WORD_1, 2);
+ dag LE_DWORD_1 = (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+ BE_DWORD_0, sub_64));
+ dag LE_DWORD_0 = (XXPERMDI LE_DWORD_1, LE_DWORD_1, 2);
+}
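To see what the RLDICR shift counts in MovesToVSR accomplish: for BE, a byte, halfword, or word coming from a GPR has to end up left-justified in the doubleword that MTVSRD deposits into vector element 0, so the value is shifted left by 64 minus its width (56, 48, and 32 bits respectively). A standalone sketch of those placements, illustration only:

    #include <cassert>
    #include <cstdint>

    // Left-justify a narrow value in a doubleword, as the RLDICR counts above
    // do (56 for a byte, 48 for a halfword, 32 for a word) before MTVSRD moves
    // the doubleword into element 0 of the vector register.
    static uint64_t leftJustify(uint64_t Val, unsigned Bits) {
      return Val << (64 - Bits);
    }

    int main() {
      assert(leftJustify(0xAB, 8) == 0xAB00000000000000ull);        // BE_BYTE_0
      assert(leftJustify(0xABCD, 16) == 0xABCD000000000000ull);     // BE_HALF_0
      assert(leftJustify(0xABCD1234, 32) == 0xABCD123400000000ull); // BE_WORD_0
      return 0;
    }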
+
+/* Patterns for extracting elements out of vectors. Integer elements are
+ extracted using direct move operations. Patterns for extracting elements
+ whose indices are not available at compile time are also provided with
+ various _VARIABLE_ patterns.
+   The numbering for the DAGs is for LE, but when used on BE, the correct
+ LE element can just be used (i.e. LE_BYTE_2 == BE_BYTE_13).
+*/
+def VectorExtractions {
+ // Doubleword extraction
+ dag LE_DWORD_0 =
+ (MFVSRD
+ (EXTRACT_SUBREG
+ (XXPERMDI (COPY_TO_REGCLASS $S, VSRC),
+ (COPY_TO_REGCLASS $S, VSRC), 2), sub_64));
+ dag LE_DWORD_1 = (MFVSRD
+ (EXTRACT_SUBREG
+ (v2i64 (COPY_TO_REGCLASS $S, VSRC)), sub_64));
+
+ // Word extraction
+ dag LE_WORD_0 = (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI $S, $S, 2), sub_64));
+ dag LE_WORD_1 = (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI $S, $S, 1), sub_64));
+ dag LE_WORD_2 = (MFVSRWZ (EXTRACT_SUBREG
+ (v2i64 (COPY_TO_REGCLASS $S, VSRC)), sub_64));
+ dag LE_WORD_3 = (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI $S, $S, 3), sub_64));
+
+ // Halfword extraction
+ dag LE_HALF_0 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 0, 48), sub_32));
+ dag LE_HALF_1 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 48, 48), sub_32));
+ dag LE_HALF_2 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 32, 48), sub_32));
+ dag LE_HALF_3 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 16, 48), sub_32));
+ dag LE_HALF_4 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 0, 48), sub_32));
+ dag LE_HALF_5 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 48, 48), sub_32));
+ dag LE_HALF_6 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 32, 48), sub_32));
+ dag LE_HALF_7 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 16, 48), sub_32));
+
+ // Byte extraction
+ dag LE_BYTE_0 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 0, 56), sub_32));
+ dag LE_BYTE_1 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 56, 56), sub_32));
+ dag LE_BYTE_2 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 48, 56), sub_32));
+ dag LE_BYTE_3 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 40, 56), sub_32));
+ dag LE_BYTE_4 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 32, 56), sub_32));
+ dag LE_BYTE_5 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 24, 56), sub_32));
+ dag LE_BYTE_6 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 16, 56), sub_32));
+ dag LE_BYTE_7 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 8, 56), sub_32));
+ dag LE_BYTE_8 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 0, 56), sub_32));
+ dag LE_BYTE_9 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 56, 56), sub_32));
+ dag LE_BYTE_10 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 48, 56), sub_32));
+ dag LE_BYTE_11 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 40, 56), sub_32));
+ dag LE_BYTE_12 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 32, 56), sub_32));
+ dag LE_BYTE_13 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 24, 56), sub_32));
+ dag LE_BYTE_14 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 16, 56), sub_32));
+ dag LE_BYTE_15 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 8, 56), sub_32));
+
+ /* Variable element number (BE and LE patterns must be specified separately)
+ This is a rather involved process.
+
+ Conceptually, this is how the move is accomplished:
+ 1. Identify which doubleword contains the element
+ 2. Shift in the VMX register so that the correct doubleword is correctly
+ lined up for the MFVSRD
+ 3. Perform the move so that the element (along with some extra stuff)
+ is in the GPR
+ 4. Right shift within the GPR so that the element is right-justified
+
+ Of course, the index is an element number which has a different meaning
+ on LE/BE so the patterns have to be specified separately.
+
+ Note: The final result will be the element right-justified with high
+ order bits being arbitrarily defined (namely, whatever was in the
+ vector register to the left of the value originally).
+ */
+
+ /* LE variable byte
+ Number 1. above:
+ - For elements 0-7, we shift left by 8 bytes since they're on the right
+ - For elements 8-15, we need not shift (shift left by zero bytes)
+ This is accomplished by inverting the bits of the index and AND-ing
+ with 0x8 (i.e. clearing all bits of the index and inverting bit 60).
+ */
+ dag LE_VBYTE_PERM_VEC = (LVSL ZERO8, (ANDC8 (LI8 8), $Idx));
+
+ // Number 2. above:
+ // - Now that we set up the shift amount, we shift in the VMX register
+ dag LE_VBYTE_PERMUTE = (VPERM $S, $S, LE_VBYTE_PERM_VEC);
+
+ // Number 3. above:
+ // - The doubleword containing our element is moved to a GPR
+ dag LE_MV_VBYTE = (MFVSRD
+ (EXTRACT_SUBREG
+ (v2i64 (COPY_TO_REGCLASS LE_VBYTE_PERMUTE, VSRC)),
+ sub_64));
+
+ /* Number 4. above:
+ - Truncate the element number to the range 0-7 (8-15 are symmetrical
+ and out of range values are truncated accordingly)
+ - Multiply by 8 as we need to shift right by the number of bits, not bytes
+ - Shift right in the GPR by the calculated value
+ */
+ dag LE_VBYTE_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 7), $Idx), 3, 60),
+ sub_32);
+ dag LE_VARIABLE_BYTE = (EXTRACT_SUBREG (SRD LE_MV_VBYTE, LE_VBYTE_SHIFT),
+ sub_32);
+
+ /* LE variable halfword
+ Number 1. above:
+ - For elements 0-3, we shift left by 8 since they're on the right
+ - For elements 4-7, we need not shift (shift left by zero bytes)
+ Similarly to the byte pattern, we invert the bits of the index, but we
+ AND with 0x4 (i.e. clear all bits of the index and invert bit 61).
+ Of course, the shift is still by 8 bytes, so we must multiply by 2.
+ */
+ dag LE_VHALF_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 4), $Idx), 1, 62));
+
+ // Number 2. above:
+ // - Now that we set up the shift amount, we shift in the VMX register
+ dag LE_VHALF_PERMUTE = (VPERM $S, $S, LE_VHALF_PERM_VEC);
+
+ // Number 3. above:
+ // - The doubleword containing our element is moved to a GPR
+ dag LE_MV_VHALF = (MFVSRD
+ (EXTRACT_SUBREG
+ (v2i64 (COPY_TO_REGCLASS LE_VHALF_PERMUTE, VSRC)),
+ sub_64));
+
+ /* Number 4. above:
+ - Truncate the element number to the range 0-3 (4-7 are symmetrical
+ and out of range values are truncated accordingly)
+ - Multiply by 16 as we need to shift right by the number of bits
+ - Shift right in the GPR by the calculated value
+ */
+ dag LE_VHALF_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 3), $Idx), 4, 59),
+ sub_32);
+ dag LE_VARIABLE_HALF = (EXTRACT_SUBREG (SRD LE_MV_VHALF, LE_VHALF_SHIFT),
+ sub_32);
+
+ /* LE variable word
+ Number 1. above:
+ - For elements 0-1, we shift left by 8 since they're on the right
+ - For elements 2-3, we need not shift
+ */
+ dag LE_VWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 2), $Idx), 2, 61));
+
+ // Number 2. above:
+ // - Now that we set up the shift amount, we shift in the VMX register
+ dag LE_VWORD_PERMUTE = (VPERM $S, $S, LE_VWORD_PERM_VEC);
+
+ // Number 3. above:
+ // - The doubleword containing our element is moved to a GPR
+ dag LE_MV_VWORD = (MFVSRD
+ (EXTRACT_SUBREG
+ (v2i64 (COPY_TO_REGCLASS LE_VWORD_PERMUTE, VSRC)),
+ sub_64));
+
+ /* Number 4. above:
+ - Truncate the element number to the range 0-1 (2-3 are symmetrical
+ and out of range values are truncated accordingly)
+ - Multiply by 32 as we need to shift right by the number of bits
+ - Shift right in the GPR by the calculated value
+ */
+ dag LE_VWORD_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 1), $Idx), 5, 58),
+ sub_32);
+ dag LE_VARIABLE_WORD = (EXTRACT_SUBREG (SRD LE_MV_VWORD, LE_VWORD_SHIFT),
+ sub_32);
+
+ /* LE variable doubleword
+ Number 1. above:
+ - For element 0, we shift left by 8 since it's on the right
+ - For element 1, we need not shift
+ */
+ dag LE_VDWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 1), $Idx), 3, 60));
+
+ // Number 2. above:
+ // - Now that we set up the shift amount, we shift in the VMX register
+ dag LE_VDWORD_PERMUTE = (VPERM $S, $S, LE_VDWORD_PERM_VEC);
+
+ // Number 3. above:
+ // - The doubleword containing our element is moved to a GPR
+ // - Number 4. is not needed for the doubleword as the value is 64-bits
+ dag LE_VARIABLE_DWORD =
+ (MFVSRD (EXTRACT_SUBREG
+ (v2i64 (COPY_TO_REGCLASS LE_VDWORD_PERMUTE, VSRC)),
+ sub_64));
+
+ /* LE variable float
+ - Shift the vector to line up the desired element to BE Word 0
+ - Convert 32-bit float to a 64-bit single precision float
+ */
+ dag LE_VFLOAT_PERM_VEC = (LVSL ZERO8, (RLDICR (XOR8 (LI8 3), $Idx), 2, 61));
+ dag LE_VFLOAT_PERMUTE = (VPERM $S, $S, LE_VFLOAT_PERM_VEC);
+ dag LE_VARIABLE_FLOAT = (XSCVSPDPN LE_VFLOAT_PERMUTE);
+
+ /* LE variable double
+ Same as the LE doubleword except there is no move.
+ */
+ dag LE_VDOUBLE_PERMUTE = (VPERM (COPY_TO_REGCLASS $S, VRRC),
+ (COPY_TO_REGCLASS $S, VRRC),
+ LE_VDWORD_PERM_VEC);
+ dag LE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS LE_VDOUBLE_PERMUTE, VSRC);
+
+ /* BE variable byte
+ The algorithm here is the same as the LE variable byte except:
+ - The shift in the VMX register is by 0/8 for opposite element numbers so
+ we simply AND the element number with 0x8
+ - The order of elements after the move to GPR is reversed, so we invert
+ the bits of the index prior to truncating to the range 0-7
+ */
+ dag BE_VBYTE_PERM_VEC = (LVSL ZERO8, (ANDIo8 $Idx, 8));
+ dag BE_VBYTE_PERMUTE = (VPERM $S, $S, BE_VBYTE_PERM_VEC);
+ dag BE_MV_VBYTE = (MFVSRD
+ (EXTRACT_SUBREG
+ (v2i64 (COPY_TO_REGCLASS BE_VBYTE_PERMUTE, VSRC)),
+ sub_64));
+ dag BE_VBYTE_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 7), $Idx), 3, 60),
+ sub_32);
+ dag BE_VARIABLE_BYTE = (EXTRACT_SUBREG (SRD BE_MV_VBYTE, BE_VBYTE_SHIFT),
+ sub_32);
+
+ /* BE variable halfword
+ The algorithm here is the same as the LE variable halfword except:
+ - The shift in the VMX register is by 0/8 for opposite element numbers so
+ we simply AND the element number with 0x4 and multiply by 2
+ - The order of elements after the move to GPR is reversed, so we invert
+ the bits of the index prior to truncating to the range 0-3
+ */
+ dag BE_VHALF_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 4), 1, 62));
+ dag BE_VHALF_PERMUTE = (VPERM $S, $S, BE_VHALF_PERM_VEC);
+ dag BE_MV_VHALF = (MFVSRD
+ (EXTRACT_SUBREG
+ (v2i64 (COPY_TO_REGCLASS BE_VHALF_PERMUTE, VSRC)),
+ sub_64));
+ dag BE_VHALF_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 3), $Idx), 4, 59),
+ sub_32);
+ dag BE_VARIABLE_HALF = (EXTRACT_SUBREG (SRD BE_MV_VHALF, BE_VHALF_SHIFT),
+ sub_32);
+
+ /* BE variable word
+ The algorithm is the same as the LE variable word except:
+ - The shift in the VMX register happens for opposite element numbers
+ - The order of elements after the move to GPR is reversed, so we invert
+ the bits of the index prior to truncating to the range 0-1
+ */
+ dag BE_VWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 2), 2, 61));
+ dag BE_VWORD_PERMUTE = (VPERM $S, $S, BE_VWORD_PERM_VEC);
+ dag BE_MV_VWORD = (MFVSRD
+ (EXTRACT_SUBREG
+ (v2i64 (COPY_TO_REGCLASS BE_VWORD_PERMUTE, VSRC)),
+ sub_64));
+ dag BE_VWORD_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 1), $Idx), 5, 58),
+ sub_32);
+ dag BE_VARIABLE_WORD = (EXTRACT_SUBREG (SRD BE_MV_VWORD, BE_VWORD_SHIFT),
+ sub_32);
+
+ /* BE variable doubleword
+ Same as the LE doubleword except we shift in the VMX register for opposite
+ element indices.
+ */
+ dag BE_VDWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 1), 3, 60));
+ dag BE_VDWORD_PERMUTE = (VPERM $S, $S, BE_VDWORD_PERM_VEC);
+ dag BE_VARIABLE_DWORD =
+ (MFVSRD (EXTRACT_SUBREG
+ (v2i64 (COPY_TO_REGCLASS BE_VDWORD_PERMUTE, VSRC)),
+ sub_64));
+
+ /* BE variable float
+ - Shift the vector to line up the desired element to BE Word 0
+ - Convert 32-bit float to a 64-bit single precision float
+ */
+ dag BE_VFLOAT_PERM_VEC = (LVSL ZERO8, (RLDICR $Idx, 2, 61));
+ dag BE_VFLOAT_PERMUTE = (VPERM $S, $S, BE_VFLOAT_PERM_VEC);
+ dag BE_VARIABLE_FLOAT = (XSCVSPDPN BE_VFLOAT_PERMUTE);
+
+ /* BE variable double
+ Same as the BE doubleword except there is no move.
+ */
+ dag BE_VDOUBLE_PERMUTE = (VPERM (COPY_TO_REGCLASS $S, VRRC),
+ (COPY_TO_REGCLASS $S, VRRC),
+ BE_VDWORD_PERM_VEC);
+ dag BE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS BE_VDOUBLE_PERMUTE, VSRC);
+}
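A way to sanity-check the LE variable-byte index arithmetic above (ANDC with 8 to pick the doubleword, AND with 7 plus a times-8 shift to right-justify the byte): a standalone C++ model of the same two steps over a 16-byte little-endian vector, illustration only:

    #include <cassert>
    #include <cstdint>

    // Standalone model (not the patterns themselves): the vector is two LE
    // doublewords, DW[0] holding elements 0-7 and DW[1] holding elements 8-15.
    static uint8_t extractByteLE(const uint64_t DW[2], unsigned Idx) {
      // Step 1/2 above: (~Idx) & 8 is 8 for elements 0-7 (permute by a whole
      // doubleword) and 0 for elements 8-15; here it just picks the doubleword.
      unsigned Sel = ((~Idx) & 8) ? 0 : 1;
      // Step 4 above: truncate the index to 0-7 and multiply by 8 to get a bit
      // count, then shift right so the byte is right-justified.
      unsigned ShiftBits = (Idx & 7) * 8;
      return uint8_t(DW[Sel] >> ShiftBits);
    }

    int main() {
      // Elements 0..15 hold the values 0x00..0x0F.
      const uint64_t DW[2] = {0x0706050403020100ull, 0x0F0E0D0C0B0A0908ull};
      for (unsigned Idx = 0; Idx < 16; ++Idx)
        assert(extractByteLE(DW, Idx) == Idx);
      return 0;
    }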
+
+// v4f32 scalar <-> vector conversions (BE)
+let Predicates = [IsBigEndian, HasP8Vector] in {
+ def : Pat<(v4f32 (scalar_to_vector f32:$A)),
+ (v4f32 (XSCVDPSPN $A))>;
+ def : Pat<(f32 (vector_extract v4f32:$S, 0)),
+ (f32 (XSCVSPDPN $S))>;
+ def : Pat<(f32 (vector_extract v4f32:$S, 1)),
+ (f32 (XSCVSPDPN (XXSLDWI $S, $S, 1)))>;
+ def : Pat<(f32 (vector_extract v4f32:$S, 2)),
+ (f32 (XSCVSPDPN (XXSLDWI $S, $S, 2)))>;
+ def : Pat<(f32 (vector_extract v4f32:$S, 3)),
+ (f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>;
+ def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)),
+ (f32 VectorExtractions.BE_VARIABLE_FLOAT)>;
+} // IsBigEndian, HasP8Vector
+
+// Variable index vector_extract for v2f64 does not require P8Vector
+let Predicates = [IsBigEndian, HasVSX] in
+ def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)),
+ (f64 VectorExtractions.BE_VARIABLE_DOUBLE)>;
+
+let Predicates = [IsBigEndian, HasDirectMove] in {
+ // v16i8 scalar <-> vector conversions (BE)
+ def : Pat<(v16i8 (scalar_to_vector i32:$A)),
+ (v16i8 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_BYTE_0, sub_64))>;
+ def : Pat<(v8i16 (scalar_to_vector i32:$A)),
+ (v8i16 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_HALF_0, sub_64))>;
+ def : Pat<(v4i32 (scalar_to_vector i32:$A)),
+ (v4i32 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_WORD_0, sub_64))>;
+ def : Pat<(v2i64 (scalar_to_vector i64:$A)),
+ (v2i64 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_DWORD_0, sub_64))>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 0)),
+ (i32 VectorExtractions.LE_BYTE_15)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 1)),
+ (i32 VectorExtractions.LE_BYTE_14)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 2)),
+ (i32 VectorExtractions.LE_BYTE_13)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 3)),
+ (i32 VectorExtractions.LE_BYTE_12)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 4)),
+ (i32 VectorExtractions.LE_BYTE_11)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 5)),
+ (i32 VectorExtractions.LE_BYTE_10)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 6)),
+ (i32 VectorExtractions.LE_BYTE_9)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 7)),
+ (i32 VectorExtractions.LE_BYTE_8)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 8)),
+ (i32 VectorExtractions.LE_BYTE_7)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 9)),
+ (i32 VectorExtractions.LE_BYTE_6)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 10)),
+ (i32 VectorExtractions.LE_BYTE_5)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 11)),
+ (i32 VectorExtractions.LE_BYTE_4)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 12)),
+ (i32 VectorExtractions.LE_BYTE_3)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 13)),
+ (i32 VectorExtractions.LE_BYTE_2)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 14)),
+ (i32 VectorExtractions.LE_BYTE_1)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 15)),
+ (i32 VectorExtractions.LE_BYTE_0)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)),
+ (i32 VectorExtractions.BE_VARIABLE_BYTE)>;
+
+ // v8i16 scalar <-> vector conversions (BE)
+ def : Pat<(i32 (vector_extract v8i16:$S, 0)),
+ (i32 VectorExtractions.LE_HALF_7)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 1)),
+ (i32 VectorExtractions.LE_HALF_6)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 2)),
+ (i32 VectorExtractions.LE_HALF_5)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 3)),
+ (i32 VectorExtractions.LE_HALF_4)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 4)),
+ (i32 VectorExtractions.LE_HALF_3)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 5)),
+ (i32 VectorExtractions.LE_HALF_2)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 6)),
+ (i32 VectorExtractions.LE_HALF_1)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 7)),
+ (i32 VectorExtractions.LE_HALF_0)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)),
+ (i32 VectorExtractions.BE_VARIABLE_HALF)>;
+
+ // v4i32 scalar <-> vector conversions (BE)
+ def : Pat<(i32 (vector_extract v4i32:$S, 0)),
+ (i32 VectorExtractions.LE_WORD_3)>;
+ def : Pat<(i32 (vector_extract v4i32:$S, 1)),
+ (i32 VectorExtractions.LE_WORD_2)>;
+ def : Pat<(i32 (vector_extract v4i32:$S, 2)),
+ (i32 VectorExtractions.LE_WORD_1)>;
+ def : Pat<(i32 (vector_extract v4i32:$S, 3)),
+ (i32 VectorExtractions.LE_WORD_0)>;
+ def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)),
+ (i32 VectorExtractions.BE_VARIABLE_WORD)>;
+
+ // v2i64 scalar <-> vector conversions (BE)
+ def : Pat<(i64 (vector_extract v2i64:$S, 0)),
+ (i64 VectorExtractions.LE_DWORD_1)>;
+ def : Pat<(i64 (vector_extract v2i64:$S, 1)),
+ (i64 VectorExtractions.LE_DWORD_0)>;
+ def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)),
+ (i64 VectorExtractions.BE_VARIABLE_DWORD)>;
+} // IsBigEndian, HasDirectMove
+
+// v4f32 scalar <-> vector conversions (LE)
+let Predicates = [IsLittleEndian, HasP8Vector] in {
+ def : Pat<(v4f32 (scalar_to_vector f32:$A)),
+ (v4f32 (XXSLDWI (XSCVDPSPN $A), (XSCVDPSPN $A), 1))>;
+ def : Pat<(f32 (vector_extract v4f32:$S, 0)),
+ (f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>;
+ def : Pat<(f32 (vector_extract v4f32:$S, 1)),
+ (f32 (XSCVSPDPN (XXSLDWI $S, $S, 2)))>;
+ def : Pat<(f32 (vector_extract v4f32:$S, 2)),
+ (f32 (XSCVSPDPN (XXSLDWI $S, $S, 1)))>;
+ def : Pat<(f32 (vector_extract v4f32:$S, 3)),
+ (f32 (XSCVSPDPN $S))>;
+ def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)),
+ (f32 VectorExtractions.LE_VARIABLE_FLOAT)>;
+} // IsLittleEndian, HasP8Vector
+
+// Variable index vector_extract for v2f64 does not require P8Vector
+let Predicates = [IsLittleEndian, HasVSX] in
+ def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)),
+ (f64 VectorExtractions.LE_VARIABLE_DOUBLE)>;
+
+let Predicates = [IsLittleEndian, HasDirectMove] in {
+ // v16i8 scalar <-> vector conversions (LE)
+ def : Pat<(v16i8 (scalar_to_vector i32:$A)),
+ (v16i8 (COPY_TO_REGCLASS MovesToVSR.LE_WORD_0, VSRC))>;
+ def : Pat<(v8i16 (scalar_to_vector i32:$A)),
+ (v8i16 (COPY_TO_REGCLASS MovesToVSR.LE_WORD_0, VSRC))>;
+ def : Pat<(v4i32 (scalar_to_vector i32:$A)),
+ (v4i32 MovesToVSR.LE_WORD_0)>;
+ def : Pat<(v2i64 (scalar_to_vector i64:$A)),
+ (v2i64 MovesToVSR.LE_DWORD_0)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 0)),
+ (i32 VectorExtractions.LE_BYTE_0)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 1)),
+ (i32 VectorExtractions.LE_BYTE_1)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 2)),
+ (i32 VectorExtractions.LE_BYTE_2)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 3)),
+ (i32 VectorExtractions.LE_BYTE_3)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 4)),
+ (i32 VectorExtractions.LE_BYTE_4)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 5)),
+ (i32 VectorExtractions.LE_BYTE_5)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 6)),
+ (i32 VectorExtractions.LE_BYTE_6)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 7)),
+ (i32 VectorExtractions.LE_BYTE_7)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 8)),
+ (i32 VectorExtractions.LE_BYTE_8)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 9)),
+ (i32 VectorExtractions.LE_BYTE_9)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 10)),
+ (i32 VectorExtractions.LE_BYTE_10)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 11)),
+ (i32 VectorExtractions.LE_BYTE_11)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 12)),
+ (i32 VectorExtractions.LE_BYTE_12)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 13)),
+ (i32 VectorExtractions.LE_BYTE_13)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 14)),
+ (i32 VectorExtractions.LE_BYTE_14)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, 15)),
+ (i32 VectorExtractions.LE_BYTE_15)>;
+ def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)),
+ (i32 VectorExtractions.LE_VARIABLE_BYTE)>;
+
+ // v8i16 scalar <-> vector conversions (LE)
+ def : Pat<(i32 (vector_extract v8i16:$S, 0)),
+ (i32 VectorExtractions.LE_HALF_0)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 1)),
+ (i32 VectorExtractions.LE_HALF_1)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 2)),
+ (i32 VectorExtractions.LE_HALF_2)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 3)),
+ (i32 VectorExtractions.LE_HALF_3)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 4)),
+ (i32 VectorExtractions.LE_HALF_4)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 5)),
+ (i32 VectorExtractions.LE_HALF_5)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 6)),
+ (i32 VectorExtractions.LE_HALF_6)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, 7)),
+ (i32 VectorExtractions.LE_HALF_7)>;
+ def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)),
+ (i32 VectorExtractions.LE_VARIABLE_HALF)>;
+
+ // v4i32 scalar <-> vector conversions (LE)
+ def : Pat<(i32 (vector_extract v4i32:$S, 0)),
+ (i32 VectorExtractions.LE_WORD_0)>;
+ def : Pat<(i32 (vector_extract v4i32:$S, 1)),
+ (i32 VectorExtractions.LE_WORD_1)>;
+ def : Pat<(i32 (vector_extract v4i32:$S, 2)),
+ (i32 VectorExtractions.LE_WORD_2)>;
+ def : Pat<(i32 (vector_extract v4i32:$S, 3)),
+ (i32 VectorExtractions.LE_WORD_3)>;
+ def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)),
+ (i32 VectorExtractions.LE_VARIABLE_WORD)>;
+
+ // v2i64 scalar <-> vector conversions (LE)
+ def : Pat<(i64 (vector_extract v2i64:$S, 0)),
+ (i64 VectorExtractions.LE_DWORD_0)>;
+ def : Pat<(i64 (vector_extract v2i64:$S, 1)),
+ (i64 VectorExtractions.LE_DWORD_1)>;
+ def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)),
+ (i64 VectorExtractions.LE_VARIABLE_DWORD)>;
+} // IsLittleEndian, HasDirectMove
+
+let Predicates = [HasDirectMove, HasVSX] in {
+// bitconvert f32 -> i32
+// (convert to 32-bit fp single, shift right 1 word, move to GPR)
+def : Pat<(i32 (bitconvert f32:$S)),
+ (i32 (MFVSRWZ (EXTRACT_SUBREG
+ (XXSLDWI (XSCVDPSPN $S),(XSCVDPSPN $S), 3),
+ sub_64)))>;
+// bitconvert i32 -> f32
+// (move to FPR, shift left 1 word, convert to 64-bit fp single)
+def : Pat<(f32 (bitconvert i32:$A)),
+ (f32 (XSCVSPDPN
+ (XXSLDWI MovesToVSR.LE_WORD_1, MovesToVSR.LE_WORD_1, 1)))>;
+
+// bitconvert f64 -> i64
+// (move to GPR, nothing else needed)
+def : Pat<(i64 (bitconvert f64:$S)),
+ (i64 (MFVSRD $S))>;
+
+// bitconvert i64 -> f64
+// (move to FPR, nothing else needed)
+def : Pat<(f64 (bitconvert i64:$S)),
+ (f64 (MTVSRD $S))>;
+}
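The direct-move bitconvert patterns at the end of the VSX changes implement what is, at the IR level, a plain bit-for-bit reinterpretation between f32/i32 and f64/i64. A standalone C++ equivalent of that contract (illustration only), handy as a reference when checking the generated code:

    #include <cstdint>
    #include <cstring>

    // Bit-for-bit reinterpretation, the semantics the (i32 (bitconvert f32:$S))
    // and (f64 (bitconvert i64:$S)) patterns above must preserve.
    template <typename To, typename From>
    static To bitCast(From V) {
      static_assert(sizeof(To) == sizeof(From), "bitconvert keeps the width");
      To Out;
      std::memcpy(&Out, &V, sizeof(To));
      return Out;
    }

    int main() {
      uint32_t Bits = bitCast<uint32_t>(1.0f);                       // 0x3F800000
      double D = bitCast<double>(uint64_t(0x3FF0000000000000ull));   // 1.0
      return (Bits == 0x3F800000u && D == 1.0) ? 0 : 1;
    }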
diff --git a/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp b/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp
index b4e1c099f190..e3a35d5df358 100644
--- a/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp
+++ b/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp
@@ -21,6 +21,7 @@
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -71,10 +72,10 @@ namespace {
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequired<ScalarEvolution>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
// FIXME: For some reason, preserving SE here breaks LSR (even if
// this pass changes nothing).
- // AU.addPreserved<ScalarEvolution>();
+ // AU.addPreserved<ScalarEvolutionWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
}
@@ -96,7 +97,7 @@ INITIALIZE_PASS_BEGIN(PPCLoopDataPrefetch, "ppc-loop-data-prefetch",
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(PPCLoopDataPrefetch, "ppc-loop-data-prefetch",
"PPC Loop Data Prefetch", false, false)
@@ -104,7 +105,7 @@ FunctionPass *llvm::createPPCLoopDataPrefetchPass() { return new PPCLoopDataPref
bool PPCLoopDataPrefetch::runOnFunction(Function &F) {
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- SE = &getAnalysis<ScalarEvolution>();
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
DL = &F.getParent()->getDataLayout();
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
diff --git a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
index b6e7799402e1..5e188268fee9 100644
--- a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
+++ b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
@@ -73,7 +73,7 @@ namespace {
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequired<ScalarEvolution>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
}
bool runOnFunction(Function &F) override;
@@ -84,8 +84,10 @@ namespace {
private:
PPCTargetMachine *TM;
+ DominatorTree *DT;
LoopInfo *LI;
ScalarEvolution *SE;
+ bool PreserveLCSSA;
};
}
@@ -93,7 +95,7 @@ char PPCLoopPreIncPrep::ID = 0;
static const char *name = "Prepare loop for pre-inc. addressing modes";
INITIALIZE_PASS_BEGIN(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false)
FunctionPass *llvm::createPPCLoopPreIncPrepPass(PPCTargetMachine &TM) {
@@ -101,17 +103,20 @@ FunctionPass *llvm::createPPCLoopPreIncPrepPass(PPCTargetMachine &TM) {
}
namespace {
- struct SCEVLess : std::binary_function<const SCEV *, const SCEV *, bool>
- {
- SCEVLess(ScalarEvolution *SE) : SE(SE) {}
+ struct BucketElement {
+ BucketElement(const SCEVConstant *O, Instruction *I) : Offset(O), Instr(I) {}
+ BucketElement(Instruction *I) : Offset(nullptr), Instr(I) {}
- bool operator() (const SCEV *X, const SCEV *Y) const {
- const SCEV *Diff = SE->getMinusSCEV(X, Y);
- return cast<SCEVConstant>(Diff)->getValue()->getSExtValue() < 0;
- }
+ const SCEVConstant *Offset;
+ Instruction *Instr;
+ };
- protected:
- ScalarEvolution *SE;
+ struct Bucket {
+ Bucket(const SCEV *B, Instruction *I) : BaseSCEV(B),
+ Elements(1, BucketElement(I)) {}
+
+ const SCEV *BaseSCEV;
+ SmallVector<BucketElement, 16> Elements;
};
}
@@ -140,7 +145,10 @@ static Value *GetPointerOperand(Value *MemI) {
bool PPCLoopPreIncPrep::runOnFunction(Function &F) {
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- SE = &getAnalysis<ScalarEvolution>();
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
bool MadeChange = false;
@@ -169,7 +177,6 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
std::distance(pred_begin(Header), pred_end(Header));
// Collect buckets of comparable addresses used by loads and stores.
- typedef std::multimap<const SCEV *, Instruction *, SCEVLess> Bucket;
SmallVector<Bucket, 16> Buckets;
for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
I != IE; ++I) {
@@ -212,25 +219,24 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
}
bool FoundBucket = false;
- for (unsigned i = 0, e = Buckets.size(); i != e; ++i)
- for (Bucket::iterator K = Buckets[i].begin(), KE = Buckets[i].end();
- K != KE; ++K) {
- const SCEV *Diff = SE->getMinusSCEV(K->first, LSCEV);
- if (isa<SCEVConstant>(Diff)) {
- Buckets[i].insert(std::make_pair(LSCEV, MemI));
- FoundBucket = true;
- break;
- }
+ for (auto &B : Buckets) {
+ const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV);
+ if (const auto *CDiff = dyn_cast<SCEVConstant>(Diff)) {
+ B.Elements.push_back(BucketElement(CDiff, MemI));
+ FoundBucket = true;
+ break;
}
+ }
if (!FoundBucket) {
- Buckets.push_back(Bucket(SCEVLess(SE)));
- Buckets[Buckets.size()-1].insert(std::make_pair(LSCEV, MemI));
+ if (Buckets.size() == MaxVars)
+ return MadeChange;
+ Buckets.push_back(Bucket(LSCEV, MemI));
}
}
}
- if (Buckets.empty() || Buckets.size() > MaxVars)
+ if (Buckets.empty())
return MadeChange;
BasicBlock *LoopPredecessor = L->getLoopPredecessor();
@@ -239,7 +245,7 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
// iteration space), insert a new preheader for the loop.
if (!LoopPredecessor ||
!LoopPredecessor->getTerminator()->getType()->isVoidTy()) {
- LoopPredecessor = InsertPreheaderForLoop(L, this);
+ LoopPredecessor = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);
if (LoopPredecessor)
MadeChange = true;
}
@@ -253,8 +259,45 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
// The base address of each bucket is transformed into a phi and the others
// are rewritten as offsets of that variable.
+ // We have a choice now of which instruction's memory operand we use as the
+ // base for the generated PHI. Always picking the first instruction in each
+ // bucket does not work well, specifically because that instruction might
+ // be a prefetch (and there are no pre-increment dcbt variants). Otherwise,
+ // the choice is somewhat arbitrary, because the backend will happily
+ // generate direct offsets from both the pre-incremented and
+ // post-incremented pointer values. Thus, we'll pick the first non-prefetch
+ // instruction in each bucket, and adjust the recurrence and other offsets
+ // accordingly.
+ for (int j = 0, je = Buckets[i].Elements.size(); j != je; ++j) {
+ if (auto *II = dyn_cast<IntrinsicInst>(Buckets[i].Elements[j].Instr))
+ if (II->getIntrinsicID() == Intrinsic::prefetch)
+ continue;
+
+ // If we'd otherwise pick the first element anyway, there's nothing to do.
+ if (j == 0)
+ break;
+
+ // If our chosen element has no offset from the base pointer, there's
+ // nothing to do.
+ if (!Buckets[i].Elements[j].Offset ||
+ Buckets[i].Elements[j].Offset->isZero())
+ break;
+
+ const SCEV *Offset = Buckets[i].Elements[j].Offset;
+ Buckets[i].BaseSCEV = SE->getAddExpr(Buckets[i].BaseSCEV, Offset);
+ for (auto &E : Buckets[i].Elements) {
+ if (E.Offset)
+ E.Offset = cast<SCEVConstant>(SE->getMinusSCEV(E.Offset, Offset));
+ else
+ E.Offset = cast<SCEVConstant>(SE->getNegativeSCEV(Offset));
+ }
+
+ std::swap(Buckets[i].Elements[j], Buckets[i].Elements[0]);
+ break;
+ }
+
const SCEVAddRecExpr *BasePtrSCEV =
- cast<SCEVAddRecExpr>(Buckets[i].begin()->first);
+ cast<SCEVAddRecExpr>(Buckets[i].BaseSCEV);
if (!BasePtrSCEV->isAffine())
continue;
@@ -262,7 +305,9 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
assert(BasePtrSCEV->getLoop() == L &&
"AddRec for the wrong loop?");
- Instruction *MemI = Buckets[i].begin()->second;
+ // The instruction corresponding to the Bucket's BaseSCEV must be the first
+ // in the vector of elements.
+ Instruction *MemI = Buckets[i].Elements.begin()->Instr;
Value *BasePtr = GetPointerOperand(MemI);
assert(BasePtr && "No pointer operand");
@@ -302,7 +347,7 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
NewPHI->addIncoming(BasePtrStart, LoopPredecessor);
}
- Instruction *InsPoint = Header->getFirstInsertionPt();
+ Instruction *InsPoint = &*Header->getFirstInsertionPt();
GetElementPtrInst *PtrInc = GetElementPtrInst::Create(
I8Ty, NewPHI, BasePtrIncSCEV->getValue(),
MemI->hasName() ? MemI->getName() + ".inc" : "", InsPoint);
@@ -327,18 +372,20 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
BasePtr->replaceAllUsesWith(NewBasePtr);
RecursivelyDeleteTriviallyDeadInstructions(BasePtr);
- Value *LastNewPtr = NewBasePtr;
- for (Bucket::iterator I = std::next(Buckets[i].begin()),
- IE = Buckets[i].end(); I != IE; ++I) {
- Value *Ptr = GetPointerOperand(I->second);
+ // Keep track of the replacement pointer values we've inserted so that we
+ // don't generate more pointer values than necessary.
+ SmallPtrSet<Value *, 16> NewPtrs;
+    NewPtrs.insert(NewBasePtr);
+
+ for (auto I = std::next(Buckets[i].Elements.begin()),
+ IE = Buckets[i].Elements.end(); I != IE; ++I) {
+ Value *Ptr = GetPointerOperand(I->Instr);
assert(Ptr && "No pointer operand");
- if (Ptr == LastNewPtr)
+ if (NewPtrs.count(Ptr))
continue;
Instruction *RealNewPtr;
- const SCEVConstant *Diff =
- cast<SCEVConstant>(SE->getMinusSCEV(I->first, BasePtrSCEV));
- if (Diff->isZero()) {
+ if (!I->Offset || I->Offset->getValue()->isZero()) {
RealNewPtr = NewBasePtr;
} else {
Instruction *PtrIP = dyn_cast<Instruction>(Ptr);
@@ -346,13 +393,13 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
cast<Instruction>(NewBasePtr)->getParent() == PtrIP->getParent())
PtrIP = 0;
else if (isa<PHINode>(PtrIP))
- PtrIP = PtrIP->getParent()->getFirstInsertionPt();
+ PtrIP = &*PtrIP->getParent()->getFirstInsertionPt();
else if (!PtrIP)
- PtrIP = I->second;
+ PtrIP = I->Instr;
GetElementPtrInst *NewPtr = GetElementPtrInst::Create(
- I8Ty, PtrInc, Diff->getValue(),
- I->second->hasName() ? I->second->getName() + ".off" : "", PtrIP);
+ I8Ty, PtrInc, I->Offset->getValue(),
+ I->Instr->hasName() ? I->Instr->getName() + ".off" : "", PtrIP);
if (!PtrIP)
NewPtr->insertAfter(cast<Instruction>(PtrInc));
NewPtr->setIsInBounds(IsPtrInBounds(Ptr));
@@ -373,7 +420,7 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
Ptr->replaceAllUsesWith(ReplNewPtr);
RecursivelyDeleteTriviallyDeadInstructions(Ptr);
- LastNewPtr = RealNewPtr;
+ NewPtrs.insert(RealNewPtr);
}
MadeChange = true;
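The rebasing step added to runOnLoop above (adding the chosen element's offset to the bucket's base SCEV and subtracting it from every element) is just a change of reference point; absolute addresses are unchanged. A standalone integer sketch of the bookkeeping, illustration only, using plain int64_t offsets in place of SCEVs:

    #include <cassert>
    #include <cstdint>
    #include <utility>
    #include <vector>

    struct Elem {
      int64_t Offset; // offset from the bucket's base (0 for the base element)
      int Id;         // stands in for the memory instruction
    };

    // Rebase the bucket so that element J becomes the new zero-offset base, as
    // the pass does when the first element turns out to be a prefetch.
    static void rebase(int64_t &Base, std::vector<Elem> &Elems, unsigned J) {
      int64_t Delta = Elems[J].Offset;
      Base += Delta;                 // BaseSCEV = BaseSCEV + Offset
      for (Elem &E : Elems)
        E.Offset -= Delta;           // every element is re-expressed
      std::swap(Elems[J], Elems[0]); // the new base must be element 0
    }

    int main() {
      int64_t Base = 100;
      std::vector<Elem> Elems = {{0, 0}, {16, 1}, {32, 2}};
      rebase(Base, Elems, 2);
      // Absolute addresses (Base + Offset) are unchanged by the rebasing.
      assert(Base == 132 && Elems[0].Offset == 0 && Elems[0].Id == 2);
      assert(Base + Elems[1].Offset == 116 && Base + Elems[2].Offset == 100);
      return 0;
    }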
diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp
index 76837ecb32de..44a692d4bb42 100644
--- a/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -38,7 +38,7 @@ static MachineModuleInfoMachO &getMachOMMI(AsmPrinter &AP) {
static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){
const TargetMachine &TM = AP.TM;
Mangler *Mang = AP.Mang;
- const DataLayout *DL = TM.getDataLayout();
+ const DataLayout &DL = AP.getDataLayout();
MCContext &Ctx = AP.OutContext;
bool isDarwin = TM.getTargetTriple().isOSDarwin();
@@ -51,13 +51,13 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){
Suffix = "$non_lazy_ptr";
if (!Suffix.empty())
- Name += DL->getPrivateGlobalPrefix();
+ Name += DL.getPrivateGlobalPrefix();
unsigned PrefixLen = Name.size();
if (!MO.isGlobal()) {
assert(MO.isSymbol() && "Isn't a symbol reference");
- Mangler::getNameWithPrefix(Name, MO.getSymbolName(), *DL);
+ Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL);
} else {
const GlobalValue *GV = MO.getGlobal();
TM.getNameWithPrefix(Name, GV, *Mang);
diff --git a/lib/Target/PowerPC/PPCMIPeephole.cpp b/lib/Target/PowerPC/PPCMIPeephole.cpp
new file mode 100644
index 000000000000..fe339d70d7de
--- /dev/null
+++ b/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -0,0 +1,230 @@
+//===-------------- PPCMIPeephole.cpp - MI Peephole Cleanups -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+// This pass performs peephole optimizations to clean up ugly code
+// sequences at the MachineInstruction layer. It runs at the end of
+// the SSA phases, following VSX swap removal. A pass of dead code
+// elimination follows this one for quick clean-up of any dead
+// instructions introduced here. Although we could do this as callbacks
+// from the generic peephole pass, this would have a couple of bad
+// effects: it might remove optimization opportunities for VSX swap
+// removal, and it would miss cleanups made possible following VSX
+// swap removal.
+//
+//===---------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-mi-peepholes"
+
+namespace llvm {
+ void initializePPCMIPeepholePass(PassRegistry&);
+}
+
+namespace {
+
+struct PPCMIPeephole : public MachineFunctionPass {
+
+ static char ID;
+ const PPCInstrInfo *TII;
+ MachineFunction *MF;
+ MachineRegisterInfo *MRI;
+
+ PPCMIPeephole() : MachineFunctionPass(ID) {
+ initializePPCMIPeepholePass(*PassRegistry::getPassRegistry());
+ }
+
+private:
+ // Initialize class variables.
+ void initialize(MachineFunction &MFParm);
+
+ // Perform peepholes.
+ bool simplifyCode(void);
+
+ // Find the "true" register represented by SrcReg (following chains
+ // of copies and subreg_to_reg operations).
+ unsigned lookThruCopyLike(unsigned SrcReg);
+
+public:
+ // Main entry point for this pass.
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ initialize(MF);
+ return simplifyCode();
+ }
+};
+
+// Initialize class variables.
+void PPCMIPeephole::initialize(MachineFunction &MFParm) {
+ MF = &MFParm;
+ MRI = &MF->getRegInfo();
+ TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();
+ DEBUG(dbgs() << "*** PowerPC MI peephole pass ***\n\n");
+ DEBUG(MF->dump());
+}
+
+// Perform peephole optimizations.
+bool PPCMIPeephole::simplifyCode(void) {
+ bool Simplified = false;
+ MachineInstr* ToErase = nullptr;
+
+ for (MachineBasicBlock &MBB : *MF) {
+ for (MachineInstr &MI : MBB) {
+
+ // If the previous instruction was marked for elimination,
+ // remove it now.
+ if (ToErase) {
+ ToErase->eraseFromParent();
+ ToErase = nullptr;
+ }
+
+ // Ignore debug instructions.
+ if (MI.isDebugValue())
+ continue;
+
+ // Per-opcode peepholes.
+ switch (MI.getOpcode()) {
+
+ default:
+ break;
+
+ case PPC::XXPERMDI: {
+ // Perform simplifications of 2x64 vector swaps and splats.
+ // A swap is identified by an immediate value of 2, and a splat
+ // is identified by an immediate value of 0 or 3.
+ int Immed = MI.getOperand(3).getImm();
+
+ if (Immed != 1) {
+
+ // For each of these simplifications, we need the two source
+ // regs to match. Unfortunately, MachineCSE ignores COPY and
+ // SUBREG_TO_REG, so for example we can see
+ // XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), immed.
+ // We have to look through chains of COPY and SUBREG_TO_REG
+ // to find the real source values for comparison.
+ unsigned TrueReg1 = lookThruCopyLike(MI.getOperand(1).getReg());
+ unsigned TrueReg2 = lookThruCopyLike(MI.getOperand(2).getReg());
+
+ if (TrueReg1 == TrueReg2
+ && TargetRegisterInfo::isVirtualRegister(TrueReg1)) {
+ MachineInstr *DefMI = MRI->getVRegDef(TrueReg1);
+
+ // If this is a splat or a swap fed by another splat, we
+ // can replace it with a copy.
+ if (DefMI && DefMI->getOpcode() == PPC::XXPERMDI) {
+ unsigned FeedImmed = DefMI->getOperand(3).getImm();
+ unsigned FeedReg1
+ = lookThruCopyLike(DefMI->getOperand(1).getReg());
+ unsigned FeedReg2
+ = lookThruCopyLike(DefMI->getOperand(2).getReg());
+
+ if ((FeedImmed == 0 || FeedImmed == 3) && FeedReg1 == FeedReg2) {
+ DEBUG(dbgs()
+ << "Optimizing splat/swap or splat/splat "
+ "to splat/copy: ");
+ DEBUG(MI.dump());
+ BuildMI(MBB, &MI, MI.getDebugLoc(),
+ TII->get(PPC::COPY), MI.getOperand(0).getReg())
+ .addOperand(MI.getOperand(1));
+ ToErase = &MI;
+ Simplified = true;
+ }
+
+            // If this is a splat fed by a swap, we can simply modify
+ // the splat to splat the other value from the swap's input
+ // parameter.
+ else if ((Immed == 0 || Immed == 3)
+ && FeedImmed == 2 && FeedReg1 == FeedReg2) {
+ DEBUG(dbgs() << "Optimizing swap/splat => splat: ");
+ DEBUG(MI.dump());
+ MI.getOperand(1).setReg(DefMI->getOperand(1).getReg());
+ MI.getOperand(2).setReg(DefMI->getOperand(2).getReg());
+ MI.getOperand(3).setImm(3 - Immed);
+ Simplified = true;
+ }
+
+ // If this is a swap fed by a swap, we can replace it
+ // with a copy from the first swap's input.
+ else if (Immed == 2 && FeedImmed == 2 && FeedReg1 == FeedReg2) {
+ DEBUG(dbgs() << "Optimizing swap/swap => copy: ");
+ DEBUG(MI.dump());
+ BuildMI(MBB, &MI, MI.getDebugLoc(),
+ TII->get(PPC::COPY), MI.getOperand(0).getReg())
+ .addOperand(DefMI->getOperand(1));
+ ToErase = &MI;
+ Simplified = true;
+ }
+ }
+ }
+ }
+ break;
+ }
+ }
+ }
+
+ // If the last instruction was marked for elimination,
+ // remove it now.
+ if (ToErase) {
+ ToErase->eraseFromParent();
+ ToErase = nullptr;
+ }
+ }
+
+ return Simplified;
+}
+
+// This is used to find the "true" source register for an
+// XXPERMDI instruction, since MachineCSE does not handle the
+// "copy-like" operations (Copy and SubregToReg). Returns
+// the original SrcReg unless it is the target of a copy-like
+// operation, in which case we chain backwards through all
+// such operations to the ultimate source register. If a
+// physical register is encountered, we stop the search.
+unsigned PPCMIPeephole::lookThruCopyLike(unsigned SrcReg) {
+
+ while (true) {
+
+ MachineInstr *MI = MRI->getVRegDef(SrcReg);
+ if (!MI->isCopyLike())
+ return SrcReg;
+
+ unsigned CopySrcReg;
+ if (MI->isCopy())
+ CopySrcReg = MI->getOperand(1).getReg();
+ else {
+ assert(MI->isSubregToReg() && "bad opcode for lookThruCopyLike");
+ CopySrcReg = MI->getOperand(2).getReg();
+ }
+
+ if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg))
+ return CopySrcReg;
+
+ SrcReg = CopySrcReg;
+ }
+}
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(PPCMIPeephole, DEBUG_TYPE,
+ "PowerPC MI Peephole Optimization", false, false)
+INITIALIZE_PASS_END(PPCMIPeephole, DEBUG_TYPE,
+ "PowerPC MI Peephole Optimization", false, false)
+
+char PPCMIPeephole::ID = 0;
+FunctionPass*
+llvm::createPPCMIPeepholePass() { return new PPCMIPeephole(); }
+
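For the immediates the new peephole keys on: XXPERMDI selects result doubleword 0 from the first source and doubleword 1 from the second, one selector bit each, so with both sources equal an immediate of 2 is a swap and 0 or 3 is a splat (and 1 is the identity, which the code skips). A standalone model, illustration only, that can be used to sanity-check the splat/swap folds:

    #include <cassert>
    #include <cstdint>

    struct V2x64 { uint64_t DW[2]; }; // doubleword 0 first, as in the BE layout

    // Model of XXPERMDI: the high bit of the 2-bit immediate selects which
    // doubleword of A becomes result doubleword 0, the low bit selects which
    // doubleword of B becomes result doubleword 1.
    static V2x64 xxpermdi(V2x64 A, V2x64 B, unsigned DM) {
      return {{A.DW[(DM >> 1) & 1], B.DW[DM & 1]}};
    }

    int main() {
      V2x64 S = {{0x1111, 0x2222}};
      V2x64 Swap = xxpermdi(S, S, 2);          // immediate 2 == swap
      assert(Swap.DW[0] == 0x2222 && Swap.DW[1] == 0x1111);
      // swap(swap(S)) == S: the "swap fed by swap => copy" fold.
      V2x64 Twice = xxpermdi(Swap, Swap, 2);
      assert(Twice.DW[0] == S.DW[0] && Twice.DW[1] == S.DW[1]);
      // splat(imm) of a swap == splat(3 - imm) of the original: the
      // "splat fed by swap => splat" fold.
      V2x64 A = xxpermdi(Swap, Swap, 0), B = xxpermdi(S, S, 3);
      assert(A.DW[0] == B.DW[0] && A.DW[1] == B.DW[1]);
      return 0;
    }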
diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
index ec4e0a5fa81b..95f163153c74 100644
--- a/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
+++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
@@ -18,8 +18,8 @@ using namespace llvm;
void PPCFunctionInfo::anchor() { }
MCSymbol *PPCFunctionInfo::getPICOffsetSymbol() const {
- const DataLayout *DL = MF.getTarget().getDataLayout();
- return MF.getContext().getOrCreateSymbol(Twine(DL->getPrivateGlobalPrefix()) +
+ const DataLayout &DL = MF.getDataLayout();
+ return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) +
Twine(MF.getFunctionNumber()) +
"$poff");
}
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 2b09b2f625de..934bdf622418 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -200,7 +200,7 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(PPC::R2); // System-reserved register
Reserved.set(PPC::R13); // Small Data Area pointer register
}
-
+
// On PPC64, r13 is the thread pointer. Never allocate this register.
if (TM.isPPC64()) {
Reserved.set(PPC::R13);
@@ -262,7 +262,7 @@ unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
default:
return 0;
case PPC::G8RC_NOX0RegClassID:
- case PPC::GPRC_NOR0RegClassID:
+ case PPC::GPRC_NOR0RegClassID:
case PPC::G8RCRegClassID:
case PPC::GPRCRegClassID: {
unsigned FP = TFI->hasFP(MF) ? 1 : 0;
@@ -311,7 +311,7 @@ PPCRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
//===----------------------------------------------------------------------===//
/// lowerDynamicAlloc - Generate the code for allocating an object in the
-/// current frame. The sequence of code with be in the general form
+/// current frame. The sequence of code will be in the general form
///
/// addi R0, SP, \#frameSize ; get the address of the previous frame
/// stwxu R0, SP, Rnegsize ; add and update the SP with the negated size
@@ -337,7 +337,7 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
// Get the total frame size.
unsigned FrameSize = MFI->getStackSize();
-
+
// Get stack alignments.
const PPCFrameLowering *TFI = getFrameLowering(MF);
unsigned TargetAlign = TFI->getStackAlignment();
@@ -347,14 +347,14 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
// Determine the previous frame's address. If FrameSize can't be
// represented as 16 bits or we need special alignment, then we load the
- // previous frame's address from 0(SP). Why not do an addis of the hi?
- // Because R0 is our only safe tmp register and addi/addis treat R0 as zero.
- // Constructing the constant and adding would take 3 instructions.
+ // previous frame's address from 0(SP). Why not do an addis of the hi?
+ // Because R0 is our only safe tmp register and addi/addis treat R0 as zero.
+ // Constructing the constant and adding would take 3 instructions.
// Fortunately, a frame greater than 32K is rare.
const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
-
+
if (MaxAlign < TargetAlign && isInt<16>(FrameSize)) {
BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg)
.addReg(PPC::R31)
@@ -425,11 +425,32 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
.addReg(PPC::R1)
.addImm(maxCallFrameSize);
}
-
+
// Discard the DYNALLOC instruction.
MBB.erase(II);
}
+void PPCRegisterInfo::lowerDynamicAreaOffset(
+ MachineBasicBlock::iterator II) const {
+ // Get the instruction.
+ MachineInstr &MI = *II;
+ // Get the instruction's basic block.
+ MachineBasicBlock &MBB = *MI.getParent();
+ // Get the basic block's function.
+ MachineFunction &MF = *MBB.getParent();
+ // Get the frame info.
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ // Get the instruction info.
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+
+ unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
+ DebugLoc dl = MI.getDebugLoc();
+ BuildMI(MBB, II, dl, TII.get(PPC::LI), MI.getOperand(0).getReg())
+ .addImm(maxCallFrameSize);
+ MBB.erase(II);
+}
+
/// lowerCRSpilling - Generate the code for spilling a CR register. Instead of
/// reserving a whole register (R0), we scrounge for one here. This generates
/// code like this:
@@ -459,8 +480,8 @@ void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II,
// We need to store the CR in the low 4-bits of the saved value. First, issue
// an MFOCRF to save all of the CRBits and, if needed, kill the SrcReg.
BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg)
- .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill()));
-
+ .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill()));
+
// If the saved register wasn't CR0, shift the bits left so that they are in
// CR0's slot.
if (SrcReg != PPC::CR0) {
@@ -549,8 +570,8 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II,
.addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill()));
BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg)
- .addReg(getCRFromCRBit(SrcReg));
-
+ .addReg(getCRFromCRBit(SrcReg));
+
// If the saved register wasn't CR0LT, shift the bits left so that the bit to
// store is the first one. Mask all but that bit.
unsigned Reg1 = Reg;
@@ -602,17 +623,19 @@ void PPCRegisterInfo::lowerCRBitRestore(MachineBasicBlock::iterator II,
unsigned ShiftBits = getEncodingValue(DestReg);
// rlwimi r11, r10, 32-ShiftBits, ..., ...
BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWIMI8 : PPC::RLWIMI), RegO)
- .addReg(RegO, RegState::Kill).addReg(Reg, RegState::Kill)
- .addImm(ShiftBits ? 32-ShiftBits : 0)
- .addImm(ShiftBits).addImm(ShiftBits);
-
+ .addReg(RegO, RegState::Kill)
+ .addReg(Reg, RegState::Kill)
+ .addImm(ShiftBits ? 32 - ShiftBits : 0)
+ .addImm(ShiftBits)
+ .addImm(ShiftBits);
+
BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MTOCRF8 : PPC::MTOCRF),
getCRFromCRBit(DestReg))
- .addReg(RegO, RegState::Kill)
- // Make sure we have a use dependency all the way through this
- // sequence of instructions. We can't have the other bits in the CR
- // modified in between the mfocrf and the mtocrf.
- .addReg(getCRFromCRBit(DestReg), RegState::Implicit);
+ .addReg(RegO, RegState::Kill)
+ // Make sure we have a use dependency all the way through this
+ // sequence of instructions. We can't have the other bits in the CR
+ // modified in between the mfocrf and the mtocrf.
+ .addReg(getCRFromCRBit(DestReg), RegState::Implicit);
// Discard the pseudo instruction.
MBB.erase(II);
@@ -634,11 +657,11 @@ void PPCRegisterInfo::lowerVRSAVESpilling(MachineBasicBlock::iterator II,
unsigned SrcReg = MI.getOperand(0).getReg();
BuildMI(MBB, II, dl, TII.get(PPC::MFVRSAVEv), Reg)
- .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill()));
-
- addFrameReference(BuildMI(MBB, II, dl, TII.get(PPC::STW))
- .addReg(Reg, RegState::Kill),
- FrameIndex);
+ .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill()));
+
+ addFrameReference(
+ BuildMI(MBB, II, dl, TII.get(PPC::STW)).addReg(Reg, RegState::Kill),
+ FrameIndex);
// Discard the pseudo instruction.
MBB.erase(II);
@@ -671,9 +694,8 @@ void PPCRegisterInfo::lowerVRSAVERestore(MachineBasicBlock::iterator II,
MBB.erase(II);
}
-bool
-PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
- unsigned Reg, int &FrameIdx) const {
+bool PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
+ unsigned Reg, int &FrameIdx) const {
const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
// For the nonvolatile condition registers (CR2, CR3, CR4) in an SVR4
// ABI, return true to prevent allocating an additional frame slot.
@@ -752,7 +774,12 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int FPSI = FI->getFramePointerSaveIndex();
// Get the instruction opcode.
unsigned OpC = MI.getOpcode();
-
+
+ if ((OpC == PPC::DYNAREAOFFSET || OpC == PPC::DYNAREAOFFSET8)) {
+ lowerDynamicAreaOffset(II);
+ return;
+ }
+
// Special case for dynamic alloca.
if (FPSI && FrameIndex == FPSI &&
(OpC == PPC::DYNALLOC || OpC == PPC::DYNALLOC8)) {
@@ -800,8 +827,9 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// If we're not using a Frame Pointer that has been set to the value of the
// SP before having the stack size subtracted from it, then add the stack size
// to Offset to get the correct offset.
- // Naked functions have stack size 0, although getStackSize may not reflect that
- // because we didn't call all the pieces that compute it for naked functions.
+ // Naked functions have stack size 0, although getStackSize may not reflect
+ // that because we didn't call all the pieces that compute it for naked
+ // functions.
if (!MF.getFunction()->hasFnAttribute(Attribute::Naked)) {
if (!(hasBasePointer(MF) && FrameIndex < 0))
Offset += MFI->getStackSize();
@@ -840,7 +868,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
.addImm(Offset);
// Convert into indexed form of the instruction:
- //
+ //
// sth 0:rA, 1:imm 2:(rB) ==> sthx 0:rA, 2:rB, 1:r0
// addi 0:rA 1:rB, 2, imm ==> add 0:rA, 1:rB, 2:r0
unsigned OperandBase;
@@ -898,24 +926,6 @@ bool PPCRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
return needsStackRealignment(MF);
}
-bool PPCRegisterInfo::canRealignStack(const MachineFunction &MF) const {
- if (MF.getFunction()->hasFnAttribute("no-realign-stack"))
- return false;
-
- return true;
-}
-
-bool PPCRegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
- const PPCFrameLowering *TFI = getFrameLowering(MF);
- const MachineFrameInfo *MFI = MF.getFrameInfo();
- const Function *F = MF.getFunction();
- unsigned StackAlign = TFI->getStackAlignment();
- bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) ||
- F->hasFnAttribute(Attribute::StackAlignment));
-
- return requiresRealignment && canRealignStack(MF);
-}
-
/// Returns true if the instruction's frame index
/// reference would be better served by a base register other than FP
/// or SP. Used by LocalStackFrameAllocation to determine which frame index
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
index d304e1d8b5ec..b15fde83c9f3 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -54,13 +54,13 @@ inline static unsigned getCRFromCRBit(unsigned SrcReg) {
return Reg;
}
-
class PPCRegisterInfo : public PPCGenRegisterInfo {
DenseMap<unsigned, unsigned> ImmToIdxMap;
const PPCTargetMachine &TM;
+
public:
PPCRegisterInfo(const PPCTargetMachine &TM);
-
+
/// getPointerRegClass - Return the register class to use to hold pointers.
/// This is used for addressing modes.
const TargetRegisterClass *
@@ -77,7 +77,7 @@ public:
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const override;
- const uint32_t *getNoPreservedMask() const;
+ const uint32_t *getNoPreservedMask() const override;
void adjustStackMapLiveOutMask(uint32_t *Mask) const override;
@@ -101,6 +101,7 @@ public:
}
void lowerDynamicAlloc(MachineBasicBlock::iterator II) const;
+ void lowerDynamicAreaOffset(MachineBasicBlock::iterator II) const;
void lowerCRSpilling(MachineBasicBlock::iterator II,
unsigned FrameIndex) const;
void lowerCRRestore(MachineBasicBlock::iterator II,
@@ -115,9 +116,9 @@ public:
unsigned FrameIndex) const;
bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg,
- int &FrameIdx) const override;
- void eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, unsigned FIOperandNum,
+ int &FrameIdx) const override;
+ void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+ unsigned FIOperandNum,
RegScavenger *RS = nullptr) const override;
// Support for virtual base registers.
@@ -136,8 +137,6 @@ public:
// Base pointer (stack realignment) support.
unsigned getBaseRegister(const MachineFunction &MF) const;
bool hasBasePointer(const MachineFunction &MF) const;
- bool canRealignStack(const MachineFunction &MF) const;
- bool needsStackRealignment(const MachineFunction &MF) const override;
};
} // end namespace llvm
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
index 58daccae90f2..c0fcb6cbb9dc 100644
--- a/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -62,6 +62,7 @@ void PPCSubtarget::initializeEnvironment() {
Has64BitSupport = false;
Use64BitRegs = false;
UseCRBits = false;
+ UseSoftFloat = false;
HasAltivec = false;
HasSPE = false;
HasQPX = false;
@@ -100,6 +101,8 @@ void PPCSubtarget::initializeEnvironment() {
HasDirectMove = false;
IsQPXStackUnaligned = false;
HasHTM = false;
+ HasFusion = false;
+ HasFloat128 = false;
}
void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
@@ -210,5 +213,33 @@ bool PPCSubtarget::enableSubRegLiveness() const {
return UseSubRegLiveness;
}
+unsigned char PPCSubtarget::classifyGlobalReference(
+ const GlobalValue *GV) const {
+ // Note that currently we don't generate non-pic references.
+ // If a caller wants that, this will have to be updated.
+
+ // Large code model always uses the TOC even for local symbols.
+ if (TM.getCodeModel() == CodeModel::Large)
+ return PPCII::MO_PIC_FLAG | PPCII::MO_NLP_FLAG;
+
+ unsigned char flags = PPCII::MO_PIC_FLAG;
+
+ // Only if the relocation mode is PIC do we have to worry about
+ // interposition. In all other cases we can use a slightly looser standard to
+ // decide how to access the symbol.
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+    // If it's local, or it has non-default visibility, it can't be interposed.
+ if (!GV->hasLocalLinkage() &&
+ GV->hasDefaultVisibility()) {
+ flags |= PPCII::MO_NLP_FLAG;
+ }
+ return flags;
+ }
+
+ if (GV->isStrongDefinitionForLinker())
+ return flags;
+ return flags | PPCII::MO_NLP_FLAG;
+}
+
bool PPCSubtarget::isELFv2ABI() const { return TM.isELFv2ABI(); }
bool PPCSubtarget::isPPC64() const { return TM.isPPC64(); }
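
The flag logic in classifyGlobalReference above boils down to the following
summary (informal, reading MO_NLP_FLAG as "must be accessed indirectly, e.g.
through a TOC entry"; this restates the code and adds nothing to the patch):

    //   code model   reloc model   symbol                          flags returned
    //   -----------  ------------  ------------------------------  --------------------------
    //   Large        any           any                             MO_PIC_FLAG | MO_NLP_FLAG
    //   other        PIC_          local linkage or non-default
    //                              visibility                      MO_PIC_FLAG
    //   other        PIC_          preemptible (default vis.)      MO_PIC_FLAG | MO_NLP_FLAG
    //   other        non-PIC_      strong definition for linker    MO_PIC_FLAG
    //   other        non-PIC_      anything else                   MO_PIC_FLAG | MO_NLP_FLAG
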
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index 0616c1f65604..4f5c95c1483f 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -83,6 +83,7 @@ protected:
bool Has64BitSupport;
bool Use64BitRegs;
bool UseCRBits;
+ bool UseSoftFloat;
bool IsPPC64;
bool HasAltivec;
bool HasSPE;
@@ -119,6 +120,8 @@ protected:
bool HasPartwordAtomics;
bool HasDirectMove;
bool HasHTM;
+ bool HasFusion;
+ bool HasFloat128;
/// When targeting QPX running a stock PPC64 Linux kernel where the stack
/// alignment has not been changed, we need to keep the 16-byte alignment
@@ -188,6 +191,8 @@ public:
/// has64BitSupport - Return true if the selected CPU supports 64-bit
/// instructions, regardless of whether we are in 32-bit or 64-bit mode.
bool has64BitSupport() const { return Has64BitSupport; }
+ // useSoftFloat - Return true if soft-float option is turned on.
+  /// useSoftFloat - Return true if the soft-float option is turned on.
/// use64BitRegs - Return true if in 64-bit mode or if we should use 64-bit
 /// registers in 32-bit mode when possible. This can only be true if
@@ -254,6 +259,8 @@ public:
return 16;
}
bool hasHTM() const { return HasHTM; }
+ bool hasFusion() const { return HasFusion; }
+ bool hasFloat128() const { return HasFloat128; }
const Triple &getTargetTriple() const { return TargetTriple; }
@@ -285,6 +292,10 @@ public:
bool useAA() const override;
bool enableSubRegLiveness() const override;
+
+ /// classifyGlobalReference - Classify a global variable reference for the
+  /// current subtarget according to how we should reference it.
+ unsigned char classifyGlobalReference(const GlobalValue *GV) const;
};
} // End llvm namespace
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index 1daf244fed44..d24b590317f5 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -42,6 +42,10 @@ static cl::
opt<bool> DisableVSXSwapRemoval("disable-ppc-vsx-swap-removal", cl::Hidden,
cl::desc("Disable VSX Swap Removal for PPC"));
+static cl::
+opt<bool> DisableMIPeephole("disable-ppc-peephole", cl::Hidden,
+ cl::desc("Disable machine peepholes for PPC"));
+
static cl::opt<bool>
EnableGEPOpt("ppc-gep-opt", cl::Hidden,
cl::desc("Enable optimizations on complex GEPs"),
@@ -57,11 +61,19 @@ EnableExtraTOCRegDeps("enable-ppc-extra-toc-reg-deps",
cl::desc("Add extra TOC register dependencies"),
cl::init(true), cl::Hidden);
+static cl::opt<bool>
+EnableMachineCombinerPass("ppc-machine-combiner",
+ cl::desc("Enable the machine combiner pass"),
+ cl::init(true), cl::Hidden);
+
extern "C" void LLVMInitializePowerPCTarget() {
// Register the targets
RegisterTargetMachine<PPC32TargetMachine> A(ThePPC32Target);
RegisterTargetMachine<PPC64TargetMachine> B(ThePPC64Target);
RegisterTargetMachine<PPC64TargetMachine> C(ThePPC64LETarget);
+
+ PassRegistry &PR = *PassRegistry::getPassRegistry();
+ initializePPCBoolRetToIntPass(PR);
}
/// Return the datalayout string of a subtarget.
@@ -118,7 +130,7 @@ static std::string computeFSAdditions(StringRef FS, CodeGenOpt::Level OL,
}
if (OL != CodeGenOpt::None) {
- if (!FullFS.empty())
+ if (!FullFS.empty())
FullFS = "+invariant-function-descriptors," + FullFS;
else
FullFS = "+invariant-function-descriptors";
@@ -144,7 +156,7 @@ static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT,
return PPCTargetMachine::PPC_ABI_ELFv2;
assert(Options.MCOptions.getABIName().empty() &&
- "Unknown target-abi option!");
+ "Unknown target-abi option!");
if (!TT.isMacOSX()) {
switch (TT.getArch()) {
@@ -160,9 +172,9 @@ static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT,
return PPCTargetMachine::PPC_ABI_UNKNOWN;
}
-// The FeatureString here is a little subtle. We are modifying the feature string
-// with what are (currently) non-function specific overrides as it goes into the
-// LLVMTargetMachine constructor and then using the stored value in the
+// The FeatureString here is a little subtle. We are modifying the feature
+// string with what are (currently) non-function specific overrides as it goes
+// into the LLVMTargetMachine constructor and then using the stored value in the
// Subtarget constructor below it.
PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
@@ -227,6 +239,19 @@ PPCTargetMachine::getSubtargetImpl(const Function &F) const {
? FSAttr.getValueAsString().str()
: TargetFS;
+ // FIXME: This is related to the code below to reset the target options,
+ // we need to know whether or not the soft float flag is set on the
+ // function before we can generate a subtarget. We also need to use
+ // it as a key for the subtarget since that can be the only difference
+ // between two functions.
+ bool SoftFloat =
+ F.hasFnAttribute("use-soft-float") &&
+ F.getFnAttribute("use-soft-float").getValueAsString() == "true";
+ // If the soft float attribute is set on the function turn on the soft float
+ // subtarget feature.
+ if (SoftFloat)
+ FS += FS.empty() ? "+soft-float" : ",+soft-float";
+
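
The effect of the block above is that the "use-soft-float" function attribute
becomes part of the subtarget cache key, so a soft-float and a hard-float
function in the same module receive distinct PPCSubtarget objects. A minimal
sketch of that keying, with invented names and plain std::string in place of
the LLVM types, purely as an illustration:

    #include <string>

    static std::string subtargetKey(std::string CPU, std::string FS,
                                    bool UseSoftFloatAttr) {
      if (UseSoftFloatAttr)
        FS += FS.empty() ? "+soft-float" : ",+soft-float";
      // Functions that differ only in the attribute yield different keys,
      // hence different subtargets.
      return CPU + FS;
    }
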
auto &I = SubtargetMap[CPU + FS];
if (!I) {
// This needs to be done before we create a new subtarget since any
@@ -277,6 +302,8 @@ TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) {
}
void PPCPassConfig::addIRPasses() {
+ if (TM->getOptLevel() != CodeGenOpt::None)
+ addPass(createPPCBoolRetToIntPass());
addPass(createAtomicExpandPass(&getPPCTargetMachine()));
// For the BG/Q (or if explicitly requested), add explicit data prefetch
@@ -316,6 +343,10 @@ bool PPCPassConfig::addPreISel() {
bool PPCPassConfig::addILPOpts() {
addPass(&EarlyIfConverterID);
+
+ if (EnableMachineCombinerPass)
+ addPass(&MachineCombinerID);
+
return true;
}
@@ -339,6 +370,12 @@ void PPCPassConfig::addMachineSSAOptimization() {
if (TM->getTargetTriple().getArch() == Triple::ppc64le &&
!DisableVSXSwapRemoval)
addPass(createPPCVSXSwapRemovalPass());
+ // Target-specific peephole cleanups performed after instruction
+ // selection.
+ if (!DisableMIPeephole) {
+ addPass(createPPCMIPeepholePass());
+ addPass(&DeadMachineInstructionElimID);
+ }
}
void PPCPassConfig::addPreRegAlloc() {
@@ -364,6 +401,7 @@ void PPCPassConfig::addPreEmitPass() {
}
TargetIRAnalysis PPCTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis(
- [this](Function &F) { return TargetTransformInfo(PPCTTIImpl(this, F)); });
+ return TargetIRAnalysis([this](const Function &F) {
+ return TargetTransformInfo(PPCTTIImpl(this, F));
+ });
}
diff --git a/lib/Target/PowerPC/PPCTargetObjectFile.cpp b/lib/Target/PowerPC/PPCTargetObjectFile.cpp
index 9ee5db938b67..798bb9d6b892 100644
--- a/lib/Target/PowerPC/PPCTargetObjectFile.cpp
+++ b/lib/Target/PowerPC/PPCTargetObjectFile.cpp
@@ -42,9 +42,7 @@ MCSection *PPC64LinuxTargetObjectFile::SelectSectionForGlobal(
if (Kind.isReadOnly()) {
const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
- if (GVar && GVar->isConstant() &&
- (GVar->getInitializer()->getRelocationInfo() ==
- Constant::GlobalRelocations))
+ if (GVar && GVar->isConstant() && GVar->getInitializer()->needsRelocation())
Kind = SectionKind::getReadOnlyWithRel();
}
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index e21c2b77f4d7..cd86dabd5abe 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -35,7 +35,7 @@ PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
return TTI::PSK_Software;
}
-unsigned PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
if (DisablePPCConstHoist)
return BaseT::getIntImmCost(Imm, Ty);
@@ -64,8 +64,8 @@ unsigned PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
return 4 * TTI::TCC_Basic;
}
-unsigned PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+int PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+ Type *Ty) {
if (DisablePPCConstHoist)
return BaseT::getIntImmCost(IID, Idx, Imm, Ty);
@@ -98,8 +98,8 @@ unsigned PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
return PPCTTIImpl::getIntImmCost(Imm, Ty);
}
-unsigned PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+ Type *Ty) {
if (DisablePPCConstHoist)
return BaseT::getIntImmCost(Opcode, Idx, Imm, Ty);
@@ -197,9 +197,20 @@ void PPCTTIImpl::getUnrollingPreferences(Loop *L,
}
bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
+ // On the A2, always unroll aggressively. For QPX unaligned loads, we depend
+ // on combining the loads generated for consecutive accesses, and failure to
+  // do so is particularly expensive. Aggressive unrolling makes that combining
+  // much more likely to happen (compared to only using concatenation unrolling).
+ if (ST->getDarwinDirective() == PPC::DIR_A2)
+ return true;
+
return LoopHasReductions;
}
+bool PPCTTIImpl::enableInterleavedAccessVectorization() {
+ return true;
+}
+
unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
if (Vector && !ST->hasAltivec() && !ST->hasQPX())
return 0;
@@ -246,7 +257,7 @@ unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
return 2;
}
-unsigned PPCTTIImpl::getArithmeticInstrCost(
+int PPCTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo) {
@@ -257,24 +268,30 @@ unsigned PPCTTIImpl::getArithmeticInstrCost(
Opd1PropInfo, Opd2PropInfo);
}
-unsigned PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
- Type *SubTp) {
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+ Type *SubTp) {
+ // Legalize the type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+  // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
+ // (at least in the sense that there need only be one non-loop-invariant
+ // instruction). We need one such shuffle instruction for each actual
+ // register (this is not true for arbitrary shuffles, but is true for the
+ // structured types of shuffles covered by TTI::ShuffleKind).
+ return LT.first;
}
-unsigned PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
return BaseT::getCastInstrCost(Opcode, Dst, Src);
}
-unsigned PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy) {
+int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}
-unsigned PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
- unsigned Index) {
+int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
assert(Val->isVectorTy() && "This must be a vector type");
int ISD = TLI->InstructionOpcodeToISD(Opcode);
@@ -313,41 +330,83 @@ unsigned PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
return BaseT::getVectorInstrCost(Opcode, Val, Index);
}
-unsigned PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
- unsigned Alignment,
- unsigned AddressSpace) {
+int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace) {
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
"Invalid Opcode");
- unsigned Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
-
- // VSX loads/stores support unaligned access.
- if (ST->hasVSX()) {
- if (LT.second == MVT::v2f64 || LT.second == MVT::v2i64)
- return Cost;
- }
+ int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
- bool UnalignedAltivec =
- Src->isVectorTy() &&
- Src->getPrimitiveSizeInBits() >= LT.second.getSizeInBits() &&
- LT.second.getSizeInBits() == 128 &&
- Opcode == Instruction::Load;
+ // Aligned loads and stores are easy.
+ unsigned SrcBytes = LT.second.getStoreSize();
+ if (!SrcBytes || !Alignment || Alignment >= SrcBytes)
+ return Cost;
+
+ bool IsAltivecType = ST->hasAltivec() &&
+ (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
+ LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
+ bool IsVSXType = ST->hasVSX() &&
+ (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
+ bool IsQPXType = ST->hasQPX() &&
+ (LT.second == MVT::v4f64 || LT.second == MVT::v4f32);
+
+ // If we can use the permutation-based load sequence, then this is also
+ // relatively cheap (not counting loop-invariant instructions): one load plus
+ // one permute (the last load in a series has extra cost, but we're
+ // neglecting that here). Note that on the P7, we should do unaligned loads
+ // for Altivec types using the VSX instructions, but that's more expensive
+ // than using the permutation-based load sequence. On the P8, that's no
+ // longer true.
+ if (Opcode == Instruction::Load &&
+ ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) &&
+ Alignment >= LT.second.getScalarType().getStoreSize())
+ return Cost + LT.first; // Add the cost of the permutations.
+
+ // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
+ // P7, unaligned vector loads are more expensive than the permutation-based
+ // load sequence, so that might be used instead, but regardless, the net cost
+ // is about the same (not counting loop-invariant instructions).
+ if (IsVSXType || (ST->hasVSX() && IsAltivecType))
+ return Cost;
// PPC in general does not support unaligned loads and stores. They'll need
// to be decomposed based on the alignment factor.
- unsigned SrcBytes = LT.second.getStoreSize();
- if (SrcBytes && Alignment && Alignment < SrcBytes && !UnalignedAltivec) {
- Cost += LT.first*(SrcBytes/Alignment-1);
-
- // For a vector type, there is also scalarization overhead (only for
- // stores, loads are expanded using the vector-load + permutation sequence,
- // which is much less expensive).
- if (Src->isVectorTy() && Opcode == Instruction::Store)
- for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
- Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);
- }
+
+ // Add the cost of each scalar load or store.
+ Cost += LT.first*(SrcBytes/Alignment-1);
+
+ // For a vector type, there is also scalarization overhead (only for
+ // stores, loads are expanded using the vector-load + permutation sequence,
+ // which is much less expensive).
+ if (Src->isVectorTy() && Opcode == Instruction::Store)
+ for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
+ Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);
+
+ return Cost;
+}
+
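
Two illustrative cost computations for the paths above (the subtarget features
named are assumptions for the example, not requirements of the patch):

    // Store of <4 x i32>, 4-byte aligned, no VSX/QPX (decomposition path):
    //   LT.first == 1, SrcBytes == 16, Alignment == 4
    //   Cost += LT.first * (SrcBytes/Alignment - 1)  ->  3 additional accesses
    //   Cost += 4 ExtractElement costs               (store scalarization)
    // Load of <4 x i32>, 4-byte aligned, Altivec without P8Vector:
    //   permutation-based sequence applies           ->  Cost + LT.first
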
+int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace) {
+ assert(isa<VectorType>(VecTy) &&
+ "Expect a vector type for interleaved memory op");
+
+ // Legalize the type.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);
+
+ // Firstly, the cost of load/store operation.
+ int Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace);
+
+  // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
+ // (at least in the sense that there need only be one non-loop-invariant
+ // instruction). For each result vector, we need one shuffle per incoming
+ // vector (except that the first shuffle can take two incoming vectors
+ // because it does not need to take itself).
+ Cost += Factor*(LT.first-1);
return Cost;
}
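
And a matching example for the interleaved cost just above (illustrative only):

    // Stride-2 de-interleaved load of <8 x i32> (Factor == 2):
    //   <8 x i32> legalizes to two 128-bit registers  ->  LT.first == 2
    //   Cost  = getMemoryOpCost(Load, <8 x i32>, ...)     (the wide accesses)
    //   Cost += Factor * (LT.first - 1) == 2              (Factor result vectors,
    //                                                      each needing
    //                                                      LT.first - 1 shuffles)
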
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 368bef93f0dd..04c1b02235f0 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -37,7 +37,7 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
const PPCTargetLowering *getTLI() const { return TLI; }
public:
- explicit PPCTTIImpl(const PPCTargetMachine *TM, Function &F)
+ explicit PPCTTIImpl(const PPCTargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
@@ -52,12 +52,11 @@ public:
/// @{
using BaseT::getIntImmCost;
- unsigned getIntImmCost(const APInt &Imm, Type *Ty);
+ int getIntImmCost(const APInt &Imm, Type *Ty);
- unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty);
- unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
- Type *Ty);
+ int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+ int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+ Type *Ty);
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
@@ -68,22 +67,27 @@ public:
/// @{
bool enableAggressiveInterleaving(bool LoopHasReductions);
+ bool enableInterleavedAccessVectorization();
unsigned getNumberOfRegisters(bool Vector);
unsigned getRegisterBitWidth(bool Vector);
unsigned getMaxInterleaveFactor(unsigned VF);
- unsigned getArithmeticInstrCost(
+ int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
- unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
- Type *SubTp);
- unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
- unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
- unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
- unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace);
+ int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+ int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+ int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace);
+ int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace);
/// @}
};
diff --git a/lib/Target/PowerPC/PPCVSXCopy.cpp b/lib/Target/PowerPC/PPCVSXCopy.cpp
index 5e3ae2a4471b..782583ce3423 100644
--- a/lib/Target/PowerPC/PPCVSXCopy.cpp
+++ b/lib/Target/PowerPC/PPCVSXCopy.cpp
@@ -77,6 +77,14 @@ namespace {
return IsRegInClass(Reg, &PPC::F8RCRegClass, MRI);
}
+ bool IsVSFReg(unsigned Reg, MachineRegisterInfo &MRI) {
+ return IsRegInClass(Reg, &PPC::VSFRCRegClass, MRI);
+ }
+
+ bool IsVSSReg(unsigned Reg, MachineRegisterInfo &MRI) {
+ return IsRegInClass(Reg, &PPC::VSSRCRegClass, MRI);
+ }
+
protected:
bool processBlock(MachineBasicBlock &MBB) {
bool Changed = false;
@@ -100,7 +108,9 @@ protected:
IsVRReg(SrcMO.getReg(), MRI) ? &PPC::VSHRCRegClass :
&PPC::VSLRCRegClass;
assert((IsF8Reg(SrcMO.getReg(), MRI) ||
- IsVRReg(SrcMO.getReg(), MRI)) &&
+ IsVRReg(SrcMO.getReg(), MRI) ||
+ IsVSSReg(SrcMO.getReg(), MRI) ||
+ IsVSFReg(SrcMO.getReg(), MRI)) &&
"Unknown source for a VSX copy");
unsigned NewVReg = MRI.createVirtualRegister(SrcRC);
@@ -123,6 +133,8 @@ protected:
IsVRReg(DstMO.getReg(), MRI) ? &PPC::VSHRCRegClass :
&PPC::VSLRCRegClass;
assert((IsF8Reg(DstMO.getReg(), MRI) ||
+ IsVSFReg(DstMO.getReg(), MRI) ||
+ IsVSSReg(DstMO.getReg(), MRI) ||
IsVRReg(DstMO.getReg(), MRI)) &&
"Unknown destination for a VSX copy");
diff --git a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
index 46b8d13e47b9..6b19a2f7118b 100644
--- a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
+++ b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
@@ -103,10 +103,10 @@ protected:
VNInfo *AddendValNo =
LIS->getInterval(MI->getOperand(1).getReg()).Query(FMAIdx).valueIn();
- if (!AddendValNo) {
- // This can be null if the register is undef.
+
+ // This can be null if the register is undef.
+ if (!AddendValNo)
continue;
- }
MachineInstr *AddendMI = LIS->getInstructionFromIndex(AddendValNo->def);
@@ -186,18 +186,17 @@ protected:
if (!KilledProdOp)
continue;
- // If the addend copy is used only by this MI, then the addend source
- // register is likely not live here. This could be fixed (based on the
- // legality checks above, the live range for the addend source register
- // could be extended), but it seems likely that such a trivial copy can
- // be coalesced away later, and thus is not worth the effort.
- if (TargetRegisterInfo::isVirtualRegister(AddendSrcReg) &&
+ // If the addend copy is used only by this MI, then the addend source
+ // register is likely not live here. This could be fixed (based on the
+ // legality checks above, the live range for the addend source register
+ // could be extended), but it seems likely that such a trivial copy can
+ // be coalesced away later, and thus is not worth the effort.
+ if (TargetRegisterInfo::isVirtualRegister(AddendSrcReg) &&
!LIS->getInterval(AddendSrcReg).liveAt(FMAIdx))
continue;
// Transform: (O2 * O3) + O1 -> (O2 * O1) + O3.
- unsigned AddReg = AddendMI->getOperand(1).getReg();
unsigned KilledProdReg = MI->getOperand(KilledProdOp).getReg();
unsigned OtherProdReg = MI->getOperand(OtherProdOp).getReg();
@@ -221,6 +220,14 @@ protected:
if (OldFMAReg == KilledProdReg)
continue;
+ // If there isn't a class that fits, we can't perform the transform.
+ // This is needed for correctness with a mixture of VSX and Altivec
+ // instructions to make sure that a low VSX register is not assigned to
+ // the Altivec instruction.
+ if (!MRI.constrainRegClass(KilledProdReg,
+ MRI.getRegClass(OldFMAReg)))
+ continue;
+
assert(OldFMAReg == AddendMI->getOperand(0).getReg() &&
"Addend copy not tied to old FMA output!");
@@ -228,7 +235,7 @@ protected:
MI->getOperand(0).setReg(KilledProdReg);
MI->getOperand(1).setReg(KilledProdReg);
- MI->getOperand(3).setReg(AddReg);
+ MI->getOperand(3).setReg(AddendSrcReg);
MI->getOperand(2).setReg(OtherProdReg);
MI->getOperand(0).setSubReg(KilledProdSubReg);
@@ -263,8 +270,7 @@ protected:
if (UseMI == AddendMI)
continue;
- UseMO.setReg(KilledProdReg);
- UseMO.setSubReg(KilledProdSubReg);
+ UseMO.substVirtReg(KilledProdReg, KilledProdSubReg, *TRI);
}
// Extend the live intervals of the killed product operand to hold the
@@ -286,6 +292,20 @@ protected:
}
DEBUG(dbgs() << " extended: " << NewFMAInt << '\n');
+ // Extend the live interval of the addend source (it might end at the
+ // copy to be removed, or somewhere in between there and here). This
+ // is necessary only if it is a physical register.
+ if (!TargetRegisterInfo::isVirtualRegister(AddendSrcReg))
+ for (MCRegUnitIterator Units(AddendSrcReg, TRI); Units.isValid();
+ ++Units) {
+ unsigned Unit = *Units;
+
+ LiveRange &AddendSrcRange = LIS->getRegUnit(Unit);
+ AddendSrcRange.extendInBlock(LIS->getMBBStartIdx(&MBB),
+ FMAIdx.getRegSlot());
+ DEBUG(dbgs() << " extended: " << AddendSrcRange << '\n');
+ }
+
FMAInt.removeValNo(FMAValNo);
DEBUG(dbgs() << " trimmed: " << FMAInt << '\n');
@@ -347,7 +367,6 @@ INITIALIZE_PASS_END(PPCVSXFMAMutate, DEBUG_TYPE,
char &llvm::PPCVSXFMAMutateID = PPCVSXFMAMutate::ID;
char PPCVSXFMAMutate::ID = 0;
-FunctionPass*
-llvm::createPPCVSXFMAMutatePass() { return new PPCVSXFMAMutate(); }
-
-
+FunctionPass *llvm::createPPCVSXFMAMutatePass() {
+ return new PPCVSXFMAMutate();
+}
diff --git a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
index d7132d5272d8..27c540fcf211 100644
--- a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
+++ b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
@@ -94,7 +94,7 @@ enum SHValues {
SH_NOSWAP_ST,
SH_SPLAT,
SH_XXPERMDI,
- SH_COPYSCALAR
+ SH_COPYWIDEN
};
struct PPCVSXSwapRemoval : public MachineFunctionPass {
@@ -149,6 +149,11 @@ private:
// handling. Return true iff any changes are made.
bool removeSwaps();
+ // Insert a swap instruction from SrcReg to DstReg at the given
+ // InsertPoint.
+ void insertSwap(MachineInstr *MI, MachineBasicBlock::iterator InsertPoint,
+ unsigned DstReg, unsigned SrcReg);
+
// Update instructions requiring special handling.
void handleSpecialSwappables(int EntryIdx);
@@ -159,9 +164,7 @@ private:
bool isRegInClass(unsigned Reg, const TargetRegisterClass *RC) {
if (TargetRegisterInfo::isVirtualRegister(Reg))
return RC->hasSubClassEq(MRI->getRegClass(Reg));
- if (RC->contains(Reg))
- return true;
- return false;
+ return RC->contains(Reg);
}
// Return true iff the given register is a full vector register.
@@ -215,7 +218,7 @@ public:
void PPCVSXSwapRemoval::initialize(MachineFunction &MFParm) {
MF = &MFParm;
MRI = &MF->getRegInfo();
- TII = static_cast<const PPCInstrInfo*>(MF->getSubtarget().getInstrInfo());
+ TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();
// An initial vector size of 256 appears to work well in practice.
// Small/medium functions with vector content tend not to incur a
@@ -343,6 +346,15 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
SwapVector[VecIdx].IsLoad = 1;
SwapVector[VecIdx].IsSwap = 1;
break;
+ case PPC::LXSDX:
+ case PPC::LXSSPX:
+ // A load of a floating-point value into the high-order half of
+ // a vector register is safe, provided that we introduce a swap
+ // following the load, which will be done by the SUBREG_TO_REG
+ // support. So just mark these as safe.
+ SwapVector[VecIdx].IsLoad = 1;
+ SwapVector[VecIdx].IsSwappable = 1;
+ break;
case PPC::STVX:
// Non-permuting stores are currently unsafe. We can use special
// handling for this in the future. By not marking these as
@@ -385,7 +397,7 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
else if (isVecReg(MI.getOperand(0).getReg()) &&
isScalarVecReg(MI.getOperand(2).getReg())) {
SwapVector[VecIdx].IsSwappable = 1;
- SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYSCALAR;
+ SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYWIDEN;
}
break;
}
@@ -420,7 +432,14 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
case PPC::STVEHX:
case PPC::STVEWX:
case PPC::STVXL:
+ // We can handle STXSDX and STXSSPX similarly to LXSDX and LXSSPX,
+ // by adding special handling for narrowing copies as well as
+ // widening ones. However, I've experimented with this, and in
+ // practice we currently do not appear to use STXSDX fed by
+ // a narrowing copy from a full vector register. Since I can't
+ // generate any useful test cases, I've left this alone for now.
case PPC::STXSDX:
+ case PPC::STXSSPX:
case PPC::VCIPHER:
case PPC::VCIPHERLAST:
case PPC::VMRGHB:
@@ -543,7 +562,8 @@ unsigned PPCVSXSwapRemoval::lookThruCopyLike(unsigned SrcReg,
}
if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg)) {
- SwapVector[VecIdx].MentionsPhysVR = 1;
+ if (!isScalarVecReg(CopySrcReg))
+ SwapVector[VecIdx].MentionsPhysVR = 1;
return CopySrcReg;
}
@@ -629,8 +649,8 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
SwapVector[Repr].WebRejected = 1;
DEBUG(dbgs() <<
- format("Web %d rejected for physreg, partial reg, or not swap[pable]\n",
- Repr));
+ format("Web %d rejected for physreg, partial reg, or not "
+ "swap[pable]\n", Repr));
DEBUG(dbgs() << " in " << EntryIdx << ": ");
DEBUG(SwapVector[EntryIdx].VSEMI->dump());
DEBUG(dbgs() << "\n");
@@ -743,6 +763,21 @@ void PPCVSXSwapRemoval::markSwapsForRemoval() {
}
}
+// Create an xxswapd instruction and insert it prior to the given point.
+// MI is used to determine basic block and debug loc information.
+// FIXME: When inserting a swap, we should check whether SrcReg is
+// defined by another swap: SrcReg = XXPERMDI Reg, Reg, 2; If so,
+// then instead we should generate a copy from Reg to DstReg.
+void PPCVSXSwapRemoval::insertSwap(MachineInstr *MI,
+ MachineBasicBlock::iterator InsertPoint,
+ unsigned DstReg, unsigned SrcReg) {
+ BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
+ TII->get(PPC::XXPERMDI), DstReg)
+ .addReg(SrcReg)
+ .addReg(SrcReg)
+ .addImm(2);
+}
+
// The identified swap entry requires special handling to allow its
// containing computation to be optimized. Perform that handling
// here.
@@ -752,8 +787,7 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
switch (SwapVector[EntryIdx].SpecialHandling) {
default:
- assert(false && "Unexpected special handling type");
- break;
+ llvm_unreachable("Unexpected special handling type");
// For splats based on an index into a vector, add N/2 modulo N
// to the index, where N is the number of vector elements.
@@ -766,7 +800,7 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
switch (MI->getOpcode()) {
default:
- assert(false && "Unexpected splat opcode");
+ llvm_unreachable("Unexpected splat opcode");
case PPC::VSPLTB: NElts = 16; break;
case PPC::VSPLTH: NElts = 8; break;
case PPC::VSPLTW: NElts = 4; break;
@@ -811,7 +845,7 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
// For a copy from a scalar floating-point register to a vector
// register, removing swaps will leave the copied value in the
// wrong lane. Insert a swap following the copy to fix this.
- case SHValues::SH_COPYSCALAR: {
+ case SHValues::SH_COPYWIDEN: {
MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
DEBUG(dbgs() << "Changing SUBREG_TO_REG: ");
@@ -825,14 +859,13 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
DEBUG(dbgs() << " Into: ");
DEBUG(MI->dump());
- MachineBasicBlock::iterator InsertPoint = MI->getNextNode();
+ auto InsertPoint = ++MachineBasicBlock::iterator(MI);
// Note that an XXPERMDI requires a VSRC, so if the SUBREG_TO_REG
// is copying to a VRRC, we need to be careful to avoid a register
// assignment problem. In this case we must copy from VRRC to VSRC
// prior to the swap, and from VSRC to VRRC following the swap.
// Coalescing will usually remove all this mess.
-
if (DstRC == &PPC::VRRCRegClass) {
unsigned VSRCTmp1 = MRI->createVirtualRegister(&PPC::VSRCRegClass);
unsigned VSRCTmp2 = MRI->createVirtualRegister(&PPC::VSRCRegClass);
@@ -840,29 +873,19 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
TII->get(PPC::COPY), VSRCTmp1)
.addReg(NewVReg);
- DEBUG(MI->getNextNode()->dump());
+ DEBUG(std::prev(InsertPoint)->dump());
- BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
- TII->get(PPC::XXPERMDI), VSRCTmp2)
- .addReg(VSRCTmp1)
- .addReg(VSRCTmp1)
- .addImm(2);
- DEBUG(MI->getNextNode()->getNextNode()->dump());
+ insertSwap(MI, InsertPoint, VSRCTmp2, VSRCTmp1);
+ DEBUG(std::prev(InsertPoint)->dump());
BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
TII->get(PPC::COPY), DstReg)
.addReg(VSRCTmp2);
- DEBUG(MI->getNextNode()->getNextNode()->getNextNode()->dump());
+ DEBUG(std::prev(InsertPoint)->dump());
} else {
-
- BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
- TII->get(PPC::XXPERMDI), DstReg)
- .addReg(NewVReg)
- .addReg(NewVReg)
- .addImm(2);
-
- DEBUG(MI->getNextNode()->dump());
+ insertSwap(MI, InsertPoint, DstReg, NewVReg);
+ DEBUG(std::prev(InsertPoint)->dump());
}
break;
}
@@ -947,8 +970,8 @@ void PPCVSXSwapRemoval::dumpSwapVector() {
case SH_XXPERMDI:
DEBUG(dbgs() << "special:xxpermdi ");
break;
- case SH_COPYSCALAR:
- DEBUG(dbgs() << "special:copyscalar ");
+ case SH_COPYWIDEN:
+ DEBUG(dbgs() << "special:copywiden ");
break;
}
}
diff --git a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 1c4e486da418..a55274744fd1 100644
--- a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -14,6 +14,7 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
@@ -34,7 +35,6 @@ namespace {
class SparcOperand;
class SparcAsmParser : public MCTargetAsmParser {
- MCSubtargetInfo &STI;
MCAsmParser &Parser;
/// @name Auto-generated Match Functions
@@ -69,6 +69,10 @@ class SparcAsmParser : public MCTargetAsmParser {
OperandMatchResultTy parseBranchModifiers(OperandVector &Operands);
+ // Helper function for dealing with %lo / %hi in PIC mode.
+ const SparcMCExpr *adjustPICRelocation(SparcMCExpr::VariantKind VK,
+ const MCExpr *subExpr);
+
// returns true if Tok is matched to a register and returns register in RegNo.
bool matchRegisterName(const AsmToken &Tok, unsigned &RegNo,
unsigned &RegKind);
@@ -77,24 +81,24 @@ class SparcAsmParser : public MCTargetAsmParser {
bool parseDirectiveWord(unsigned Size, SMLoc L);
bool is64Bit() const {
- return STI.getTargetTriple().getArch() == Triple::sparcv9;
+ return getSTI().getTargetTriple().getArch() == Triple::sparcv9;
}
void expandSET(MCInst &Inst, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions);
public:
- SparcAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
+ SparcAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser,
const MCInstrInfo &MII,
const MCTargetOptions &Options)
- : MCTargetAsmParser(), STI(sti), Parser(parser) {
+ : MCTargetAsmParser(Options, sti), Parser(parser) {
// Initialize the set of available features.
- setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+ setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
}
};
- static unsigned IntRegs[32] = {
+ static const MCPhysReg IntRegs[32] = {
Sparc::G0, Sparc::G1, Sparc::G2, Sparc::G3,
Sparc::G4, Sparc::G5, Sparc::G6, Sparc::G7,
Sparc::O0, Sparc::O1, Sparc::O2, Sparc::O3,
@@ -104,7 +108,7 @@ public:
Sparc::I0, Sparc::I1, Sparc::I2, Sparc::I3,
Sparc::I4, Sparc::I5, Sparc::I6, Sparc::I7 };
- static unsigned FloatRegs[32] = {
+ static const MCPhysReg FloatRegs[32] = {
Sparc::F0, Sparc::F1, Sparc::F2, Sparc::F3,
Sparc::F4, Sparc::F5, Sparc::F6, Sparc::F7,
Sparc::F8, Sparc::F9, Sparc::F10, Sparc::F11,
@@ -114,7 +118,7 @@ public:
Sparc::F24, Sparc::F25, Sparc::F26, Sparc::F27,
Sparc::F28, Sparc::F29, Sparc::F30, Sparc::F31 };
- static unsigned DoubleRegs[32] = {
+ static const MCPhysReg DoubleRegs[32] = {
Sparc::D0, Sparc::D1, Sparc::D2, Sparc::D3,
Sparc::D4, Sparc::D5, Sparc::D6, Sparc::D7,
     Sparc::D8,  Sparc::D9,  Sparc::D10, Sparc::D11,
@@ -124,13 +128,13 @@ public:
Sparc::D24, Sparc::D25, Sparc::D26, Sparc::D27,
Sparc::D28, Sparc::D29, Sparc::D30, Sparc::D31 };
- static unsigned QuadFPRegs[32] = {
+ static const MCPhysReg QuadFPRegs[32] = {
Sparc::Q0, Sparc::Q1, Sparc::Q2, Sparc::Q3,
Sparc::Q4, Sparc::Q5, Sparc::Q6, Sparc::Q7,
Sparc::Q8, Sparc::Q9, Sparc::Q10, Sparc::Q11,
Sparc::Q12, Sparc::Q13, Sparc::Q14, Sparc::Q15 };
- static unsigned ASRRegs[32] = {
+ static const MCPhysReg ASRRegs[32] = {
SP::Y, SP::ASR1, SP::ASR2, SP::ASR3,
SP::ASR4, SP::ASR5, SP::ASR6, SP::ASR7,
SP::ASR8, SP::ASR9, SP::ASR10, SP::ASR11,
@@ -140,6 +144,12 @@ public:
SP::ASR24, SP::ASR25, SP::ASR26, SP::ASR27,
SP::ASR28, SP::ASR29, SP::ASR30, SP::ASR31};
+ static const MCPhysReg IntPairRegs[] = {
+ Sparc::G0_G1, Sparc::G2_G3, Sparc::G4_G5, Sparc::G6_G7,
+ Sparc::O0_O1, Sparc::O2_O3, Sparc::O4_O5, Sparc::O6_O7,
+ Sparc::L0_L1, Sparc::L2_L3, Sparc::L4_L5, Sparc::L6_L7,
+ Sparc::I0_I1, Sparc::I2_I3, Sparc::I4_I5, Sparc::I6_I7};
+
/// SparcOperand - Instances of this class represent a parsed Sparc machine
/// instruction.
class SparcOperand : public MCParsedAsmOperand {
@@ -147,6 +157,7 @@ public:
enum RegisterKind {
rk_None,
rk_IntReg,
+ rk_IntPairReg,
rk_FloatReg,
rk_DoubleReg,
rk_QuadReg,
@@ -200,6 +211,10 @@ public:
bool isMEMrr() const { return Kind == k_MemoryReg; }
bool isMEMri() const { return Kind == k_MemoryImm; }
+ bool isIntReg() const {
+ return (Kind == k_Register && Reg.Kind == rk_IntReg);
+ }
+
bool isFloatReg() const {
return (Kind == k_Register && Reg.Kind == rk_FloatReg);
}
@@ -330,6 +345,25 @@ public:
return Op;
}
+ static bool MorphToIntPairReg(SparcOperand &Op) {
+ unsigned Reg = Op.getReg();
+ assert(Op.Reg.Kind == rk_IntReg);
+ unsigned regIdx = 32;
+ if (Reg >= Sparc::G0 && Reg <= Sparc::G7)
+ regIdx = Reg - Sparc::G0;
+ else if (Reg >= Sparc::O0 && Reg <= Sparc::O7)
+ regIdx = Reg - Sparc::O0 + 8;
+ else if (Reg >= Sparc::L0 && Reg <= Sparc::L7)
+ regIdx = Reg - Sparc::L0 + 16;
+ else if (Reg >= Sparc::I0 && Reg <= Sparc::I7)
+ regIdx = Reg - Sparc::I0 + 24;
+ if (regIdx % 2 || regIdx > 31)
+ return false;
+ Op.Reg.RegNum = IntPairRegs[regIdx / 2];
+ Op.Reg.Kind = rk_IntPairReg;
+ return true;
+ }
+
static bool MorphToDoubleReg(SparcOperand &Op) {
unsigned Reg = Op.getReg();
assert(Op.Reg.Kind == rk_FloatReg);
@@ -407,7 +441,22 @@ void SparcAsmParser::expandSET(MCInst &Inst, SMLoc IDLoc,
// the imm operand can be either an expression or an immediate.
bool IsImm = Inst.getOperand(1).isImm();
- uint64_t ImmValue = IsImm ? MCValOp.getImm() : 0;
+ int64_t RawImmValue = IsImm ? MCValOp.getImm() : 0;
+
+ // Allow either a signed or unsigned 32-bit immediate.
+ if (RawImmValue < -2147483648LL || RawImmValue > 4294967295LL) {
+ Error(IDLoc, "set: argument must be between -2147483648 and 4294967295");
+ return;
+ }
+
+ // If the value was expressed as a large unsigned number, that's ok.
+ // We want to see if it "looks like" a small signed number.
+ int32_t ImmValue = RawImmValue;
+ // For 'set' you can't use 'or' with a negative operand on V9 because
+ // that would splat the sign bit across the upper half of the destination
+ // register, whereas 'set' is defined to zero the high 32 bits.
+ bool IsEffectivelyImm13 =
+ IsImm && ((is64Bit() ? 0 : -4096) <= ImmValue && ImmValue < 4096);
const MCExpr *ValExpr;
if (IsImm)
ValExpr = MCConstantExpr::create(ImmValue, getContext());
@@ -416,10 +465,12 @@ void SparcAsmParser::expandSET(MCInst &Inst, SMLoc IDLoc,
MCOperand PrevReg = MCOperand::createReg(Sparc::G0);
- if (!IsImm || (ImmValue & ~0x1fff)) {
+ // If not just a signed imm13 value, then either we use a 'sethi' with a
+ // following 'or', or a 'sethi' by itself if there are no more 1 bits.
+ // In either case, start with the 'sethi'.
+ if (!IsEffectivelyImm13) {
MCInst TmpInst;
- const MCExpr *Expr =
- SparcMCExpr::create(SparcMCExpr::VK_Sparc_HI, ValExpr, getContext());
+ const MCExpr *Expr = adjustPICRelocation(SparcMCExpr::VK_Sparc_HI, ValExpr);
TmpInst.setLoc(IDLoc);
TmpInst.setOpcode(SP::SETHIi);
TmpInst.addOperand(MCRegOp);
@@ -428,10 +479,23 @@ void SparcAsmParser::expandSET(MCInst &Inst, SMLoc IDLoc,
PrevReg = MCRegOp;
}
- if (!IsImm || ((ImmValue & 0x1fff) != 0 || ImmValue == 0)) {
+ // The low bits require touching in 3 cases:
+ // * A non-immediate value will always require both instructions.
+ // * An effectively imm13 value needs only an 'or' instruction.
+ // * Otherwise, an immediate that is not effectively imm13 requires the
+ // 'or' only if bits remain after clearing the 22 bits that 'sethi' set.
+ // If the low bits are known zeros, there's nothing to do.
+ // In the second case, and only in that case, must we NOT clear
+ // bits of the immediate value via the %lo() assembler function.
+ // Note also, the 'or' instruction doesn't mind a large value in the case
+ // where the operand to 'set' was 0xFFFFFzzz - it does exactly what you mean.
+ if (!IsImm || IsEffectivelyImm13 || (ImmValue & 0x3ff)) {
MCInst TmpInst;
- const MCExpr *Expr =
- SparcMCExpr::create(SparcMCExpr::VK_Sparc_LO, ValExpr, getContext());
+ const MCExpr *Expr;
+ if (IsEffectivelyImm13)
+ Expr = ValExpr;
+ else
+ Expr = adjustPICRelocation(SparcMCExpr::VK_Sparc_LO, ValExpr);
TmpInst.setLoc(IDLoc);
TmpInst.setOpcode(SP::ORri);
TmpInst.addOperand(MCRegOp);
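
The rewritten expandSET produces, for a few representative operands, sequences
of the following shape (informal illustration; in PIC mode the %hi/%lo modifiers
are turned into %pc22/%pc10 or %got22/%got10 by adjustPICRelocation):

    //   set 0x12345678, %o1   ; not imm13, low 10 bits nonzero
    //     ->  sethi %hi(0x12345678), %o1
    //         or    %o1, %lo(0x12345678), %o1
    //   set 0x12340000, %o1   ; not imm13, low 10 bits zero
    //     ->  sethi %hi(0x12340000), %o1        ; no 'or' needed
    //   set -1, %o1           ; effectively imm13 (32-bit targets only)
    //     ->  or    %g0, -1, %o1                ; full value, no %lo() masking
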
@@ -463,7 +527,7 @@ bool SparcAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
}
for (const MCInst &I : Instructions) {
- Out.EmitInstruction(I, STI);
+ Out.EmitInstruction(I, getSTI());
}
return false;
}
@@ -742,6 +806,9 @@ SparcAsmParser::parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Op,
case Sparc::PSR:
Op = SparcOperand::CreateToken("%psr", S);
break;
+ case Sparc::FSR:
+ Op = SparcOperand::CreateToken("%fsr", S);
+ break;
case Sparc::WIM:
Op = SparcOperand::CreateToken("%wim", S);
break;
@@ -766,6 +833,7 @@ SparcAsmParser::parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Op,
case AsmToken::Minus:
case AsmToken::Integer:
case AsmToken::LParen:
+ case AsmToken::Dot:
if (!getParser().parseExpression(EVal, E))
Op = SparcOperand::CreateImm(EVal, S, E);
break;
@@ -848,6 +916,13 @@ bool SparcAsmParser::matchRegisterName(const AsmToken &Tok,
return true;
}
+ // %fprs is an alias of %asr6.
+ if (name.equals("fprs")) {
+ RegNo = ASRRegs[6];
+ RegKind = SparcOperand::rk_Special;
+ return true;
+ }
+
if (name.equals("icc")) {
RegNo = Sparc::ICC;
RegKind = SparcOperand::rk_Special;
@@ -860,6 +935,12 @@ bool SparcAsmParser::matchRegisterName(const AsmToken &Tok,
return true;
}
+ if (name.equals("fsr")) {
+ RegNo = Sparc::FSR;
+ RegKind = SparcOperand::rk_Special;
+ return true;
+ }
+
if (name.equals("wim")) {
RegNo = Sparc::WIM;
RegKind = SparcOperand::rk_Special;
@@ -943,6 +1024,82 @@ bool SparcAsmParser::matchRegisterName(const AsmToken &Tok,
RegKind = SparcOperand::rk_IntReg;
return true;
}
+
+ if (name.equals("tpc")) {
+ RegNo = Sparc::TPC;
+ RegKind = SparcOperand::rk_Special;
+ return true;
+ }
+ if (name.equals("tnpc")) {
+ RegNo = Sparc::TNPC;
+ RegKind = SparcOperand::rk_Special;
+ return true;
+ }
+ if (name.equals("tstate")) {
+ RegNo = Sparc::TSTATE;
+ RegKind = SparcOperand::rk_Special;
+ return true;
+ }
+ if (name.equals("tt")) {
+ RegNo = Sparc::TT;
+ RegKind = SparcOperand::rk_Special;
+ return true;
+ }
+ if (name.equals("tick")) {
+ RegNo = Sparc::TICK;
+ RegKind = SparcOperand::rk_Special;
+ return true;
+ }
+ if (name.equals("tba")) {
+ RegNo = Sparc::TBA;
+ RegKind = SparcOperand::rk_Special;
+ return true;
+ }
+ if (name.equals("pstate")) {
+ RegNo = Sparc::PSTATE;
+ RegKind = SparcOperand::rk_Special;
+ return true;
+ }
+ if (name.equals("tl")) {
+ RegNo = Sparc::TL;
+ RegKind = SparcOperand::rk_Special;
+ return true;
+ }
+ if (name.equals("pil")) {
+ RegNo = Sparc::PIL;
+ RegKind = SparcOperand::rk_Special;
+ return true;
+ }
+ if (name.equals("cwp")) {
+ RegNo = Sparc::CWP;
+ RegKind = SparcOperand::rk_Special;
+ return true;
+ }
+ if (name.equals("cansave")) {
+ RegNo = Sparc::CANSAVE;
+ RegKind = SparcOperand::rk_Special;
+ return true;
+ }
+ if (name.equals("canrestore")) {
+ RegNo = Sparc::CANRESTORE;
+ RegKind = SparcOperand::rk_Special;
+ return true;
+ }
+ if (name.equals("cleanwin")) {
+ RegNo = Sparc::CLEANWIN;
+ RegKind = SparcOperand::rk_Special;
+ return true;
+ }
+ if (name.equals("otherwin")) {
+ RegNo = Sparc::OTHERWIN;
+ RegKind = SparcOperand::rk_Special;
+ return true;
+ }
+ if (name.equals("wstate")) {
+ RegNo = Sparc::WSTATE;
+ RegKind = SparcOperand::rk_Special;
+ return true;
+ }
}
return false;
}
@@ -975,6 +1132,32 @@ static bool hasGOTReference(const MCExpr *Expr) {
return false;
}
+const SparcMCExpr *
+SparcAsmParser::adjustPICRelocation(SparcMCExpr::VariantKind VK,
+ const MCExpr *subExpr)
+{
+ // When in PIC mode, "%lo(...)" and "%hi(...)" behave differently.
+ // If the expression contains _GLOBAL_OFFSET_TABLE_, it is actually a
+ // %pc10 or %pc22 relocation. Otherwise, it is interpreted as a %got10
+ // or %got22 relocation.
+
+ if (getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_) {
+ switch(VK) {
+ default: break;
+ case SparcMCExpr::VK_Sparc_LO:
+ VK = (hasGOTReference(subExpr) ? SparcMCExpr::VK_Sparc_PC10
+ : SparcMCExpr::VK_Sparc_GOT10);
+ break;
+ case SparcMCExpr::VK_Sparc_HI:
+ VK = (hasGOTReference(subExpr) ? SparcMCExpr::VK_Sparc_PC22
+ : SparcMCExpr::VK_Sparc_GOT22);
+ break;
+ }
+ }
+
+ return SparcMCExpr::create(VK, subExpr, getContext());
+}
+
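For illustration, the decision adjustPICRelocation implements can be reduced to the following stand-alone sketch; the enum names are hypothetical stand-ins for SparcMCExpr::VariantKind and are not part of the patch.

// Hypothetical stand-ins for SparcMCExpr::VariantKind, illustration only.
enum VariantKind { VK_LO, VK_HI, VK_PC10, VK_PC22, VK_GOT10, VK_GOT22 };

// In PIC mode, %lo/%hi become %pc10/%pc22 when the sub-expression mentions
// _GLOBAL_OFFSET_TABLE_, and %got10/%got22 otherwise; outside PIC mode they
// keep their plain meaning.
VariantKind adjustForPIC(VariantKind VK, bool RefersToGOT, bool IsPIC) {
  if (!IsPIC)
    return VK;
  if (VK == VK_LO)
    return RefersToGOT ? VK_PC10 : VK_GOT10;
  if (VK == VK_HI)
    return RefersToGOT ? VK_PC22 : VK_GOT22;
  return VK;
}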
bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal,
SMLoc &EndLoc)
{
@@ -998,30 +1181,7 @@ bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal,
if (Parser.parseParenExpression(subExpr, EndLoc))
return false;
- bool isPIC = getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_;
-
- // Ugly: if a sparc assembly expression says "%hi(...)" but the
- // expression within contains _GLOBAL_OFFSET_TABLE_, it REALLY means
- // %pc22. Same with %lo -> %pc10. Worse, if it doesn't contain that,
- // the meaning depends on whether the assembler was invoked with
- // -KPIC or not: if so, it really means %got22/%got10; if not, it
- // actually means what it said! Sigh, historical mistakes...
-
- switch(VK) {
- default: break;
- case SparcMCExpr::VK_Sparc_LO:
- VK = (hasGOTReference(subExpr)
- ? SparcMCExpr::VK_Sparc_PC10
- : (isPIC ? SparcMCExpr::VK_Sparc_GOT10 : VK));
- break;
- case SparcMCExpr::VK_Sparc_HI:
- VK = (hasGOTReference(subExpr)
- ? SparcMCExpr::VK_Sparc_PC22
- : (isPIC ? SparcMCExpr::VK_Sparc_GOT22 : VK));
- break;
- }
-
- EVal = SparcMCExpr::create(VK, subExpr, getContext());
+ EVal = adjustPICRelocation(VK, subExpr);
return true;
}
@@ -1051,5 +1211,9 @@ unsigned SparcAsmParser::validateTargetOperandClass(MCParsedAsmOperand &GOp,
break;
}
}
+ if (Op.isIntReg() && Kind == MCK_IntPair) {
+ if (SparcOperand::MorphToIntPairReg(Op))
+ return MCTargetAsmParser::Match_Success;
+ }
return Match_InvalidOperand;
}
diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp
index 38bff44e7542..c689b7f7201e 100644
--- a/lib/Target/Sparc/DelaySlotFiller.cpp
+++ b/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -122,6 +122,8 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
continue;
}
+ // TODO: If we ever want to support v7, this needs to be extended
+ // to cover all floating point operations.
if (!Subtarget->isV9() &&
(MI->getOpcode() == SP::FCMPS || MI->getOpcode() == SP::FCMPD
|| MI->getOpcode() == SP::FCMPQ)) {
diff --git a/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
index 3e56b9e9b883..51751ec511c9 100644
--- a/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
+++ b/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
@@ -117,6 +117,19 @@ static const unsigned ASRRegDecoderTable[] = {
SP::ASR24, SP::ASR25, SP::ASR26, SP::ASR27,
SP::ASR28, SP::ASR29, SP::ASR30, SP::ASR31};
+static const unsigned PRRegDecoderTable[] = {
+ SP::TPC, SP::TNPC, SP::TSTATE, SP::TT, SP::TICK, SP::TBA, SP::PSTATE,
+ SP::TL, SP::PIL, SP::CWP, SP::CANSAVE, SP::CANRESTORE, SP::CLEANWIN,
+ SP::OTHERWIN, SP::WSTATE
+};
+
+static const uint16_t IntPairDecoderTable[] = {
+ SP::G0_G1, SP::G2_G3, SP::G4_G5, SP::G6_G7,
+ SP::O0_O1, SP::O2_O3, SP::O4_O5, SP::O6_O7,
+ SP::L0_L1, SP::L2_L3, SP::L4_L5, SP::L6_L7,
+ SP::I0_I1, SP::I2_I3, SP::I4_I5, SP::I6_I7,
+};
+
static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst,
unsigned RegNo,
uint64_t Address,
@@ -196,9 +209,34 @@ static DecodeStatus DecodeASRRegsRegisterClass(MCInst &Inst, unsigned RegNo,
return MCDisassembler::Success;
}
+static DecodeStatus DecodePRRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo >= array_lengthof(PRRegDecoderTable))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::createReg(PRRegDecoderTable[RegNo]));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeIntPairRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
+
+ if ((RegNo & 1))
+ S = MCDisassembler::SoftFail;
+
+ unsigned RegisterPair = IntPairDecoderTable[RegNo/2];
+ Inst.addOperand(MCOperand::createReg(RegisterPair));
+ return S;
+}
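The pair decoding above can be summarized by the following stand-alone sketch (illustrative only; plain integers replace the LLVM register enums and DecodeStatus).

enum Status { Success, SoftFail, Fail };

// The low bit of the 5-bit register field is dropped, so %o2 (10) and
// %o3 (11) both select the O2_O3 entry of the 16-entry pair table; an odd
// encoding is reported as a soft failure rather than rejected outright.
Status decodeIntPair(unsigned RegNo, unsigned &PairIndex) {
  if (RegNo > 31)
    return Fail;
  PairIndex = RegNo / 2;
  return (RegNo & 1) ? SoftFail : Success;
}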
static DecodeStatus DecodeLoadInt(MCInst &Inst, unsigned insn, uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeLoadIntPair(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodeLoadFP(MCInst &Inst, unsigned insn, uint64_t Address,
const void *Decoder);
static DecodeStatus DecodeLoadDFP(MCInst &Inst, unsigned insn, uint64_t Address,
@@ -207,6 +245,8 @@ static DecodeStatus DecodeLoadQFP(MCInst &Inst, unsigned insn, uint64_t Address,
const void *Decoder);
static DecodeStatus DecodeStoreInt(MCInst &Inst, unsigned insn,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeStoreIntPair(MCInst &Inst, unsigned insn,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeStoreFP(MCInst &Inst, unsigned insn,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeStoreDFP(MCInst &Inst, unsigned insn,
@@ -326,6 +366,12 @@ static DecodeStatus DecodeLoadInt(MCInst &Inst, unsigned insn, uint64_t Address,
DecodeIntRegsRegisterClass);
}
+static DecodeStatus DecodeLoadIntPair(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder) {
+ return DecodeMem(Inst, insn, Address, Decoder, true,
+ DecodeIntPairRegisterClass);
+}
+
static DecodeStatus DecodeLoadFP(MCInst &Inst, unsigned insn, uint64_t Address,
const void *Decoder) {
return DecodeMem(Inst, insn, Address, Decoder, true,
@@ -350,6 +396,12 @@ static DecodeStatus DecodeStoreInt(MCInst &Inst, unsigned insn,
DecodeIntRegsRegisterClass);
}
+static DecodeStatus DecodeStoreIntPair(MCInst &Inst, unsigned insn,
+ uint64_t Address, const void *Decoder) {
+ return DecodeMem(Inst, insn, Address, Decoder, false,
+ DecodeIntPairRegisterClass);
+}
+
static DecodeStatus DecodeStoreFP(MCInst &Inst, unsigned insn, uint64_t Address,
const void *Decoder) {
return DecodeMem(Inst, insn, Address, Decoder, false,
diff --git a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h
index 0b01b88e5250..6f06d1ddae32 100644
--- a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h
+++ b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h
@@ -15,12 +15,9 @@
#define LLVM_LIB_TARGET_SPARC_INSTPRINTER_SPARCINSTPRINTER_H
#include "llvm/MC/MCInstPrinter.h"
-#include "llvm/MC/MCSubtargetInfo.h"
namespace llvm {
-class MCOperand;
-
class SparcInstPrinter : public MCInstPrinter {
public:
SparcInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
index 12386f14443e..ad441227600e 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
@@ -21,6 +21,7 @@ class Triple;
class SparcELFMCAsmInfo : public MCAsmInfoELF {
void anchor() override;
+
public:
explicit SparcELFMCAsmInfo(const Triple &TheTriple);
const MCExpr*
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
index d08ad86dbe04..13f08195c764 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
@@ -90,8 +90,8 @@ public:
const MCAsmLayout *Layout,
const MCFixup *Fixup) const override;
void visitUsedExpr(MCStreamer &Streamer) const override;
- MCSection *findAssociatedSection() const override {
- return getSubExpr()->findAssociatedSection();
+ MCFragment *findAssociatedFragment() const override {
+ return getSubExpr()->findAssociatedFragment();
}
void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
diff --git a/lib/Target/Sparc/SparcAsmPrinter.cpp b/lib/Target/Sparc/SparcAsmPrinter.cpp
index c5f046bfc5bb..e3b0f5266747 100644
--- a/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -267,11 +267,11 @@ void SparcAsmPrinter::EmitInstruction(const MachineInstr *MI)
LowerGETPCXAndEmitMCInsts(MI, getSubtargetInfo());
return;
}
- MachineBasicBlock::const_instr_iterator I = MI;
+ MachineBasicBlock::const_instr_iterator I = MI->getIterator();
MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
do {
MCInst TmpInst;
- LowerSparcMachineInstrToMCInst(I, TmpInst, *this);
+ LowerSparcMachineInstrToMCInst(&*I, TmpInst, *this);
EmitToStreamer(*OutStreamer, TmpInst);
} while ((++I != E) && I->isInsideBundle()); // Delay slot check.
}
@@ -296,7 +296,7 @@ void SparcAsmPrinter::EmitFunctionBodyStart() {
void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
raw_ostream &O) {
- const DataLayout *DL = TM.getDataLayout();
+ const DataLayout &DL = getDataLayout();
const MachineOperand &MO = MI->getOperand (opNum);
SparcMCExpr::VariantKind TF = (SparcMCExpr::VariantKind) MO.getTargetFlags();
@@ -373,7 +373,7 @@ void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
O << MO.getSymbolName();
break;
case MachineOperand::MO_ConstantPoolIndex:
- O << DL->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_"
+ O << DL.getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_"
<< MO.getIndex();
break;
default:
diff --git a/lib/Target/Sparc/SparcCallingConv.td b/lib/Target/Sparc/SparcCallingConv.td
index dfaaabf344a3..0aa29d186dc1 100644
--- a/lib/Target/Sparc/SparcCallingConv.td
+++ b/lib/Target/Sparc/SparcCallingConv.td
@@ -21,7 +21,11 @@ def CC_Sparc32 : CallingConv<[
// i32 f32 arguments get passed in integer registers if there is space.
CCIfType<[i32, f32], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>,
// f64 arguments are split and passed through registers or through stack.
- CCIfType<[f64], CCCustom<"CC_Sparc_Assign_f64">>,
+ CCIfType<[f64], CCCustom<"CC_Sparc_Assign_Split_64">>,
+ // As are v2i32 arguments (this would be the default behavior for
+ // v2i32 if it wasn't allocated to the IntPair register-class)
+ CCIfType<[v2i32], CCCustom<"CC_Sparc_Assign_Split_64">>,
+
// Alternatively, they are assigned to the stack in 4-byte aligned units.
CCAssignToStack<4, 4>
@@ -30,7 +34,8 @@ def CC_Sparc32 : CallingConv<[
def RetCC_Sparc32 : CallingConv<[
CCIfType<[i32], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>,
CCIfType<[f32], CCAssignToReg<[F0, F1, F2, F3]>>,
- CCIfType<[f64], CCAssignToReg<[D0, D1]>>
+ CCIfType<[f64], CCAssignToReg<[D0, D1]>>,
+ CCIfType<[v2i32], CCCustom<"CC_Sparc_Assign_Ret_Split_64">>
]>;
diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp
index c0279daa63d9..39b5e809c9be 100644
--- a/lib/Target/Sparc/SparcFrameLowering.cpp
+++ b/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -44,7 +44,7 @@ void SparcFrameLowering::emitSPAdjustment(MachineFunction &MF,
unsigned ADDrr,
unsigned ADDri) const {
- DebugLoc dl = (MBBI != MBB.end()) ? MBBI->getDebugLoc() : DebugLoc();
+ DebugLoc dl;
const SparcInstrInfo &TII =
*static_cast<const SparcInstrInfo *>(MF.getSubtarget().getInstrInfo());
@@ -90,8 +90,23 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF,
MachineFrameInfo *MFI = MF.getFrameInfo();
const SparcInstrInfo &TII =
*static_cast<const SparcInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const SparcRegisterInfo &RegInfo =
+ *static_cast<const SparcRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
MachineBasicBlock::iterator MBBI = MBB.begin();
- DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+ // Debug location must be unknown since the first debug location is used
+ // to determine the end of the prologue.
+ DebugLoc dl;
+ bool NeedsStackRealignment = RegInfo.needsStackRealignment(MF);
+
+ // FIXME: unfortunately, returning false from canRealignStack
+ // actually just causes needsStackRealignment to return false,
+ // rather than reporting an error, as would be sensible. This is
+ // poor, but fixing that bogosity is going to be a large project.
+ // For now, just check whether it has lied, and report an error here.
+ if (!NeedsStackRealignment && MFI->getMaxAlignment() > getStackAlignment())
+ report_fatal_error("Function \"" + Twine(MF.getName()) + "\" required "
+ "stack re-alignment, but LLVM couldn't handle it "
+ "(probably because it has a dynamic alloca).");
// Get the number of bytes to allocate from the FrameInfo
int NumBytes = (int) MFI->getStackSize();
@@ -104,12 +119,43 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF,
SAVEri = SP::ADDri;
SAVErr = SP::ADDrr;
}
- NumBytes = -MF.getSubtarget<SparcSubtarget>().getAdjustedFrameSize(NumBytes);
- emitSPAdjustment(MF, MBB, MBBI, NumBytes, SAVErr, SAVEri);
+
+ // The SPARC ABI is a bit odd in that it requires a reserved 92-byte
+ // (128 in v9) area in the user's stack, starting at %sp. Thus, the
+ // first part of the stack that can actually be used is located at
+ // %sp + 92.
+ //
+ // We therefore need to add that offset to the total stack size
+ // after all the stack objects are placed by
+ // PrologEpilogInserter calculateFrameObjectOffsets. However, since the
+ // stack needs to be aligned *after* the extra size is added, we need
+ // to disable calculateFrameObjectOffsets's built-in stack alignment,
+ // by having targetHandlesStackFrameRounding return true.
+
+
+ // Add the extra call frame stack size, if needed. (This is the same
+ // code as in PrologEpilogInserter, but also gets disabled by
+ // targetHandlesStackFrameRounding)
+ if (MFI->adjustsStack() && hasReservedCallFrame(MF))
+ NumBytes += MFI->getMaxCallFrameSize();
+
+ // Adds the SPARC subtarget-specific spill area to the stack
+ // size. Also ensures target-required alignment.
+ NumBytes = MF.getSubtarget<SparcSubtarget>().getAdjustedFrameSize(NumBytes);
+
+ // Finally, ensure that the size is sufficiently aligned for the
+ // data on the stack.
+ if (MFI->getMaxAlignment() > 0) {
+ NumBytes = RoundUpToAlignment(NumBytes, MFI->getMaxAlignment());
+ }
+
+ // Update stack size with corrected value.
+ MFI->setStackSize(NumBytes);
+
+ emitSPAdjustment(MF, MBB, MBBI, -NumBytes, SAVErr, SAVEri);
MachineModuleInfo &MMI = MF.getMMI();
- const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
- unsigned regFP = MRI->getDwarfRegNum(SP::I6, true);
+ unsigned regFP = RegInfo.getDwarfRegNum(SP::I6, true);
// Emit ".cfi_def_cfa_register 30".
unsigned CFIIndex =
@@ -122,13 +168,19 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF,
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
- unsigned regInRA = MRI->getDwarfRegNum(SP::I7, true);
- unsigned regOutRA = MRI->getDwarfRegNum(SP::O7, true);
+ unsigned regInRA = RegInfo.getDwarfRegNum(SP::I7, true);
+ unsigned regOutRA = RegInfo.getDwarfRegNum(SP::O7, true);
// Emit ".cfi_register 15, 31".
CFIIndex = MMI.addFrameInst(
MCCFIInstruction::createRegister(nullptr, regOutRA, regInRA));
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
+
+ if (NeedsStackRealignment) {
+ // andn %o6, MaxAlign-1, %o6
+ int MaxAlign = MFI->getMaxAlignment();
+ BuildMI(MBB, MBBI, dl, TII.get(SP::ANDNri), SP::O6).addReg(SP::O6).addImm(MaxAlign - 1);
+ }
}
void SparcFrameLowering::
@@ -167,7 +219,6 @@ void SparcFrameLowering::emitEpilogue(MachineFunction &MF,
if (NumBytes == 0)
return;
- NumBytes = MF.getSubtarget<SparcSubtarget>().getAdjustedFrameSize(NumBytes);
emitSPAdjustment(MF, MBB, MBBI, NumBytes, SP::ADDrr, SP::ADDri);
}
@@ -180,21 +231,69 @@ bool SparcFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
// pointer register. This is true if the function has variable sized allocas or
// if frame pointer elimination is disabled.
bool SparcFrameLowering::hasFP(const MachineFunction &MF) const {
+ const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+
const MachineFrameInfo *MFI = MF.getFrameInfo();
return MF.getTarget().Options.DisableFramePointerElim(MF) ||
- MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken();
+ RegInfo->needsStackRealignment(MF) ||
+ MFI->hasVarSizedObjects() ||
+ MFI->isFrameAddressTaken();
}
+int SparcFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const {
+ const SparcSubtarget &Subtarget = MF.getSubtarget<SparcSubtarget>();
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const SparcRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ const SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+ bool isFixed = MFI->isFixedObjectIndex(FI);
+
+ // Addressable stack objects are accessed using neg. offsets from
+ // %fp, or positive offsets from %sp.
+ bool UseFP;
+
+ // Sparc uses FP-based references in general, even when "hasFP" is
+ // false. That function is rather a misnomer, because %fp is
+ // actually always available, unless isLeafProc.
+ if (FuncInfo->isLeafProc()) {
+ // In a leaf procedure, all offsets need to be %sp-based,
+ // because we haven't caused %fp to actually point to our frame.
+ UseFP = false;
+ } else if (isFixed) {
+ // Otherwise, argument access should always use %fp.
+ UseFP = true;
+ } else if (RegInfo->needsStackRealignment(MF)) {
+ // If there is dynamic stack realignment, all local object
+ // references need to be via %sp, to take account of the
+ // re-alignment.
+ UseFP = false;
+ } else {
+ // Finally, default to using %fp.
+ UseFP = true;
+ }
+
+ int64_t FrameOffset = MF.getFrameInfo()->getObjectOffset(FI) +
+ Subtarget.getStackPointerBias();
+
+ if (UseFP) {
+ FrameReg = RegInfo->getFrameRegister(MF);
+ return FrameOffset;
+ } else {
+ FrameReg = SP::O6; // %sp
+ return FrameOffset + MF.getFrameInfo()->getStackSize();
+ }
+}
+
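The frame-index addressing choice above boils down to a small decision function. Here is a condensed model; the booleans stand in for the FuncInfo/MFI/RegInfo queries and are purely illustrative (the real code additionally adds the stack-pointer bias, and the %sp case also adds the frame size).

enum class Base { FP, SP };

Base pickBase(bool LeafProc, bool FixedObject, bool NeedsRealignment) {
  if (LeafProc)
    return Base::SP;         // %fp is never set up for a leaf routine
  if (FixedObject)
    return Base::FP;         // incoming arguments are addressed off %fp
  if (NeedsRealignment)
    return Base::SP;         // locals must follow the realigned %sp
  return Base::FP;           // default: frame-pointer relative
}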
static bool LLVM_ATTRIBUTE_UNUSED verifyLeafProcRegUse(MachineRegisterInfo *MRI)
{
for (unsigned reg = SP::I0; reg <= SP::I7; ++reg)
- if (MRI->isPhysRegUsed(reg))
+ if (!MRI->reg_nodbg_empty(reg))
return false;
for (unsigned reg = SP::L0; reg <= SP::L7; ++reg)
- if (MRI->isPhysRegUsed(reg))
+ if (!MRI->reg_nodbg_empty(reg))
return false;
return true;
@@ -206,33 +305,42 @@ bool SparcFrameLowering::isLeafProc(MachineFunction &MF) const
MachineRegisterInfo &MRI = MF.getRegInfo();
MachineFrameInfo *MFI = MF.getFrameInfo();
- return !(MFI->hasCalls() // has calls
- || MRI.isPhysRegUsed(SP::L0) // Too many registers needed
- || MRI.isPhysRegUsed(SP::O6) // %SP is used
- || hasFP(MF)); // need %FP
+ return !(MFI->hasCalls() // has calls
+ || !MRI.reg_nodbg_empty(SP::L0) // Too many registers needed
+ || !MRI.reg_nodbg_empty(SP::O6) // %SP is used
+ || hasFP(MF)); // need %FP
}
void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const {
-
MachineRegisterInfo &MRI = MF.getRegInfo();
-
// Remap %i[0-7] to %o[0-7].
for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) {
- if (!MRI.isPhysRegUsed(reg))
+ if (MRI.reg_nodbg_empty(reg))
continue;
- unsigned mapped_reg = (reg - SP::I0 + SP::O0);
- assert(!MRI.isPhysRegUsed(mapped_reg));
+
+ unsigned mapped_reg = reg - SP::I0 + SP::O0;
+ assert(MRI.reg_nodbg_empty(mapped_reg));
// Replace I register with O register.
MRI.replaceRegWith(reg, mapped_reg);
- // Mark the reg unused.
- MRI.setPhysRegUnused(reg);
+ // Also replace register pair super-registers.
+ if ((reg - SP::I0) % 2 == 0) {
+ unsigned preg = (reg - SP::I0) / 2 + SP::I0_I1;
+ unsigned mapped_preg = preg - SP::I0_I1 + SP::O0_O1;
+ MRI.replaceRegWith(preg, mapped_preg);
+ }
}
// Rewrite MBB's Live-ins.
for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
MBB != E; ++MBB) {
+ for (unsigned reg = SP::I0_I1; reg <= SP::I6_I7; ++reg) {
+ if (!MBB->isLiveIn(reg))
+ continue;
+ MBB->removeLiveIn(reg);
+ MBB->addLiveIn(reg - SP::I0_I1 + SP::O0_O1);
+ }
for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) {
if (!MBB->isLiveIn(reg))
continue;
diff --git a/lib/Target/Sparc/SparcFrameLowering.h b/lib/Target/Sparc/SparcFrameLowering.h
index 29fc7b7ba036..cbb4dc04fc23 100644
--- a/lib/Target/Sparc/SparcFrameLowering.h
+++ b/lib/Target/Sparc/SparcFrameLowering.h
@@ -39,6 +39,14 @@ public:
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS = nullptr) const override;
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const override;
+
+ /// targetHandlesStackFrameRounding - Returns true if the target is
+ /// responsible for rounding up the stack frame (probably at emitPrologue
+ /// time).
+ bool targetHandlesStackFrameRounding() const override { return true; }
+
private:
// Remap input registers to output registers for leaf procedure.
void remapRegsForLeafProc(MachineFunction &MF) const;
diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
index 340b72e7940f..c4c641659df3 100644
--- a/lib/Target/Sparc/SparcISelDAGToDAG.cpp
+++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "SparcTargetMachine.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/Compiler.h"
@@ -62,6 +63,7 @@ public:
private:
SDNode* getGlobalBaseReg();
+ SDNode *SelectInlineAsm(SDNode *N);
};
} // end anonymous namespace
@@ -141,6 +143,181 @@ bool SparcDAGToDAGISel::SelectADDRrr(SDValue Addr, SDValue &R1, SDValue &R2) {
return true;
}
+
+// Re-assemble i64 arguments split up in SelectionDAGBuilder's
+// visitInlineAsm / GetRegistersForValue functions.
+//
+// Note: This function was copied from, and is essentially identical
+// to ARMISelDAGToDAG::SelectInlineAsm. It is very unfortunate that
+// such hacking-up is necessary; a rethink of how inline asm operands
+// are handled may be in order to make doing this more sane.
+//
+// TODO: fix inline asm support so I can simply tell it that 'i64'
+// inputs to asm need to be allocated to the IntPair register type,
+// and have that work. Then, delete this function.
+SDNode *SparcDAGToDAGISel::SelectInlineAsm(SDNode *N){
+ std::vector<SDValue> AsmNodeOperands;
+ unsigned Flag, Kind;
+ bool Changed = false;
+ unsigned NumOps = N->getNumOperands();
+
+ // Normally, i64 data is bound to two arbitrary GPRs for "%r"
+ // constraint. However, some instructions (e.g. ldd/std) require
+ // (even/even+1) GPRs.
+
+ // So, here, we check for this case, and mutate the inlineasm to use
+ // a single IntPair register instead, which guarantees such even/odd
+ // placement.
+
+ SDLoc dl(N);
+ SDValue Glue = N->getGluedNode() ? N->getOperand(NumOps-1)
+ : SDValue(nullptr,0);
+
+ SmallVector<bool, 8> OpChanged;
+ // Glue node will be appended late.
+ for(unsigned i = 0, e = N->getGluedNode() ? NumOps - 1 : NumOps; i < e; ++i) {
+ SDValue op = N->getOperand(i);
+ AsmNodeOperands.push_back(op);
+
+ if (i < InlineAsm::Op_FirstOperand)
+ continue;
+
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(i))) {
+ Flag = C->getZExtValue();
+ Kind = InlineAsm::getKind(Flag);
+ }
+ else
+ continue;
+
+ // Immediate operands to inline asm in the SelectionDAG are modeled with
+ // two operands. The first is a constant of value InlineAsm::Kind_Imm, and
+ // the second is a constant with the value of the immediate. If we get here
+ // and we have a Kind_Imm, skip the next operand, and continue.
+ if (Kind == InlineAsm::Kind_Imm) {
+ SDValue op = N->getOperand(++i);
+ AsmNodeOperands.push_back(op);
+ continue;
+ }
+
+ unsigned NumRegs = InlineAsm::getNumOperandRegisters(Flag);
+ if (NumRegs)
+ OpChanged.push_back(false);
+
+ unsigned DefIdx = 0;
+ bool IsTiedToChangedOp = false;
+ // If it's a use that is tied with a previous def, it has no
+ // reg class constraint.
+ if (Changed && InlineAsm::isUseOperandTiedToDef(Flag, DefIdx))
+ IsTiedToChangedOp = OpChanged[DefIdx];
+
+ if (Kind != InlineAsm::Kind_RegUse && Kind != InlineAsm::Kind_RegDef
+ && Kind != InlineAsm::Kind_RegDefEarlyClobber)
+ continue;
+
+ unsigned RC;
+ bool HasRC = InlineAsm::hasRegClassConstraint(Flag, RC);
+ if ((!IsTiedToChangedOp && (!HasRC || RC != SP::IntRegsRegClassID))
+ || NumRegs != 2)
+ continue;
+
+ assert((i+2 < NumOps) && "Invalid number of operands in inline asm");
+ SDValue V0 = N->getOperand(i+1);
+ SDValue V1 = N->getOperand(i+2);
+ unsigned Reg0 = cast<RegisterSDNode>(V0)->getReg();
+ unsigned Reg1 = cast<RegisterSDNode>(V1)->getReg();
+ SDValue PairedReg;
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ if (Kind == InlineAsm::Kind_RegDef ||
+ Kind == InlineAsm::Kind_RegDefEarlyClobber) {
+ // Replace the two GPRs with 1 GPRPair and copy values from GPRPair to
+ // the original GPRs.
+
+ unsigned GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass);
+ PairedReg = CurDAG->getRegister(GPVR, MVT::v2i32);
+ SDValue Chain = SDValue(N,0);
+
+ SDNode *GU = N->getGluedUser();
+ SDValue RegCopy = CurDAG->getCopyFromReg(Chain, dl, GPVR, MVT::v2i32,
+ Chain.getValue(1));
+
+ // Extract values from a GPRPair reg and copy to the original GPR reg.
+ SDValue Sub0 = CurDAG->getTargetExtractSubreg(SP::sub_even, dl, MVT::i32,
+ RegCopy);
+ SDValue Sub1 = CurDAG->getTargetExtractSubreg(SP::sub_odd, dl, MVT::i32,
+ RegCopy);
+ SDValue T0 = CurDAG->getCopyToReg(Sub0, dl, Reg0, Sub0,
+ RegCopy.getValue(1));
+ SDValue T1 = CurDAG->getCopyToReg(Sub1, dl, Reg1, Sub1, T0.getValue(1));
+
+ // Update the original glue user.
+ std::vector<SDValue> Ops(GU->op_begin(), GU->op_end()-1);
+ Ops.push_back(T1.getValue(1));
+ CurDAG->UpdateNodeOperands(GU, Ops);
+ }
+ else {
+ // For Kind == InlineAsm::Kind_RegUse, we first copy two GPRs into a
+ // GPRPair and then pass the GPRPair to the inline asm.
+ SDValue Chain = AsmNodeOperands[InlineAsm::Op_InputChain];
+
+ // As REG_SEQ doesn't take RegisterSDNode, we copy them first.
+ SDValue T0 = CurDAG->getCopyFromReg(Chain, dl, Reg0, MVT::i32,
+ Chain.getValue(1));
+ SDValue T1 = CurDAG->getCopyFromReg(Chain, dl, Reg1, MVT::i32,
+ T0.getValue(1));
+ SDValue Pair = SDValue(
+ CurDAG->getMachineNode(
+ TargetOpcode::REG_SEQUENCE, dl, MVT::v2i32,
+ {
+ CurDAG->getTargetConstant(SP::IntPairRegClassID, dl,
+ MVT::i32),
+ T0,
+ CurDAG->getTargetConstant(SP::sub_even, dl, MVT::i32),
+ T1,
+ CurDAG->getTargetConstant(SP::sub_odd, dl, MVT::i32),
+ }),
+ 0);
+
+ // Copy REG_SEQ into a GPRPair-typed VR and replace the original two
+ // i32 VRs of inline asm with it.
+ unsigned GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass);
+ PairedReg = CurDAG->getRegister(GPVR, MVT::v2i32);
+ Chain = CurDAG->getCopyToReg(T1, dl, GPVR, Pair, T1.getValue(1));
+
+ AsmNodeOperands[InlineAsm::Op_InputChain] = Chain;
+ Glue = Chain.getValue(1);
+ }
+
+ Changed = true;
+
+ if(PairedReg.getNode()) {
+ OpChanged[OpChanged.size() -1 ] = true;
+ Flag = InlineAsm::getFlagWord(Kind, 1 /* RegNum*/);
+ if (IsTiedToChangedOp)
+ Flag = InlineAsm::getFlagWordForMatchingOp(Flag, DefIdx);
+ else
+ Flag = InlineAsm::getFlagWordForRegClass(Flag, SP::IntPairRegClassID);
+ // Replace the current flag.
+ AsmNodeOperands[AsmNodeOperands.size() -1] = CurDAG->getTargetConstant(
+ Flag, dl, MVT::i32);
+ // Add the new register node and skip the original two GPRs.
+ AsmNodeOperands.push_back(PairedReg);
+ // Skip the next two GPRs.
+ i += 2;
+ }
+ }
+
+ if (Glue.getNode())
+ AsmNodeOperands.push_back(Glue);
+ if (!Changed)
+ return nullptr;
+
+ SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N),
+ CurDAG->getVTList(MVT::Other, MVT::Glue), AsmNodeOperands);
+ New->setNodeId(-1);
+ return New.getNode();
+}
+
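The kind of source that exercises SelectInlineAsm looks, for example, like the following hypothetical user code (not part of the patch).

// On 32-bit SPARC a 64-bit "r" operand is split into two i32 virtual
// registers by SelectionDAGBuilder; because std needs an even/odd register
// pair, the routine above folds the two GPR operands into one IntPair.
void store64(unsigned long long v, unsigned long long *p) {
  asm volatile("std %0, [%1]" : : "r"(v), "r"(p) : "memory");
}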
SDNode *SparcDAGToDAGISel::Select(SDNode *N) {
SDLoc dl(N);
if (N->isMachineOpcode()) {
@@ -150,6 +327,12 @@ SDNode *SparcDAGToDAGISel::Select(SDNode *N) {
switch (N->getOpcode()) {
default: break;
+ case ISD::INLINEASM: {
+ SDNode *ResNode = SelectInlineAsm(N);
+ if (ResNode)
+ return ResNode;
+ break;
+ }
case SPISD::GLOBAL_BASE_REG:
return getGlobalBaseReg();
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index 4879d4ee79e5..5e70ffe2223c 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -49,9 +49,9 @@ static bool CC_Sparc_Assign_SRet(unsigned &ValNo, MVT &ValVT,
return true;
}
-static bool CC_Sparc_Assign_f64(unsigned &ValNo, MVT &ValVT,
- MVT &LocVT, CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags, CCState &State)
+static bool CC_Sparc_Assign_Split_64(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State)
{
static const MCPhysReg RegList[] = {
SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
@@ -77,6 +77,29 @@ static bool CC_Sparc_Assign_f64(unsigned &ValNo, MVT &ValVT,
return true;
}
+static bool CC_Sparc_Assign_Ret_Split_64(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State)
+{
+ static const MCPhysReg RegList[] = {
+ SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
+ };
+
+ // Try to get first reg.
+ if (unsigned Reg = State.AllocateReg(RegList))
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ else
+ return false;
+
+ // Try to get second reg.
+ if (unsigned Reg = State.AllocateReg(RegList))
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ else
+ return false;
+
+ return true;
+}
+
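To make the split-64 convention concrete, here is a toy re-enactment of how CC_Sparc_Assign_Split_64 places the two halves of each f64/v2i32 argument. The register names and stack handling are illustrative only; the real handler records CCValAssign locations, and the return variant above fails instead of spilling to the stack.

#include <string>
#include <vector>

std::vector<std::string> assignSplit64(unsigned NumSplitArgs) {
  std::vector<std::string> Locs;
  unsigned NextReg = 0, NextStackOff = 0;
  for (unsigned A = 0; A < NumSplitArgs; ++A)
    for (unsigned Half = 0; Half < 2; ++Half) {
      if (NextReg < 6)                       // %i0..%i5 still available
        Locs.push_back("%i" + std::to_string(NextReg++));
      else {                                 // otherwise a 4-byte stack slot
        Locs.push_back("stack+" + std::to_string(NextStackOff));
        NextStackOff += 4;
      }
    }
  return Locs;
}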
// Allocate a full-sized argument for the 64-bit ABI.
static bool CC_Sparc64_Full(unsigned &ValNo, MVT &ValVT,
MVT &LocVT, CCValAssign::LocInfo &LocInfo,
@@ -202,12 +225,34 @@ SparcTargetLowering::LowerReturn_32(SDValue Chain,
RetOps.push_back(SDValue());
// Copy the result values into the output registers.
- for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ for (unsigned i = 0, realRVLocIdx = 0;
+ i != RVLocs.size();
+ ++i, ++realRVLocIdx) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
- Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(),
- OutVals[i], Flag);
+ SDValue Arg = OutVals[realRVLocIdx];
+
+ if (VA.needsCustom()) {
+ assert(VA.getLocVT() == MVT::v2i32);
+ // Legalize ret v2i32 -> ret 2 x i32 (Basically: do what would
+ // happen by default if this wasn't a legal type)
+
+ SDValue Part0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+ Arg,
+ DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout())));
+ SDValue Part1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+ Arg,
+ DAG.getConstant(1, DL, getVectorIdxTy(DAG.getDataLayout())));
+
+ Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Part0, Flag);
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ VA = RVLocs[++i]; // skip ahead to next loc
+ Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Part1,
+ Flag);
+ } else
+ Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
// Guarantee that all emitted copies are stuck together with flags.
Flag = Chain.getValue(1);
@@ -355,6 +400,7 @@ LowerFormalArguments_32(SDValue Chain,
CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc32);
const unsigned StackOffset = 92;
+ bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();
unsigned InIdx = 0;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i, ++InIdx) {
@@ -375,7 +421,8 @@ LowerFormalArguments_32(SDValue Chain,
if (VA.isRegLoc()) {
if (VA.needsCustom()) {
- assert(VA.getLocVT() == MVT::f64);
+ assert(VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2i32);
+
unsigned VRegHi = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
MF.getRegInfo().addLiveIn(VA.getLocReg(), VRegHi);
SDValue HiVal = DAG.getCopyFromReg(Chain, dl, VRegHi, MVT::i32);
@@ -396,9 +443,13 @@ LowerFormalArguments_32(SDValue Chain,
&SP::IntRegsRegClass);
LoVal = DAG.getCopyFromReg(Chain, dl, loReg, MVT::i32);
}
+
+ if (IsLittleEndian)
+ std::swap(LoVal, HiVal);
+
SDValue WholeValue =
DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoVal, HiVal);
- WholeValue = DAG.getNode(ISD::BITCAST, dl, MVT::f64, WholeValue);
+ WholeValue = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), WholeValue);
InVals.push_back(WholeValue);
continue;
}
@@ -422,7 +473,7 @@ LowerFormalArguments_32(SDValue Chain,
auto PtrVT = getPointerTy(DAG.getDataLayout());
if (VA.needsCustom()) {
- assert(VA.getValVT() == MVT::f64);
+ assert(VA.getValVT() == MVT::f64 || VA.getValVT() == MVT::v2i32);
// If it is double-word aligned, just load.
if (Offset % 8 == 0) {
int FI = MF.getFrameInfo()->CreateFixedObject(8,
@@ -452,9 +503,12 @@ LowerFormalArguments_32(SDValue Chain,
MachinePointerInfo(),
false, false, false, 0);
+ if (IsLittleEndian)
+ std::swap(LoVal, HiVal);
+
SDValue WholeValue =
DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoVal, HiVal);
- WholeValue = DAG.getNode(ISD::BITCAST, dl, MVT::f64, WholeValue);
+ WholeValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), WholeValue);
InVals.push_back(WholeValue);
continue;
}
@@ -468,16 +522,12 @@ LowerFormalArguments_32(SDValue Chain,
Load = DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr,
MachinePointerInfo(),
false, false, false, 0);
+ } else if (VA.getValVT() == MVT::f128) {
+ report_fatal_error("SPARCv8 does not handle f128 in calls; "
+ "pass indirectly");
} else {
- ISD::LoadExtType LoadOp = ISD::SEXTLOAD;
- // Sparc is big endian, so add an offset based on the ObjectVT.
- unsigned Offset = 4-std::max(1U, VA.getValVT().getSizeInBits()/8);
- FIPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, FIPtr,
- DAG.getConstant(Offset, dl, MVT::i32));
- Load = DAG.getExtLoad(LoadOp, dl, MVT::i32, Chain, FIPtr,
- MachinePointerInfo(),
- VA.getValVT(), false, false, false,0);
- Load = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Load);
+ // We shouldn't see any other value types here.
+ llvm_unreachable("Unexpected ValVT encountered in frame lowering.");
}
InVals.push_back(Load);
}
@@ -612,7 +662,7 @@ LowerFormalArguments_64(SDValue Chain,
InVals.push_back(DAG.getLoad(
VA.getValVT(), DL, Chain,
DAG.getFrameIndex(FI, getPointerTy(MF.getDataLayout())),
- MachinePointerInfo::getFixedStack(FI), false, false, false, 0));
+ MachinePointerInfo::getFixedStack(MF, FI), false, false, false, 0));
}
if (!IsVarArg)
@@ -640,9 +690,9 @@ LowerFormalArguments_64(SDValue Chain,
SDValue VArg = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
int FI = MF.getFrameInfo()->CreateFixedObject(8, ArgOffset + ArgArea, true);
auto PtrVT = getPointerTy(MF.getDataLayout());
- OutChains.push_back(
- DAG.getStore(Chain, DL, VArg, DAG.getFrameIndex(FI, PtrVT),
- MachinePointerInfo::getFixedStack(FI), false, false, 0));
+ OutChains.push_back(DAG.getStore(
+ Chain, DL, VArg, DAG.getFrameIndex(FI, PtrVT),
+ MachinePointerInfo::getFixedStack(MF, FI), false, false, 0));
}
if (!OutChains.empty())
@@ -788,7 +838,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
}
if (VA.needsCustom()) {
- assert(VA.getLocVT() == MVT::f64);
+ assert(VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2i32);
if (VA.isMemLoc()) {
unsigned Offset = VA.getLocMemOffset() + StackOffset;
@@ -804,49 +854,53 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
}
}
- SDValue StackPtr = DAG.CreateStackTemporary(MVT::f64, MVT::i32);
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl,
- Arg, StackPtr, MachinePointerInfo(),
- false, false, 0);
- // Sparc is big-endian, so the high part comes first.
- SDValue Hi = DAG.getLoad(MVT::i32, dl, Store, StackPtr,
- MachinePointerInfo(), false, false, false, 0);
- // Increment the pointer to the other half.
- StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
- DAG.getIntPtrConstant(4, dl));
- // Load the low part.
- SDValue Lo = DAG.getLoad(MVT::i32, dl, Store, StackPtr,
- MachinePointerInfo(), false, false, false, 0);
+ if (VA.getLocVT() == MVT::f64) {
+ // Move the float value from the float registers into the
+ // integer registers.
+
+ // TODO: The f64 -> v2i32 conversion is super-inefficient for
+ // constants: it sticks them in the constant pool, then loads
+ // to a fp register, then stores to temp memory, then loads to
+ // integer registers.
+ Arg = DAG.getNode(ISD::BITCAST, dl, MVT::v2i32, Arg);
+ }
+
+ SDValue Part0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ Arg,
+ DAG.getConstant(0, dl, getVectorIdxTy(DAG.getDataLayout())));
+ SDValue Part1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ Arg,
+ DAG.getConstant(1, dl, getVectorIdxTy(DAG.getDataLayout())));
if (VA.isRegLoc()) {
- RegsToPass.push_back(std::make_pair(VA.getLocReg(), Hi));
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Part0));
assert(i+1 != e);
CCValAssign &NextVA = ArgLocs[++i];
if (NextVA.isRegLoc()) {
- RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Lo));
+ RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Part1));
} else {
- // Store the low part in stack.
+ // Store the second part in stack.
unsigned Offset = NextVA.getLocMemOffset() + StackOffset;
SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
- MemOpChains.push_back(DAG.getStore(Chain, dl, Lo, PtrOff,
+ MemOpChains.push_back(DAG.getStore(Chain, dl, Part1, PtrOff,
MachinePointerInfo(),
false, false, 0));
}
} else {
unsigned Offset = VA.getLocMemOffset() + StackOffset;
- // Store the high part.
+ // Store the first part.
SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
- MemOpChains.push_back(DAG.getStore(Chain, dl, Hi, PtrOff,
+ MemOpChains.push_back(DAG.getStore(Chain, dl, Part0, PtrOff,
MachinePointerInfo(),
false, false, 0));
- // Store the low part.
+ // Store the second part.
PtrOff = DAG.getIntPtrConstant(Offset + 4, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
- MemOpChains.push_back(DAG.getStore(Chain, dl, Lo, PtrOff,
+ MemOpChains.push_back(DAG.getStore(Chain, dl, Part1, PtrOff,
MachinePointerInfo(),
false, false, 0));
}
@@ -990,8 +1044,8 @@ SparcTargetLowering::getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const
if (!CalleeFn)
return 0;
- assert(CalleeFn->hasStructRetAttr() &&
- "Callee does not have the StructRet attribute.");
+ // It would be nice to check for the sret attribute on CalleeFn here,
+ // but since it is not part of the function type, any check will misfire.
PointerType *Ty = cast<PointerType>(CalleeFn->arg_begin()->getType());
Type *ElementTy = Ty->getElementType();
@@ -1370,15 +1424,60 @@ static SPCC::CondCodes FPCondCCodeToFCC(ISD::CondCode CC) {
SparcTargetLowering::SparcTargetLowering(TargetMachine &TM,
const SparcSubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
- auto &DL = *TM.getDataLayout();
+ MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
+
+ // Instructions which use registers as conditionals examine all the
+ // bits (as does the pseudo SELECT_CC expansion). I don't think it
+ // matters much whether it's ZeroOrOneBooleanContent or
+ // ZeroOrNegativeOneBooleanContent, so we arbitrarily choose the
+ // former.
+ setBooleanContents(ZeroOrOneBooleanContent);
+ setBooleanVectorContents(ZeroOrOneBooleanContent);
// Set up the register classes.
addRegisterClass(MVT::i32, &SP::IntRegsRegClass);
addRegisterClass(MVT::f32, &SP::FPRegsRegClass);
addRegisterClass(MVT::f64, &SP::DFPRegsRegClass);
addRegisterClass(MVT::f128, &SP::QFPRegsRegClass);
- if (Subtarget->is64Bit())
+ if (Subtarget->is64Bit()) {
addRegisterClass(MVT::i64, &SP::I64RegsRegClass);
+ } else {
+ // On 32-bit SPARC, we also define a register class made of pairs of
+ // 32-bit registers. This is modeled in LLVM as a 2-vector of i32.
+ addRegisterClass(MVT::v2i32, &SP::IntPairRegClass);
+
+ // ...but almost all operations must be expanded, so set that as
+ // the default.
+ for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
+ setOperationAction(Op, MVT::v2i32, Expand);
+ }
+ // Truncating/extending stores/loads are also not supported.
+ for (MVT VT : MVT::integer_vector_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Expand);
+
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, VT, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, VT, Expand);
+
+ setTruncStoreAction(VT, MVT::v2i32, Expand);
+ setTruncStoreAction(MVT::v2i32, VT, Expand);
+ }
+ // However, load and store *are* legal.
+ setOperationAction(ISD::LOAD, MVT::v2i32, Legal);
+ setOperationAction(ISD::STORE, MVT::v2i32, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Legal);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Legal);
+
+ // And we need to promote i64 loads/stores into vector load/store
+ setOperationAction(ISD::LOAD, MVT::i64, Custom);
+ setOperationAction(ISD::STORE, MVT::i64, Custom);
+
+ // Sadly, this doesn't work:
+ // AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
+ // AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
+ }
// Turn FP extload into load/fextend
for (MVT VT : MVT::fp_valuetypes()) {
@@ -1396,10 +1495,10 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM,
setTruncStoreAction(MVT::f128, MVT::f64, Expand);
// Custom legalize GlobalAddress nodes into LO/HI parts.
- setOperationAction(ISD::GlobalAddress, getPointerTy(DL), Custom);
- setOperationAction(ISD::GlobalTLSAddress, getPointerTy(DL), Custom);
- setOperationAction(ISD::ConstantPool, getPointerTy(DL), Custom);
- setOperationAction(ISD::BlockAddress, getPointerTy(DL), Custom);
+ setOperationAction(ISD::GlobalAddress, PtrVT, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom);
+ setOperationAction(ISD::ConstantPool, PtrVT, Custom);
+ setOperationAction(ISD::BlockAddress, PtrVT, Custom);
// Sparc doesn't have sext_inreg, replace them with shl/sra
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
@@ -1579,9 +1678,6 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM,
setOperationAction(ISD::STACKRESTORE , MVT::Other, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom);
- setExceptionPointerRegister(SP::I0);
- setExceptionSelectorRegister(SP::I1);
-
setStackPointerRegisterToSaveRestore(SP::O6);
setOperationAction(ISD::CTPOP, MVT::i32,
@@ -1744,18 +1840,15 @@ void SparcTargetLowering::computeKnownBitsForTargetNode
// set LHS/RHS and SPCC to the LHS/RHS of the setcc and SPCC to the condition.
static void LookThroughSetCC(SDValue &LHS, SDValue &RHS,
ISD::CondCode CC, unsigned &SPCC) {
- if (isa<ConstantSDNode>(RHS) &&
- cast<ConstantSDNode>(RHS)->isNullValue() &&
+ if (isNullConstant(RHS) &&
CC == ISD::SETNE &&
(((LHS.getOpcode() == SPISD::SELECT_ICC ||
LHS.getOpcode() == SPISD::SELECT_XCC) &&
LHS.getOperand(3).getOpcode() == SPISD::CMPICC) ||
(LHS.getOpcode() == SPISD::SELECT_FCC &&
LHS.getOperand(3).getOpcode() == SPISD::CMPFCC)) &&
- isa<ConstantSDNode>(LHS.getOperand(0)) &&
- isa<ConstantSDNode>(LHS.getOperand(1)) &&
- cast<ConstantSDNode>(LHS.getOperand(0))->isOne() &&
- cast<ConstantSDNode>(LHS.getOperand(1))->isNullValue()) {
+ isOneConstant(LHS.getOperand(0)) &&
+ isNullConstant(LHS.getOperand(1))) {
SDValue CMPCC = LHS.getOperand(3);
SPCC = cast<ConstantSDNode>(LHS.getOperand(2))->getZExtValue();
LHS = CMPCC.getOperand(0);
@@ -1821,7 +1914,8 @@ SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
MFI->setHasCalls(true);
return DAG.getLoad(VT, DL, DAG.getEntryNode(), AbsAddr,
- MachinePointerInfo::getGOT(), false, false, false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()),
+ false, false, false, 0);
}
// This is one of the absolute code models.
@@ -1872,6 +1966,9 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+ if (DAG.getTarget().Options.EmulatedTLS)
+ return LowerToTLSEmulatedModel(GA, DAG);
+
SDLoc DL(GA);
const GlobalValue *GV = GA->getGlobal();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
@@ -2601,6 +2698,17 @@ static SDValue LowerF128Load(SDValue Op, SelectionDAG &DAG)
return DAG.getMergeValues(Ops, dl);
}
+static SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG)
+{
+ LoadSDNode *LdNode = cast<LoadSDNode>(Op.getNode());
+
+ EVT MemVT = LdNode->getMemoryVT();
+ if (MemVT == MVT::f128)
+ return LowerF128Load(Op, DAG);
+
+ return Op;
+}
+
// Lower a f128 store into two f64 stores.
static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
@@ -2645,6 +2753,29 @@ static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
+static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG)
+{
+ SDLoc dl(Op);
+ StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
+
+ EVT MemVT = St->getMemoryVT();
+ if (MemVT == MVT::f128)
+ return LowerF128Store(Op, DAG);
+
+ if (MemVT == MVT::i64) {
+ // Custom handling for i64 stores: turn it into a bitcast and a
+ // v2i32 store.
+ SDValue Val = DAG.getNode(ISD::BITCAST, dl, MVT::v2i32, St->getValue());
+ SDValue Chain = DAG.getStore(
+ St->getChain(), dl, Val, St->getBasePtr(), St->getPointerInfo(),
+ St->isVolatile(), St->isNonTemporal(), St->getAlignment(),
+ St->getAAInfo());
+ return Chain;
+ }
+
+ return SDValue();
+}
+
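For reference, the lane ordering implied by the i64 <-> v2i32 reinterpretation used above (and by the LOAD result replacement later in this file) can be sketched as follows; this is a host-side illustration only, not part of the patch.

#include <cstdint>

// SPARC is big-endian, so lane 0 of the v2i32 pair holds the high word of
// the original i64; a little-endian configuration would have to swap, which
// is what the IsLittleEndian handling earlier in this file does.
uint64_t pairToI64(uint32_t Lane0, uint32_t Lane1, bool BigEndian = true) {
  uint32_t Hi = BigEndian ? Lane0 : Lane1;
  uint32_t Lo = BigEndian ? Lane1 : Lane0;
  return (uint64_t(Hi) << 32) | Lo;
}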
static SDValue LowerFNEGorFABS(SDValue Op, SelectionDAG &DAG, bool isV9) {
assert((Op.getOpcode() == ISD::FNEG || Op.getOpcode() == ISD::FABS)
&& "invalid opcode");
@@ -2752,7 +2883,7 @@ static SDValue LowerUMULO_SMULO(SDValue Op, SelectionDAG &DAG,
SDValue MulResult = TLI.makeLibCall(DAG,
RTLIB::MUL_I128, WideVT,
- Args, 4, isSigned, dl).first;
+ Args, isSigned, dl).first;
SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT,
MulResult, DAG.getIntPtrConstant(0, dl));
SDValue TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT,
@@ -2783,7 +2914,6 @@ static SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) {
return SDValue();
}
-
SDValue SparcTargetLowering::
LowerOperation(SDValue Op, SelectionDAG &DAG) const {
@@ -2818,8 +2948,8 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG,
Subtarget);
- case ISD::LOAD: return LowerF128Load(Op, DAG);
- case ISD::STORE: return LowerF128Store(Op, DAG);
+ case ISD::LOAD: return LowerLOAD(Op, DAG);
+ case ISD::STORE: return LowerSTORE(Op, DAG);
case ISD::FADD: return LowerF128Op(Op, DAG,
getLibcallName(RTLIB::ADD_F128), 2);
case ISD::FSUB: return LowerF128Op(Op, DAG,
@@ -2921,8 +3051,7 @@ SparcTargetLowering::expandSelectCC(MachineInstr *MI,
// to set, the condition code register to branch on, the true/false values to
// select between, and a branch opcode to use.
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator It = BB;
- ++It;
+ MachineFunction::iterator It = ++BB->getIterator();
// thisMBB:
// ...
@@ -3007,7 +3136,7 @@ SparcTargetLowering::expandAtomicRMW(MachineInstr *MI,
.addReg(AddrReg).addImm(0);
// Split the basic block MBB before MI and insert the loop block in the hole.
- MachineFunction::iterator MFI = MBB;
+ MachineFunction::iterator MFI = MBB->getIterator();
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
MachineFunction *MF = MBB->getParent();
MachineBasicBlock *LoopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
@@ -3149,9 +3278,12 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'r':
- return std::make_pair(0U, &SP::IntRegsRegClass);
+ if (VT == MVT::v2i32)
+ return std::make_pair(0U, &SP::IntPairRegClass);
+ else
+ return std::make_pair(0U, &SP::IntRegsRegClass);
}
- } else if (!Constraint.empty() && Constraint.size() <= 5
+ } else if (!Constraint.empty() && Constraint.size() <= 5
&& Constraint[0] == '{' && *(Constraint.end()-1) == '}') {
// constraint = '{r<d>}'
// Remove the braces from around the name.
@@ -3227,5 +3359,24 @@ void SparcTargetLowering::ReplaceNodeResults(SDNode *N,
getLibcallName(libCall),
1));
return;
+ case ISD::LOAD: {
+ LoadSDNode *Ld = cast<LoadSDNode>(N);
+ // Custom handling only for i64: turn i64 load into a v2i32 load,
+ // and a bitcast.
+ if (Ld->getValueType(0) != MVT::i64 || Ld->getMemoryVT() != MVT::i64)
+ return;
+
+ SDLoc dl(N);
+ SDValue LoadRes = DAG.getExtLoad(
+ Ld->getExtensionType(), dl, MVT::v2i32,
+ Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
+ MVT::v2i32, Ld->isVolatile(), Ld->isNonTemporal(),
+ Ld->isInvariant(), Ld->getAlignment(), Ld->getAAInfo());
+
+ SDValue Res = DAG.getNode(ISD::BITCAST, dl, MVT::i64, LoadRes);
+ Results.push_back(Res);
+ Results.push_back(LoadRes.getValue(1));
+ return;
+ }
}
}
diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h
index bbc91a493c9d..4e46709cfc09 100644
--- a/lib/Target/Sparc/SparcISelLowering.h
+++ b/lib/Target/Sparc/SparcISelLowering.h
@@ -89,6 +89,20 @@ namespace llvm {
return MVT::i32;
}
+ /// If a physical register, this returns the register that receives the
+ /// exception address on entry to an EH pad.
+ unsigned
+ getExceptionPointerRegister(const Constant *PersonalityFn) const override {
+ return SP::I0;
+ }
+
+ /// If a physical register, this returns the register that receives the
+ /// exception typeid on entry to a landing pad.
+ unsigned
+ getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
+ return SP::I1;
+ }
+
/// getSetCCResultType - Return the ISD::SETCC ValueType
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
@@ -167,8 +181,8 @@ namespace llvm {
}
void ReplaceNodeResults(SDNode *N,
- SmallVectorImpl<SDValue>& Results,
- SelectionDAG &DAG) const override;
+ SmallVectorImpl<SDValue>& Results,
+ SelectionDAG &DAG) const override;
MachineBasicBlock *expandSelectCC(MachineInstr *MI, MachineBasicBlock *BB,
unsigned BROpcode) const;
diff --git a/lib/Target/Sparc/SparcInstrAliases.td b/lib/Target/Sparc/SparcInstrAliases.td
index 25cc652dbd9e..d51e2ccc8a35 100644
--- a/lib/Target/Sparc/SparcInstrAliases.td
+++ b/lib/Target/Sparc/SparcInstrAliases.td
@@ -250,6 +250,7 @@ defm : int_cond_alias<"n", 0b0000>;
defm : int_cond_alias<"ne", 0b1001>;
defm : int_cond_alias<"nz", 0b1001>; // same as ne
defm : int_cond_alias<"e", 0b0001>;
+defm : int_cond_alias<"eq", 0b0001>; // same as e
defm : int_cond_alias<"z", 0b0001>; // same as e
defm : int_cond_alias<"g", 0b1010>;
defm : int_cond_alias<"le", 0b0010>;
@@ -429,6 +430,9 @@ def : InstAlias<"wr $simm13, %tbr", (WRTBRri G0, i32imm:$simm13), 0>;
def : InstAlias<"flush", (FLUSH), 0>;
+def : MnemonicAlias<"lduw", "ld">, Requires<[HasV9]>;
+def : MnemonicAlias<"lduwa", "lda">, Requires<[HasV9]>;
+
def : MnemonicAlias<"return", "rett">, Requires<[HasV9]>;
def : MnemonicAlias<"addc", "addx">, Requires<[HasV9]>;
@@ -450,3 +454,8 @@ def : InstAlias<"fcmpeq $rs1, $rs2", (V9FCMPEQ FCC0, QFPRegs:$rs1,
QFPRegs:$rs2)>,
Requires<[HasHardQuad]>;
+// signx rd -> sra rd, %g0, rd
+def : InstAlias<"signx $rd", (SRArr IntRegs:$rd, IntRegs:$rd, G0), 0>, Requires<[HasV9]>;
+
+// signx reg, rd -> sra reg, %g0, rd
+def : InstAlias<"signx $rs1, $rd", (SRArr IntRegs:$rd, IntRegs:$rs1, G0), 0>, Requires<[HasV9]>;
diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp
index 6167c532db80..733027a5d2be 100644
--- a/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -284,7 +284,9 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned numSubRegs = 0;
unsigned movOpc = 0;
const unsigned *subRegIdx = nullptr;
+ bool ExtraG0 = false;
+ const unsigned DW_SubRegsIdx[] = { SP::sub_even, SP::sub_odd };
const unsigned DFP_FP_SubRegsIdx[] = { SP::sub_even, SP::sub_odd };
const unsigned QFP_DFP_SubRegsIdx[] = { SP::sub_even64, SP::sub_odd64 };
const unsigned QFP_FP_SubRegsIdx[] = { SP::sub_even, SP::sub_odd,
@@ -294,7 +296,12 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (SP::IntRegsRegClass.contains(DestReg, SrcReg))
BuildMI(MBB, I, DL, get(SP::ORrr), DestReg).addReg(SP::G0)
.addReg(SrcReg, getKillRegState(KillSrc));
- else if (SP::FPRegsRegClass.contains(DestReg, SrcReg))
+ else if (SP::IntPairRegClass.contains(DestReg, SrcReg)) {
+ subRegIdx = DW_SubRegsIdx;
+ numSubRegs = 2;
+ movOpc = SP::ORrr;
+ ExtraG0 = true;
+ } else if (SP::FPRegsRegClass.contains(DestReg, SrcReg))
BuildMI(MBB, I, DL, get(SP::FMOVS), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
else if (SP::DFPRegsRegClass.contains(DestReg, SrcReg)) {
@@ -347,7 +354,11 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned Src = TRI->getSubReg(SrcReg, subRegIdx[i]);
assert(Dst && Src && "Bad sub-register");
- MovMI = BuildMI(MBB, I, DL, get(movOpc), Dst).addReg(Src);
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(movOpc), Dst);
+ if (ExtraG0)
+ MIB.addReg(SP::G0);
+ MIB.addReg(Src);
+ MovMI = MIB.getInstr();
}
// Add implicit super-register defs and kills to the last MovMI.
MovMI->addRegisterDefined(DestReg, TRI);
@@ -365,19 +376,20 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachineFunction *MF = MBB.getParent();
const MachineFrameInfo &MFI = *MF->getFrameInfo();
- MachineMemOperand *MMO =
- MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
- MachineMemOperand::MOStore,
- MFI.getObjectSize(FI),
- MFI.getObjectAlignment(FI));
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore,
+ MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
// On the order of operands here: think "[FrameIdx + 0] = SrcReg".
- if (RC == &SP::I64RegsRegClass)
+ if (RC == &SP::I64RegsRegClass)
BuildMI(MBB, I, DL, get(SP::STXri)).addFrameIndex(FI).addImm(0)
.addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
else if (RC == &SP::IntRegsRegClass)
BuildMI(MBB, I, DL, get(SP::STri)).addFrameIndex(FI).addImm(0)
.addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+ else if (RC == &SP::IntPairRegClass)
+ BuildMI(MBB, I, DL, get(SP::STDri)).addFrameIndex(FI).addImm(0)
+ .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
else if (RC == &SP::FPRegsRegClass)
BuildMI(MBB, I, DL, get(SP::STFri)).addFrameIndex(FI).addImm(0)
.addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
@@ -403,11 +415,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachineFunction *MF = MBB.getParent();
const MachineFrameInfo &MFI = *MF->getFrameInfo();
- MachineMemOperand *MMO =
- MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
- MachineMemOperand::MOLoad,
- MFI.getObjectSize(FI),
- MFI.getObjectAlignment(FI));
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad,
+ MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
if (RC == &SP::I64RegsRegClass)
BuildMI(MBB, I, DL, get(SP::LDXri), DestReg).addFrameIndex(FI).addImm(0)
@@ -415,6 +425,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
else if (RC == &SP::IntRegsRegClass)
BuildMI(MBB, I, DL, get(SP::LDri), DestReg).addFrameIndex(FI).addImm(0)
.addMemOperand(MMO);
+ else if (RC == &SP::IntPairRegClass)
+ BuildMI(MBB, I, DL, get(SP::LDDri), DestReg).addFrameIndex(FI).addImm(0)
+ .addMemOperand(MMO);
else if (RC == &SP::FPRegsRegClass)
BuildMI(MBB, I, DL, get(SP::LDFri), DestReg).addFrameIndex(FI).addImm(0)
.addMemOperand(MMO);
diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
index 3b9e048ea8b3..ec37c22a5b33 100644
--- a/lib/Target/Sparc/SparcInstrInfo.td
+++ b/lib/Target/Sparc/SparcInstrInfo.td
@@ -283,17 +283,32 @@ multiclass Load<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode,
[(set Ty:$dst, (OpNode ADDRri:$addr))]>;
}
+// TODO: Instructions of the LoadASI class are currently asm only; hooking up
+// CodeGen's address spaces to use these is a future task.
+class LoadASI<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode,
+ RegisterClass RC, ValueType Ty> :
+ F3_1_asi<3, Op3Val, (outs RC:$dst), (ins MEMrr:$addr, i8imm:$asi),
+ !strconcat(OpcStr, "a [$addr] $asi, $dst"),
+ []>;
+
// LoadA multiclass - As above, but also define alternate address space variant
multiclass LoadA<string OpcStr, bits<6> Op3Val, bits<6> LoadAOp3Val,
SDPatternOperator OpNode, RegisterClass RC, ValueType Ty> :
Load<OpcStr, Op3Val, OpNode, RC, Ty> {
- // TODO: The LD*Arr instructions are currently asm only; hooking up
- // CodeGen's address spaces to use these is a future task.
- def Arr : F3_1_asi<3, LoadAOp3Val, (outs RC:$dst), (ins MEMrr:$addr, i8imm:$asi),
- !strconcat(OpcStr, "a [$addr] $asi, $dst"),
- []>;
+ def Arr : LoadASI<OpcStr, LoadAOp3Val, OpNode, RC, Ty>;
}
+// The LDSTUB instruction is supported for asm only.
+// It is unlikely that general-purpose code could make use of it.
+// CAS is preferred on SPARC V9.
+def LDSTUBrr : F3_1<3, 0b001101, (outs IntRegs:$dst), (ins MEMrr:$addr),
+ "ldstub [$addr], $dst", []>;
+def LDSTUBri : F3_2<3, 0b001101, (outs IntRegs:$dst), (ins MEMri:$addr),
+ "ldstub [$addr], $dst", []>;
+def LDSTUBArr : F3_1_asi<3, 0b011101, (outs IntRegs:$dst),
+ (ins MEMrr:$addr, i8imm:$asi),
+ "ldstuba [$addr] $asi, $dst", []>;
+
// Store multiclass - Define both Reg+Reg/Reg+Imm patterns in one shot.
multiclass Store<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode,
RegisterClass RC, ValueType Ty> {
@@ -307,14 +322,18 @@ multiclass Store<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode,
[(OpNode Ty:$rd, ADDRri:$addr)]>;
}
-multiclass StoreA<string OpcStr, bits<6> Op3Val, bits<6> StoreAOp3Val,
+// TODO: Instructions of the StoreASI class are currently asm only; hooking up
+// CodeGen's address spaces to use these is a future task.
+class StoreASI<string OpcStr, bits<6> Op3Val,
SDPatternOperator OpNode, RegisterClass RC, ValueType Ty> :
- Store<OpcStr, Op3Val, OpNode, RC, Ty> {
- // TODO: The ST*Arr instructions are currently asm only; hooking up
- // CodeGen's address spaces to use these is a future task.
- def Arr : F3_1_asi<3, StoreAOp3Val, (outs), (ins MEMrr:$addr, RC:$rd, i8imm:$asi),
+ F3_1_asi<3, Op3Val, (outs), (ins MEMrr:$addr, RC:$rd, i8imm:$asi),
!strconcat(OpcStr, "a $rd, [$addr] $asi"),
[]>;
+
+multiclass StoreA<string OpcStr, bits<6> Op3Val, bits<6> StoreAOp3Val,
+ SDPatternOperator OpNode, RegisterClass RC, ValueType Ty> :
+ Store<OpcStr, Op3Val, OpNode, RC, Ty> {
+ def Arr : StoreASI<OpcStr, StoreAOp3Val, OpNode, RC, Ty>;
}
//===----------------------------------------------------------------------===//
@@ -408,15 +427,40 @@ let DecoderMethod = "DecodeLoadInt" in {
defm LD : LoadA<"ld", 0b000000, 0b010000, load, IntRegs, i32>;
}
+let DecoderMethod = "DecodeLoadIntPair" in
+ defm LDD : LoadA<"ldd", 0b000011, 0b010011, load, IntPair, v2i32>;
+
// Section B.2 - Load Floating-point Instructions, p. 92
-let DecoderMethod = "DecodeLoadFP" in
- defm LDF : Load<"ld", 0b100000, load, FPRegs, f32>;
-let DecoderMethod = "DecodeLoadDFP" in
- defm LDDF : Load<"ldd", 0b100011, load, DFPRegs, f64>;
+let DecoderMethod = "DecodeLoadFP" in {
+ defm LDF : Load<"ld", 0b100000, load, FPRegs, f32>;
+ def LDFArr : LoadASI<"ld", 0b110000, load, FPRegs, f32>,
+ Requires<[HasV9]>;
+}
+let DecoderMethod = "DecodeLoadDFP" in {
+ defm LDDF : Load<"ldd", 0b100011, load, DFPRegs, f64>;
+ def LDDFArr : LoadASI<"ldd", 0b110011, load, DFPRegs, f64>,
+ Requires<[HasV9]>;
+}
let DecoderMethod = "DecodeLoadQFP" in
- defm LDQF : Load<"ldq", 0b100010, load, QFPRegs, f128>,
+ defm LDQF : LoadA<"ldq", 0b100010, 0b110010, load, QFPRegs, f128>,
Requires<[HasV9, HasHardQuad]>;
+let DecoderMethod = "DecodeLoadFP" in
+ let Defs = [FSR] in {
+ let rd = 0 in {
+ def LDFSRrr : F3_1<3, 0b100001, (outs), (ins MEMrr:$addr),
+ "ld [$addr], %fsr", []>;
+ def LDFSRri : F3_2<3, 0b100001, (outs), (ins MEMri:$addr),
+ "ld [$addr], %fsr", []>;
+ }
+ let rd = 1 in {
+ def LDXFSRrr : F3_1<3, 0b100001, (outs), (ins MEMrr:$addr),
+ "ldx [$addr], %fsr", []>, Requires<[HasV9]>;
+ def LDXFSRri : F3_2<3, 0b100001, (outs), (ins MEMri:$addr),
+ "ldx [$addr], %fsr", []>, Requires<[HasV9]>;
+ }
+ }
+
// Section B.4 - Store Integer Instructions, p. 95
let DecoderMethod = "DecodeStoreInt" in {
defm STB : StoreA<"stb", 0b000101, 0b010101, truncstorei8, IntRegs, i32>;
@@ -424,15 +468,40 @@ let DecoderMethod = "DecodeStoreInt" in {
defm ST : StoreA<"st", 0b000100, 0b010100, store, IntRegs, i32>;
}
+let DecoderMethod = "DecodeStoreIntPair" in
+ defm STD : StoreA<"std", 0b000111, 0b010111, store, IntPair, v2i32>;
+
// Section B.5 - Store Floating-point Instructions, p. 97
-let DecoderMethod = "DecodeStoreFP" in
+let DecoderMethod = "DecodeStoreFP" in {
defm STF : Store<"st", 0b100100, store, FPRegs, f32>;
-let DecoderMethod = "DecodeStoreDFP" in
- defm STDF : Store<"std", 0b100111, store, DFPRegs, f64>;
+ def STFArr : StoreASI<"st", 0b110100, store, FPRegs, f32>,
+ Requires<[HasV9]>;
+}
+let DecoderMethod = "DecodeStoreDFP" in {
+ defm STDF : Store<"std", 0b100111, store, DFPRegs, f64>;
+ def STDFArr : StoreASI<"std", 0b110111, store, DFPRegs, f64>,
+ Requires<[HasV9]>;
+}
let DecoderMethod = "DecodeStoreQFP" in
- defm STQF : Store<"stq", 0b100110, store, QFPRegs, f128>,
+ defm STQF : StoreA<"stq", 0b100110, 0b110110, store, QFPRegs, f128>,
Requires<[HasV9, HasHardQuad]>;
+let DecoderMethod = "DecodeStoreFP" in
+ let Defs = [FSR] in {
+ let rd = 0 in {
+ def STFSRrr : F3_1<3, 0b100101, (outs MEMrr:$addr), (ins),
+ "st %fsr, [$addr]", []>;
+ def STFSRri : F3_2<3, 0b100101, (outs MEMri:$addr), (ins),
+ "st %fsr, [$addr]", []>;
+ }
+ let rd = 1 in {
+ def STXFSRrr : F3_1<3, 0b100101, (outs MEMrr:$addr), (ins),
+ "stx %fsr, [$addr]", []>, Requires<[HasV9]>;
+ def STXFSRri : F3_2<3, 0b100101, (outs MEMri:$addr), (ins),
+ "stx %fsr, [$addr]", []>, Requires<[HasV9]>;
+ }
+ }
+
// Section B.8 - SWAP Register with Memory Instruction
// (Atomic swap)
let Constraints = "$val = $dst", DecoderMethod = "DecodeSWAP" in {
@@ -559,6 +628,10 @@ let Defs = [Y, ICC] in {
defm SMULCC : F3_12np<"smulcc", 0b011011>;
}
+let Defs = [Y, ICC], Uses = [Y, ICC] in {
+ defm MULSCC : F3_12np<"mulscc", 0b100100>;
+}
+
// Section B.19 - Divide Instructions, p. 115
let Uses = [Y], Defs = [Y] in {
defm UDIV : F3_12np<"udiv", 0b001110>;
@@ -1221,8 +1294,8 @@ let Predicates = [HasV9] in {
// the top 32-bits before using it. To do this clearing, we use a SRLri X,0.
let rs1 = 0 in
def POPCrr : F3_1<2, 0b101110,
- (outs IntRegs:$dst), (ins IntRegs:$src),
- "popc $src, $dst", []>, Requires<[HasV9]>;
+ (outs IntRegs:$rd), (ins IntRegs:$rs2),
+ "popc $rs2, $rd", []>, Requires<[HasV9]>;
def : Pat<(ctpop i32:$src),
(POPCrr (SRLri $src, 0))>;
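
For reference, the clearing trick used by the ctpop pattern above can be checked with a small standalone C++ sketch (register value and constants here are illustrative only, not taken from the patch):

    // POPC counts bits in the whole 64-bit register, so a 32-bit popcount
    // must first clear the upper 32 bits; on SPARC V9 "srl %x, 0" does that
    // by zero-extending the low word.
    #include <cstdint>
    #include <cassert>

    static unsigned popc64(uint64_t X) {
      unsigned N = 0;
      for (; X; X &= X - 1)   // clear the lowest set bit each iteration
        ++N;
      return N;
    }

    int main() {
      uint64_t Reg = 0xffffffff00000003ULL; // stale bits in the upper word
      uint64_t Cleared = (uint32_t)Reg;     // effect of "srl %reg, 0" in 64-bit mode
      assert(popc64(Cleared) == 2);         // counts only the low word's bits
      assert(popc64(Reg) == 34);            // without clearing, the count is wrong
      return 0;
    }
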
@@ -1254,6 +1327,25 @@ let hasSideEffects = 1 in {
}
}
+
+// Section A.43 - Read Privileged Register Instructions
+let Predicates = [HasV9] in {
+let rs2 = 0 in
+ def RDPR : F3_1<2, 0b101010,
+ (outs IntRegs:$rd), (ins PRRegs:$rs1),
+ "rdpr $rs1, $rd", []>;
+}
+
+// Section A.62 - Write Privileged Register Instructions
+let Predicates = [HasV9] in {
+ def WRPRrr : F3_1<2, 0b110010,
+ (outs PRRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
+ "wrpr $rs1, $rs2, $rd", []>;
+ def WRPRri : F3_2<2, 0b110010,
+ (outs PRRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
+ "wrpr $rs1, $simm13, $rd", []>;
+}
+
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
//===----------------------------------------------------------------------===//
@@ -1327,6 +1419,18 @@ def : Pat<(i32 (atomic_load ADDRri:$src)), (LDri ADDRri:$src)>;
def : Pat<(atomic_store ADDRrr:$dst, i32:$val), (STrr ADDRrr:$dst, $val)>;
def : Pat<(atomic_store ADDRri:$dst, i32:$val), (STri ADDRri:$dst, $val)>;
+// extract_vector
+def : Pat<(extractelt (v2i32 IntPair:$Rn), 0),
+ (i32 (EXTRACT_SUBREG IntPair:$Rn, sub_even))>;
+def : Pat<(extractelt (v2i32 IntPair:$Rn), 1),
+ (i32 (EXTRACT_SUBREG IntPair:$Rn, sub_odd))>;
+
+// build_vector
+def : Pat<(build_vector (i32 IntRegs:$a1), (i32 IntRegs:$a2)),
+ (INSERT_SUBREG
+ (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (i32 IntRegs:$a1), sub_even),
+ (i32 IntRegs:$a2), sub_odd)>;
+
include "SparcInstr64Bit.td"
include "SparcInstrVIS.td"
diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp
index 9667bc059f18..da31783ba248 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.cpp
+++ b/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -75,6 +75,18 @@ BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(SP::G6);
Reserved.set(SP::G7);
+ // Also reserve the register pair aliases covering the above
+ // registers, with the same conditions.
+ Reserved.set(SP::G0_G1);
+ if (ReserveAppRegisters)
+ Reserved.set(SP::G2_G3);
+ if (ReserveAppRegisters || !Subtarget.is64Bit())
+ Reserved.set(SP::G4_G5);
+
+ Reserved.set(SP::O6_O7);
+ Reserved.set(SP::I6_I7);
+ Reserved.set(SP::G6_G7);
+
// Unaliased double registers are not available in non-V9 targets.
if (!Subtarget.isV9()) {
for (unsigned n = 0; n != 16; ++n) {
@@ -158,21 +170,15 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MachineInstr &MI = *II;
DebugLoc dl = MI.getDebugLoc();
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
-
- // Addressable stack objects are accessed using neg. offsets from %fp
MachineFunction &MF = *MI.getParent()->getParent();
const SparcSubtarget &Subtarget = MF.getSubtarget<SparcSubtarget>();
- int64_t Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
- MI.getOperand(FIOperandNum + 1).getImm() +
- Subtarget.getStackPointerBias();
- SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
- unsigned FramePtr = SP::I6;
- if (FuncInfo->isLeafProc()) {
- // Use %sp and adjust offset if needed.
- FramePtr = SP::O6;
- int stackSize = MF.getFrameInfo()->getStackSize();
- Offset += (stackSize) ? Subtarget.getAdjustedFrameSize(stackSize) : 0 ;
- }
+ const SparcFrameLowering *TFI = getFrameLowering(MF);
+
+ unsigned FrameReg;
+ int Offset;
+ Offset = TFI->getFrameIndexReference(MF, FrameIndex, FrameReg);
+
+ Offset += MI.getOperand(FIOperandNum + 1).getImm();
if (!Subtarget.isV9() || !Subtarget.hasHardQuad()) {
if (MI.getOpcode() == SP::STQFri) {
@@ -182,8 +188,8 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
unsigned SrcOddReg = getSubReg(SrcReg, SP::sub_odd64);
MachineInstr *StMI =
BuildMI(*MI.getParent(), II, dl, TII.get(SP::STDFri))
- .addReg(FramePtr).addImm(0).addReg(SrcEvenReg);
- replaceFI(MF, II, *StMI, dl, 0, Offset, FramePtr);
+ .addReg(FrameReg).addImm(0).addReg(SrcEvenReg);
+ replaceFI(MF, II, *StMI, dl, 0, Offset, FrameReg);
MI.setDesc(TII.get(SP::STDFri));
MI.getOperand(2).setReg(SrcOddReg);
Offset += 8;
@@ -194,8 +200,8 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
unsigned DestOddReg = getSubReg(DestReg, SP::sub_odd64);
MachineInstr *StMI =
BuildMI(*MI.getParent(), II, dl, TII.get(SP::LDDFri), DestEvenReg)
- .addReg(FramePtr).addImm(0);
- replaceFI(MF, II, *StMI, dl, 1, Offset, FramePtr);
+ .addReg(FrameReg).addImm(0);
+ replaceFI(MF, II, *StMI, dl, 1, Offset, FrameReg);
MI.setDesc(TII.get(SP::LDDFri));
MI.getOperand(0).setReg(DestOddReg);
@@ -203,7 +209,7 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
}
}
- replaceFI(MF, II, MI, dl, FIOperandNum, Offset, FramePtr);
+ replaceFI(MF, II, MI, dl, FIOperandNum, Offset, FrameReg);
}
@@ -211,3 +217,25 @@ unsigned SparcRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
return SP::I6;
}
+// Sparc has no architectural need for stack realignment support,
+// except that LLVM unfortunately currently implements overaligned
+// stack objects by depending upon stack realignment support.
+// If that ever changes, this can probably be deleted.
+bool SparcRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+ if (!TargetRegisterInfo::canRealignStack(MF))
+ return false;
+
+ // Sparc always has a fixed frame pointer register, so we don't need
+ // to worry about reserving it. [Even if we don't have a frame pointer
+ // for our frame, it still cannot be used for other things, or register
+ // window traps will be SADNESS.]
+
+ // If there's a reserved call frame, we can use SP to access locals.
+ if (getFrameLowering(MF)->hasReservedCallFrame(MF))
+ return true;
+
+ // Otherwise, we'd need a base pointer, but those aren't implemented
+ // for SPARC at the moment.
+
+ return false;
+}
diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h
index 764a894fe9a3..32075b1df410 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.h
+++ b/lib/Target/Sparc/SparcRegisterInfo.h
@@ -42,8 +42,10 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo {
void processFunctionBeforeFrameFinalized(MachineFunction &MF,
RegScavenger *RS = nullptr) const;
- // Debug information queries.
unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+ bool canRealignStack(const MachineFunction &MF) const override;
+
};
} // end namespace llvm
diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td
index db8a7e86962d..cca9463562a4 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.td
+++ b/lib/Target/Sparc/SparcRegisterInfo.td
@@ -32,6 +32,12 @@ def sub_odd64 : SubRegIndex<64, 64>;
// Ri - 32-bit integer registers
class Ri<bits<16> Enc, string n> : SparcReg<Enc, n>;
+// Rdi - pairs of 32-bit integer registers
+class Rdi<bits<16> Enc, string n, list<Register> subregs> : SparcReg<Enc, n> {
+ let SubRegs = subregs;
+ let SubRegIndices = [sub_even, sub_odd];
+ let CoveredBySubRegs = 1;
+}
// Rf - 32-bit floating-point registers
class Rf<bits<16> Enc, string n> : SparcReg<Enc, n>;
@@ -54,6 +60,8 @@ def ICC : SparcCtrlReg<0, "ICC">; // This represents icc and xcc in 64-bit code.
foreach I = 0-3 in
def FCC#I : SparcCtrlReg<I, "FCC"#I>;
+def FSR : SparcCtrlReg<0, "FSR">; // Floating-point state register.
+
// Y register
def Y : SparcCtrlReg<0, "Y">, DwarfRegNum<[64]>;
// Ancillary state registers (implementation defined)
@@ -94,6 +102,22 @@ def PSR : SparcCtrlReg<0, "PSR">;
def WIM : SparcCtrlReg<0, "WIM">;
def TBR : SparcCtrlReg<0, "TBR">;
+def TPC : SparcCtrlReg<0, "TPC">;
+def TNPC : SparcCtrlReg<1, "TNPC">;
+def TSTATE : SparcCtrlReg<2, "TSTATE">;
+def TT : SparcCtrlReg<3, "TT">;
+def TICK : SparcCtrlReg<4, "TICK">;
+def TBA : SparcCtrlReg<5, "TBA">;
+def PSTATE : SparcCtrlReg<6, "PSTATE">;
+def TL : SparcCtrlReg<7, "TL">;
+def PIL : SparcCtrlReg<8, "PIL">;
+def CWP : SparcCtrlReg<9, "CWP">;
+def CANSAVE : SparcCtrlReg<10, "CANSAVE">;
+def CANRESTORE : SparcCtrlReg<11, "CANRESTORE">;
+def CLEANWIN : SparcCtrlReg<12, "CLEANWIN">;
+def OTHERWIN : SparcCtrlReg<13, "OTHERWIN">;
+def WSTATE : SparcCtrlReg<14, "WSTATE">;
+
// Integer registers
def G0 : Ri< 0, "G0">, DwarfRegNum<[0]>;
def G1 : Ri< 1, "G1">, DwarfRegNum<[1]>;
@@ -217,6 +241,24 @@ def Q13 : Rq<21, "F52", [D26, D27]>;
def Q14 : Rq<25, "F56", [D28, D29]>;
def Q15 : Rq<29, "F60", [D30, D31]>;
+// Aliases of the integer registers used for LDD/STD double-word operations
+def G0_G1 : Rdi<0, "G0", [G0, G1]>;
+def G2_G3 : Rdi<2, "G2", [G2, G3]>;
+def G4_G5 : Rdi<4, "G4", [G4, G5]>;
+def G6_G7 : Rdi<6, "G6", [G6, G7]>;
+def O0_O1 : Rdi<8, "O0", [O0, O1]>;
+def O2_O3 : Rdi<10, "O2", [O2, O3]>;
+def O4_O5 : Rdi<12, "O4", [O4, O5]>;
+def O6_O7 : Rdi<14, "O6", [O6, O7]>;
+def L0_L1 : Rdi<16, "L0", [L0, L1]>;
+def L2_L3 : Rdi<18, "L2", [L2, L3]>;
+def L4_L5 : Rdi<20, "L4", [L4, L5]>;
+def L6_L7 : Rdi<22, "L6", [L6, L7]>;
+def I0_I1 : Rdi<24, "I0", [I0, I1]>;
+def I2_I3 : Rdi<26, "I2", [I2, I3]>;
+def I4_I5 : Rdi<28, "I4", [I4, I5]>;
+def I6_I7 : Rdi<30, "I6", [I6, I7]>;
+
// Register classes.
//
// FIXME: the register order should be defined in terms of the preferred
@@ -231,6 +273,13 @@ def IntRegs : RegisterClass<"SP", [i32, i64], 32,
(sequence "L%u", 0, 7),
(sequence "O%u", 0, 7))>;
+// Should be in the same order as IntRegs.
+def IntPair : RegisterClass<"SP", [v2i32], 64,
+ (add I0_I1, I2_I3, I4_I5, I6_I7,
+ G0_G1, G2_G3, G4_G5, G6_G7,
+ L0_L1, L2_L3, L4_L5, L6_L7,
+ O0_O1, O2_O3, O4_O5, O6_O7)>;
+
// Register class for 64-bit mode, with a 64-bit spill slot size.
// These are the same as the 32-bit registers, so TableGen will consider this
// to be a sub-class of IntRegs. That works out because requiring a 64-bit
@@ -252,3 +301,8 @@ def ASRRegs : RegisterClass<"SP", [i32], 32,
(add Y, (sequence "ASR%u", 1, 31))> {
let isAllocatable = 0;
}
+
+// Privileged Registers
+def PRRegs : RegisterClass<"SP", [i64], 64,
+ (add TPC, TNPC, TSTATE, TT, TICK, TBA, PSTATE, TL, PIL, CWP,
+ CANSAVE, CANRESTORE, CLEANWIN, OTHERWIN, WSTATE)>;
diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp
index d69da409e428..d701594d27af 100644
--- a/lib/Target/Sparc/SparcSubtarget.cpp
+++ b/lib/Target/Sparc/SparcSubtarget.cpp
@@ -64,7 +64,7 @@ int SparcSubtarget::getAdjustedFrameSize(int frameSize) const {
frameSize += 128;
// Frames with calls must also reserve space for 6 outgoing arguments
// whether they are used or not. LowerCall_64 takes care of that.
- assert(frameSize % 16 == 0 && "Stack size not 16-byte aligned");
+ frameSize = RoundUpToAlignment(frameSize, 16);
} else {
// Emit the correct save instruction based on the number of bytes in
// the frame. Minimum stack frame size according to V8 ABI is:
@@ -81,3 +81,7 @@ int SparcSubtarget::getAdjustedFrameSize(int frameSize) const {
}
return frameSize;
}
+
+bool SparcSubtarget::enableMachineScheduler() const {
+ return true;
+}
diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h
index 9d21911d88f0..e2fd2f04528a 100644
--- a/lib/Target/Sparc/SparcSubtarget.h
+++ b/lib/Target/Sparc/SparcSubtarget.h
@@ -60,6 +60,8 @@ public:
return &TSInfo;
}
+ bool enableMachineScheduler() const override;
+
bool isV9() const { return IsV9; }
bool isVIS() const { return IsVIS; }
bool isVIS2() const { return IsVIS2; }
@@ -85,7 +87,6 @@ public:
/// returns adjusted framesize which includes space for register window
/// spills and arguments.
int getAdjustedFrameSize(int stackSize) const;
-
};
} // end namespace llvm
diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index 3aa4c6bd32d6..9c995bf42b0b 100644
--- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -349,7 +349,6 @@ class SystemZAsmParser : public MCTargetAsmParser {
#include "SystemZGenAsmMatcher.inc"
private:
- MCSubtargetInfo &STI;
MCAsmParser &Parser;
enum RegisterGroup {
RegGR,
@@ -386,14 +385,14 @@ private:
bool parseOperand(OperandVector &Operands, StringRef Mnemonic);
public:
- SystemZAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
+ SystemZAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser,
const MCInstrInfo &MII,
const MCTargetOptions &Options)
- : MCTargetAsmParser(), STI(sti), Parser(parser) {
+ : MCTargetAsmParser(Options, sti), Parser(parser) {
MCAsmParserExtension::Initialize(Parser);
// Initialize the set of available features.
- setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+ setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
}
// Override MCTargetAsmParser.
@@ -533,14 +532,16 @@ bool SystemZAsmParser::parseRegister(Register &Reg) {
}
// Parse a register of group Group. If Regs is nonnull, use it to map
-// the raw register number to LLVM numbering, with zero entries indicating
-// an invalid register. IsAddress says whether the register appears in an
-// address context.
+// the raw register number to LLVM numbering, with zero entries
+// indicating an invalid register. IsAddress says whether the
+// register appears in an address context. Allow FP Group if expecting
+// RegV Group, since the f-prefix yields the FP group even when used
+// with vector instructions.
bool SystemZAsmParser::parseRegister(Register &Reg, RegisterGroup Group,
const unsigned *Regs, bool IsAddress) {
if (parseRegister(Reg))
return true;
- if (Reg.Group != Group)
+ if (Reg.Group != Group && !(Reg.Group == RegFP && Group == RegV))
return Error(Reg.StartLoc, "invalid operand for instruction");
if (Regs && Regs[Reg.Num] == 0)
return Error(Reg.StartLoc, "invalid register pair");
@@ -791,7 +792,7 @@ bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
switch (MatchResult) {
case Match_Success:
Inst.setLoc(IDLoc);
- Out.EmitInstruction(Inst, STI);
+ Out.EmitInstruction(Inst, getSTI());
return false;
case Match_MissingFeature: {
diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
index 059ae3f7fb09..6444cf8e464d 100644
--- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
+++ b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
@@ -60,15 +60,15 @@ void SystemZInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const {
O << '%' << getRegisterName(RegNo);
}
-template<unsigned N>
-void printUImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) {
+template <unsigned N>
+static void printUImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) {
int64_t Value = MI->getOperand(OpNum).getImm();
assert(isUInt<N>(Value) && "Invalid uimm argument");
O << Value;
}
-template<unsigned N>
-void printSImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) {
+template <unsigned N>
+static void printSImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) {
int64_t Value = MI->getOperand(OpNum).getImm();
assert(isInt<N>(Value) && "Invalid simm argument");
O << Value;
diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
index ba55e686f3ef..7ca386fc4cb9 100644
--- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
+++ b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
@@ -15,7 +15,6 @@
#define LLVM_LIB_TARGET_SYSTEMZ_INSTPRINTER_SYSTEMZINSTPRINTER_H
#include "llvm/MC/MCInstPrinter.h"
-#include "llvm/Support/Compiler.h"
namespace llvm {
class MCOperand;
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index 5fefa315a4cf..2115d4480eef 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -226,7 +226,7 @@ extern "C" void LLVMInitializeSystemZTargetMC() {
// Register the MCCodeEmitter.
TargetRegistry::RegisterMCCodeEmitter(TheSystemZTarget,
- createSystemZMCCodeEmitter);
+ createSystemZMCCodeEmitter);
// Register the MCInstrInfo.
TargetRegistry::RegisterMCInstrInfo(TheSystemZTarget,
diff --git a/lib/Target/SystemZ/README.txt b/lib/Target/SystemZ/README.txt
index e089047d013e..cd367d60bab7 100644
--- a/lib/Target/SystemZ/README.txt
+++ b/lib/Target/SystemZ/README.txt
@@ -52,12 +52,6 @@ We don't use the TEST DATA CLASS instructions.
--
-We could use the generic floating-point forms of LOAD COMPLEMENT,
-LOAD NEGATIVE and LOAD POSITIVE in cases where we don't need the
-condition codes. For example, we could use LCDFR instead of LCDBR.
-
---
-
We only use MVC, XC and CLC for constant-length block operations.
We could extend them to variable-length operations too,
using EXECUTE RELATIVE LONG.
diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index 3dca7bd89f05..75273114d62f 100644
--- a/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -288,7 +288,7 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
MCSymbolRefExpr::create(getSymbol(ZCPV->getGlobalValue()),
getModifierVariantKind(ZCPV->getModifier()),
OutContext);
- uint64_t Size = TM.getDataLayout()->getTypeAllocSize(ZCPV->getType());
+ uint64_t Size = getDataLayout().getTypeAllocSize(ZCPV->getType());
OutStreamer->EmitValue(Expr, Size);
}
diff --git a/lib/Target/SystemZ/SystemZConstantPoolValue.cpp b/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
index 44ea1d25f08e..4a6beb67f182 100644
--- a/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
+++ b/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
@@ -26,21 +26,6 @@ SystemZConstantPoolValue::Create(const GlobalValue *GV,
return new SystemZConstantPoolValue(GV, Modifier);
}
-unsigned SystemZConstantPoolValue::getRelocationInfo() const {
- switch (Modifier) {
- case SystemZCP::TLSGD:
- case SystemZCP::TLSLDM:
- case SystemZCP::DTPOFF:
- // May require a dynamic relocation.
- return 2;
- case SystemZCP::NTPOFF:
- // May require a relocation, but the relocations are always resolved
- // by the static linker.
- return 1;
- }
- llvm_unreachable("Unknown modifier");
-}
-
int SystemZConstantPoolValue::
getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment) {
unsigned AlignMask = Alignment - 1;
diff --git a/lib/Target/SystemZ/SystemZConstantPoolValue.h b/lib/Target/SystemZ/SystemZConstantPoolValue.h
index e5f1bb18581b..a71b595560d2 100644
--- a/lib/Target/SystemZ/SystemZConstantPoolValue.h
+++ b/lib/Target/SystemZ/SystemZConstantPoolValue.h
@@ -43,7 +43,6 @@ public:
Create(const GlobalValue *GV, SystemZCP::SystemZCPModifier Modifier);
// Override MachineConstantPoolValue.
- unsigned getRelocationInfo() const override;
int getExistingMachineCPValue(MachineConstantPool *CP,
unsigned Alignment) override;
void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
diff --git a/lib/Target/SystemZ/SystemZElimCompare.cpp b/lib/Target/SystemZ/SystemZElimCompare.cpp
index 16f9adc79f17..4818ed015522 100644
--- a/lib/Target/SystemZ/SystemZElimCompare.cpp
+++ b/lib/Target/SystemZ/SystemZElimCompare.cpp
@@ -37,13 +37,11 @@ namespace {
// instructions.
struct Reference {
Reference()
- : Def(false), Use(false), IndirectDef(false), IndirectUse(false) {}
+ : Def(false), Use(false) {}
Reference &operator|=(const Reference &Other) {
Def |= Other.Def;
- IndirectDef |= Other.IndirectDef;
Use |= Other.Use;
- IndirectUse |= Other.IndirectUse;
return *this;
}
@@ -53,11 +51,6 @@ struct Reference {
// via a sub- or super-register.
bool Def;
bool Use;
-
- // True if the register is defined or used indirectly, by a sub- or
- // super-register.
- bool IndirectDef;
- bool IndirectUse;
};
class SystemZElimCompare : public MachineFunctionPass {
@@ -104,14 +97,12 @@ static bool isCCLiveOut(MachineBasicBlock &MBB) {
return false;
}
-// Return true if any CC result of MI would reflect the value of subreg
-// SubReg of Reg.
-static bool resultTests(MachineInstr *MI, unsigned Reg, unsigned SubReg) {
+// Return true if any CC result of MI would reflect the value of Reg.
+static bool resultTests(MachineInstr *MI, unsigned Reg) {
if (MI->getNumOperands() > 0 &&
MI->getOperand(0).isReg() &&
MI->getOperand(0).isDef() &&
- MI->getOperand(0).getReg() == Reg &&
- MI->getOperand(0).getSubReg() == SubReg)
+ MI->getOperand(0).getReg() == Reg)
return true;
switch (MI->getOpcode()) {
@@ -127,30 +118,25 @@ static bool resultTests(MachineInstr *MI, unsigned Reg, unsigned SubReg) {
case SystemZ::LTEBR:
case SystemZ::LTDBR:
case SystemZ::LTXBR:
- if (MI->getOperand(1).getReg() == Reg &&
- MI->getOperand(1).getSubReg() == SubReg)
+ if (MI->getOperand(1).getReg() == Reg)
return true;
}
return false;
}
-// Describe the references to Reg in MI, including sub- and super-registers.
+// Describe the references to Reg or any of its aliases in MI.
Reference SystemZElimCompare::getRegReferences(MachineInstr *MI, unsigned Reg) {
Reference Ref;
for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
const MachineOperand &MO = MI->getOperand(I);
if (MO.isReg()) {
if (unsigned MOReg = MO.getReg()) {
- if (MOReg == Reg || TRI->regsOverlap(MOReg, Reg)) {
- if (MO.isUse()) {
+ if (TRI->regsOverlap(MOReg, Reg)) {
+ if (MO.isUse())
Ref.Use = true;
- Ref.IndirectUse |= (MOReg != Reg);
- }
- if (MO.isDef()) {
+ else if (MO.isDef())
Ref.Def = true;
- Ref.IndirectDef |= (MOReg != Reg);
- }
}
}
}
@@ -158,6 +144,30 @@ Reference SystemZElimCompare::getRegReferences(MachineInstr *MI, unsigned Reg) {
return Ref;
}
+// Return true if this is a load-and-test which can be optimized the
+// same way as a compare instruction.
+static bool isLoadAndTestAsCmp(MachineInstr *MI) {
+ // If isel used a load-and-test as a compare with 0, the def operand
+ // is dead.
+ return ((MI->getOpcode() == SystemZ::LTEBR ||
+ MI->getOpcode() == SystemZ::LTDBR ||
+ MI->getOpcode() == SystemZ::LTXBR) &&
+ MI->getOperand(0).isDead());
+}
+
+// Return the source register of Compare, which is the unknown value
+// being tested.
+static unsigned getCompareSourceReg(MachineInstr *Compare) {
+ unsigned reg = 0;
+ if (Compare->isCompare())
+ reg = Compare->getOperand(0).getReg();
+ else if (isLoadAndTestAsCmp(Compare))
+ reg = Compare->getOperand(1).getReg();
+ assert (reg);
+
+ return reg;
+}
+
// Compare compares the result of MI against zero. If MI is an addition
// of -1 and if CCUsers is a single branch on nonzero, eliminate the addition
// and convert the branch to a BRCT(G). Return true on success.
@@ -188,7 +198,7 @@ SystemZElimCompare::convertToBRCT(MachineInstr *MI, MachineInstr *Compare,
// We already know that there are no references to the register between
// MI and Compare. Make sure that there are also no references between
// Compare and Branch.
- unsigned SrcReg = Compare->getOperand(0).getReg();
+ unsigned SrcReg = getCompareSourceReg(Compare);
MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch;
for (++MBBI; MBBI != MBBE; ++MBBI)
if (getRegReferences(MBBI, SrcReg))
@@ -196,16 +206,15 @@ SystemZElimCompare::convertToBRCT(MachineInstr *MI, MachineInstr *Compare,
// The transformation is OK. Rebuild Branch as a BRCT(G).
MachineOperand Target(Branch->getOperand(2));
- Branch->RemoveOperand(2);
- Branch->RemoveOperand(1);
- Branch->RemoveOperand(0);
+ while (Branch->getNumOperands())
+ Branch->RemoveOperand(0);
Branch->setDesc(TII->get(BRCT));
MachineInstrBuilder(*Branch->getParent()->getParent(), Branch)
.addOperand(MI->getOperand(0))
.addOperand(MI->getOperand(1))
.addOperand(Target)
.addReg(SystemZ::CC, RegState::ImplicitDefine);
- MI->removeFromParent();
+ MI->eraseFromParent();
return true;
}
@@ -308,6 +317,10 @@ static bool isCompareZero(MachineInstr *Compare) {
return true;
default:
+
+ if (isLoadAndTestAsCmp(Compare))
+ return true;
+
return (Compare->getNumExplicitOperands() == 2 &&
Compare->getOperand(1).isImm() &&
Compare->getOperand(1).getImm() == 0);
@@ -325,8 +338,7 @@ optimizeCompareZero(MachineInstr *Compare,
return false;
// Search back for CC results that are based on the first operand.
- unsigned SrcReg = Compare->getOperand(0).getReg();
- unsigned SrcSubReg = Compare->getOperand(0).getSubReg();
+ unsigned SrcReg = getCompareSourceReg(Compare);
MachineBasicBlock &MBB = *Compare->getParent();
MachineBasicBlock::iterator MBBI = Compare, MBBE = MBB.begin();
Reference CCRefs;
@@ -334,7 +346,7 @@ optimizeCompareZero(MachineInstr *Compare,
while (MBBI != MBBE) {
--MBBI;
MachineInstr *MI = MBBI;
- if (resultTests(MI, SrcReg, SrcSubReg)) {
+ if (resultTests(MI, SrcReg)) {
// Try to remove both MI and Compare by converting a branch to BRCT(G).
// We don't care in this case whether CC is modified between MI and
// Compare.
@@ -435,23 +447,21 @@ bool SystemZElimCompare::processBlock(MachineBasicBlock &MBB) {
while (MBBI != MBB.begin()) {
MachineInstr *MI = --MBBI;
if (CompleteCCUsers &&
- MI->isCompare() &&
+ (MI->isCompare() || isLoadAndTestAsCmp(MI)) &&
(optimizeCompareZero(MI, CCUsers) ||
fuseCompareAndBranch(MI, CCUsers))) {
++MBBI;
- MI->removeFromParent();
+ MI->eraseFromParent();
Changed = true;
CCUsers.clear();
- CompleteCCUsers = true;
continue;
}
- Reference CCRefs(getRegReferences(MI, SystemZ::CC));
- if (CCRefs.Def) {
+ if (MI->definesRegister(SystemZ::CC)) {
CCUsers.clear();
- CompleteCCUsers = !CCRefs.IndirectDef;
+ CompleteCCUsers = true;
}
- if (CompleteCCUsers && CCRefs.Use)
+ if (MI->readsRegister(SystemZ::CC) && CompleteCCUsers)
CCUsers.push_back(MI);
}
return Changed;
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp
index 397de472a6ee..e1b20d0536d1 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -48,7 +48,8 @@ static const TargetFrameLowering::SpillSlot SpillOffsetTable[] = {
SystemZFrameLowering::SystemZFrameLowering()
: TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8,
- -SystemZMC::CallFrameSize, 8) {
+ -SystemZMC::CallFrameSize, 8,
+ false /* StackRealignable */) {
// Create a mapping from register number to save slot offset.
RegSpillOffsets.grow(SystemZ::NUM_TARGET_REGS);
for (unsigned I = 0, E = array_lengthof(SpillOffsetTable); I != E; ++I)
@@ -133,7 +134,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
bool IsVarArg = MF.getFunction()->isVarArg();
- DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+ DebugLoc DL;
// Scan the call-saved GPRs and find the bounds of the register spill area.
unsigned LowGPR = 0;
@@ -322,7 +323,10 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF,
const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
const std::vector<CalleeSavedInfo> &CSI = MFFrame->getCalleeSavedInfo();
bool HasFP = hasFP(MF);
- DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+ // Debug location must be unknown since the first debug location is used
+ // to determine the end of the prologue.
+ DebugLoc DL;
// The current offset of the stack pointer from the CFA.
int64_t SPOffsetFromCFA = -SystemZMC::CFAOffsetFromInitialSP;
@@ -394,7 +398,10 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF,
// Add CFI for the this save.
unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
- int64_t Offset = getFrameIndexOffset(MF, Save.getFrameIdx());
+ unsigned IgnoredFrameReg;
+ int64_t Offset =
+ getFrameIndexReference(MF, Save.getFrameIdx(), IgnoredFrameReg);
+
unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
nullptr, DwarfReg, SPOffsetFromCFA + Offset));
CFIIndexes.push_back(CFIIndex);
@@ -455,9 +462,14 @@ bool SystemZFrameLowering::hasFP(const MachineFunction &MF) const {
MF.getInfo<SystemZMachineFunctionInfo>()->getManipulatesSP());
}
-int SystemZFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
- int FI) const {
+int SystemZFrameLowering::getFrameIndexReference(const MachineFunction &MF,
+ int FI,
+ unsigned &FrameReg) const {
const MachineFrameInfo *MFFrame = MF.getFrameInfo();
+ const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
+
+ // Fill in FrameReg output argument.
+ FrameReg = RI->getFrameRegister(MF);
// Start with the offset of FI from the top of the caller-allocated frame
// (i.e. the top of the 160 bytes allocated by the caller). This initial
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.h b/lib/Target/SystemZ/SystemZFrameLowering.h
index 5ade757f17f7..46bb6b7a7573 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.h
+++ b/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -43,7 +43,8 @@ public:
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
bool hasFP(const MachineFunction &MF) const override;
- int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
void eliminateCallFramePseudoInstr(MachineFunction &MF,
MachineBasicBlock &MBB,
diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index 75fd37f01a19..a9093094d884 100644
--- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -585,7 +585,7 @@ bool SystemZDAGToDAGISel::selectAddress(SDValue Addr,
static void insertDAGNode(SelectionDAG *DAG, SDNode *Pos, SDValue N) {
if (N.getNode()->getNodeId() == -1 ||
N.getNode()->getNodeId() > Pos->getNodeId()) {
- DAG->RepositionNode(Pos, N.getNode());
+ DAG->RepositionNode(Pos->getIterator(), N.getNode());
N.getNode()->setNodeId(Pos->getNodeId());
}
}
@@ -801,7 +801,7 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const {
RxSBG.Input = N.getOperand(0);
return true;
}
-
+
case ISD::ANY_EXTEND:
// Bits above the extended operand are don't-care.
RxSBG.Input = N.getOperand(0);
@@ -818,7 +818,7 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const {
return true;
}
// Fall through.
-
+
case ISD::SIGN_EXTEND: {
// Check that the extension bits are don't-care (i.e. are masked out
// by the final mask).
@@ -938,7 +938,23 @@ SDNode *SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
}
return nullptr;
}
- }
+ }
+
+ // If the RISBG operands require no rotation and just mask the bottom
+ // 8/16 bits, attempt to convert this to an LLC/LLH zero extension.
+ if (RISBG.Rotate == 0 && (RISBG.Mask == 0xff || RISBG.Mask == 0xffff)) {
+ unsigned OpCode = (RISBG.Mask == 0xff ? SystemZ::LLGCR : SystemZ::LLGHR);
+ if (VT == MVT::i32) {
+ if (Subtarget->hasHighWord())
+ OpCode = (RISBG.Mask == 0xff ? SystemZ::LLCRMux : SystemZ::LLHRMux);
+ else
+ OpCode = (RISBG.Mask == 0xff ? SystemZ::LLCR : SystemZ::LLHR);
+ }
+
+ SDValue In = convertTo(DL, VT, RISBG.Input);
+ N = CurDAG->getMachineNode(OpCode, DL, VT, In);
+ return convertTo(DL, VT, SDValue(N, 0)).getNode();
+ }
unsigned Opcode = SystemZ::RISBG;
// Prefer RISBGN if available, since it does not clobber CC.
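
The new RISBG-to-zero-extension path boils down to a simple identity; a hedged standalone sketch (not the compiler code, constants chosen for illustration):

    // With a zero rotate amount, a RISBG whose mask keeps only the low 8 or
    // 16 bits computes exactly a byte or halfword zero-extension, so an
    // LLC/LLH-family instruction can be emitted instead.
    #include <cstdint>
    #include <cassert>

    static uint64_t risbgRotate0(uint64_t In, uint64_t Mask) { return In & Mask; }

    int main() {
      uint64_t In = 0x123456789abcdef0ULL;
      assert(risbgRotate0(In, 0xff)   == (uint64_t)(uint8_t)In);   // LLC-style
      assert(risbgRotate0(In, 0xffff) == (uint64_t)(uint16_t)In);  // LLH-style
      return 0;
    }
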
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 9a753c897519..ee732675fb39 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -84,8 +84,7 @@ static MachineOperand earlyUseOperand(MachineOperand Op) {
SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
const SystemZSubtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
- auto &DL = *TM.getDataLayout();
- MVT PtrVT = getPointerTy(DL);
+ MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
// Set up the register classes.
if (Subtarget.hasHighWord())
@@ -115,8 +114,6 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
computeRegisterProperties(Subtarget.getRegisterInfo());
// Set up special registers.
- setExceptionPointerRegister(SystemZ::R6D);
- setExceptionSelectorRegister(SystemZ::R7D);
setStackPointerRegisterToSaveRestore(SystemZ::R15D);
// TODO: It may be better to default to latency-oriented scheduling, however
@@ -370,7 +367,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
// No special instructions for these.
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FSINCOS, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
+ setOperationAction(ISD::FPOW, VT, Expand);
}
}
@@ -776,9 +775,7 @@ bool SystemZTargetLowering::allowTruncateForTailCall(Type *FromType,
}
bool SystemZTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
- if (!CI->isTailCall())
- return false;
- return true;
+ return CI->isTailCall();
}
// We do not yet support 128-bit single-element vector types. If the user
@@ -939,8 +936,8 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
DAG.getIntPtrConstant(4, DL));
ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN,
- MachinePointerInfo::getFixedStack(FI),
- false, false, false, 0);
+ MachinePointerInfo::getFixedStack(MF, FI), false,
+ false, false, 0);
}
// Convert the value of the argument register into the value that's
@@ -976,9 +973,8 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
&SystemZ::FP64BitRegClass);
SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f64);
MemOps[I] = DAG.getStore(ArgValue.getValue(1), DL, ArgValue, FIN,
- MachinePointerInfo::getFixedStack(FI),
+ MachinePointerInfo::getFixedStack(MF, FI),
false, false, 0);
-
}
// Join the stores, which are independent of one another.
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
@@ -1060,9 +1056,9 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
// Store the argument in a stack slot and pass its address.
SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
- MemOpChains.push_back(DAG.getStore(Chain, DL, ArgValue, SpillSlot,
- MachinePointerInfo::getFixedStack(FI),
- false, false, 0));
+ MemOpChains.push_back(DAG.getStore(
+ Chain, DL, ArgValue, SpillSlot,
+ MachinePointerInfo::getFixedStack(MF, FI), false, false, 0));
ArgValue = SpillSlot;
} else
ArgValue = convertValVTToLocVT(DAG, DL, VA, ArgValue);
@@ -1607,8 +1603,8 @@ static void adjustSubwordCmp(SelectionDAG &DAG, SDLoc DL, Comparison &C) {
} else if (Load->getExtensionType() == ISD::ZEXTLOAD) {
if (Value > Mask)
return;
- assert(C.ICmpType == SystemZICMP::Any &&
- "Signedness shouldn't matter here.");
+ // If the constant is in range, we can use any comparison.
+ C.ICmpType = SystemZICMP::Any;
} else
return;
@@ -2439,7 +2435,8 @@ SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, SystemZII::MO_GOT);
Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
- MachinePointerInfo::getGOT(), false, false, false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()),
+ false, false, false, 0);
}
// If there was a non-zero offset that we didn't fold, create an explicit
@@ -2499,7 +2496,9 @@ SDValue SystemZTargetLowering::lowerTLSGetOffset(GlobalAddressSDNode *Node,
}
SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
- SelectionDAG &DAG) const {
+ SelectionDAG &DAG) const {
+ if (DAG.getTarget().Options.EmulatedTLS)
+ return LowerToTLSEmulatedModel(Node, DAG);
SDLoc DL(Node);
const GlobalValue *GV = Node->getGlobal();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
@@ -2529,9 +2528,10 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
SystemZConstantPoolValue::Create(GV, SystemZCP::TLSGD);
Offset = DAG.getConstantPool(CPV, PtrVT, 8);
- Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
- Offset, MachinePointerInfo::getConstantPool(),
- false, false, false, 0);
+ Offset = DAG.getLoad(
+ PtrVT, DL, DAG.getEntryNode(), Offset,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
+ false, false, 0);
// Call __tls_get_offset to retrieve the offset.
Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_GDCALL, Offset);
@@ -2544,9 +2544,10 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
SystemZConstantPoolValue::Create(GV, SystemZCP::TLSLDM);
Offset = DAG.getConstantPool(CPV, PtrVT, 8);
- Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
- Offset, MachinePointerInfo::getConstantPool(),
- false, false, false, 0);
+ Offset = DAG.getLoad(
+ PtrVT, DL, DAG.getEntryNode(), Offset,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
+ false, false, 0);
// Call __tls_get_offset to retrieve the module base offset.
Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_LDCALL, Offset);
@@ -2562,9 +2563,10 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
CPV = SystemZConstantPoolValue::Create(GV, SystemZCP::DTPOFF);
SDValue DTPOffset = DAG.getConstantPool(CPV, PtrVT, 8);
- DTPOffset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
- DTPOffset, MachinePointerInfo::getConstantPool(),
- false, false, false, 0);
+ DTPOffset = DAG.getLoad(
+ PtrVT, DL, DAG.getEntryNode(), DTPOffset,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
+ false, false, 0);
Offset = DAG.getNode(ISD::ADD, DL, PtrVT, Offset, DTPOffset);
break;
@@ -2575,8 +2577,8 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
Offset = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
SystemZII::MO_INDNTPOFF);
Offset = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Offset);
- Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
- Offset, MachinePointerInfo::getGOT(),
+ Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Offset,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()),
false, false, false, 0);
break;
}
@@ -2587,9 +2589,10 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
SystemZConstantPoolValue::Create(GV, SystemZCP::NTPOFF);
Offset = DAG.getConstantPool(CPV, PtrVT, 8);
- Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
- Offset, MachinePointerInfo::getConstantPool(),
- false, false, false, 0);
+ Offset = DAG.getLoad(
+ PtrVT, DL, DAG.getEntryNode(), Offset,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
+ false, false, 0);
break;
}
}
@@ -2628,10 +2631,10 @@ SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP,
SDValue Result;
if (CP->isMachineConstantPoolEntry())
Result = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
- CP->getAlignment());
+ CP->getAlignment());
else
Result = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
- CP->getAlignment(), CP->getOffset());
+ CP->getAlignment(), CP->getOffset());
// Use LARL to load the address of the constant pool entry.
return DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
@@ -2736,17 +2739,37 @@ SDValue SystemZTargetLowering::lowerVACOPY(SDValue Op,
SDValue SystemZTargetLowering::
lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
+ const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
+ bool RealignOpt = !DAG.getMachineFunction().getFunction()->
+ hasFnAttribute("no-realign-stack");
+
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
+ SDValue Align = Op.getOperand(2);
SDLoc DL(Op);
+ // If the user has set the "no-realign-stack" function attribute,
+ // ignore alloca alignments.
+ uint64_t AlignVal = (RealignOpt ?
+ dyn_cast<ConstantSDNode>(Align)->getZExtValue() : 0);
+
+ uint64_t StackAlign = TFI->getStackAlignment();
+ uint64_t RequiredAlign = std::max(AlignVal, StackAlign);
+ uint64_t ExtraAlignSpace = RequiredAlign - StackAlign;
+
unsigned SPReg = getStackPointerRegisterToSaveRestore();
+ SDValue NeededSpace = Size;
// Get a reference to the stack pointer.
SDValue OldSP = DAG.getCopyFromReg(Chain, DL, SPReg, MVT::i64);
+ // Add extra space for alignment if needed.
+ if (ExtraAlignSpace)
+ NeededSpace = DAG.getNode(ISD::ADD, DL, MVT::i64, NeededSpace,
+ DAG.getConstant(ExtraAlignSpace, DL, MVT::i64));
+
// Get the new stack pointer value.
- SDValue NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, Size);
+ SDValue NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, NeededSpace);
// Copy the new stack pointer back.
Chain = DAG.getCopyToReg(Chain, DL, SPReg, NewSP);
@@ -2757,6 +2780,16 @@ lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
SDValue ArgAdjust = DAG.getNode(SystemZISD::ADJDYNALLOC, DL, MVT::i64);
SDValue Result = DAG.getNode(ISD::ADD, DL, MVT::i64, NewSP, ArgAdjust);
+ // Dynamically realign if needed.
+ if (RequiredAlign > StackAlign) {
+ Result =
+ DAG.getNode(ISD::ADD, DL, MVT::i64, Result,
+ DAG.getConstant(ExtraAlignSpace, DL, MVT::i64));
+ Result =
+ DAG.getNode(ISD::AND, DL, MVT::i64, Result,
+ DAG.getConstant(~(RequiredAlign - 1), DL, MVT::i64));
+ }
+
SDValue Ops[2] = { Result, Chain };
return DAG.getMergeValues(Ops, DL);
}
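
The over-alignment arithmetic introduced in lowerDYNAMIC_STACKALLOC can be checked in isolation; a minimal sketch with made-up values, ignoring the ADJDYNALLOC call-frame adjustment:

    // Allocate Size + ExtraAlignSpace bytes below the old SP, then round the
    // resulting address up to the requested alignment with an add/and pair,
    // mirroring the ISD::ADD/ISD::AND nodes emitted above.
    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t StackAlign = 8;     // target stack alignment
      uint64_t AlignVal   = 32;    // alignment requested by the alloca
      uint64_t Size       = 104;   // bytes requested (already stack-aligned)
      uint64_t OldSP      = 0x10000;

      uint64_t RequiredAlign   = std::max(AlignVal, StackAlign);
      uint64_t ExtraAlignSpace = RequiredAlign - StackAlign;
      uint64_t NewSP  = OldSP - (Size + ExtraAlignSpace);
      uint64_t Result = (NewSP + ExtraAlignSpace) & ~(RequiredAlign - 1);

      assert(Result % RequiredAlign == 0);                 // object is over-aligned
      assert(Result >= NewSP && Result + Size <= OldSP);   // and fits in the block
      return 0;
    }
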
@@ -2837,7 +2870,7 @@ SDValue SystemZTargetLowering::lowerSDIVREM(SDValue Op,
} else if (DAG.ComputeNumSignBits(Op1) > 32) {
Op1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op1);
Opcode = SystemZISD::SDIVREM32;
- } else
+ } else
Opcode = SystemZISD::SDIVREM64;
// DSG(F) takes a 64-bit dividend, so the even register in the GR128
@@ -3247,8 +3280,8 @@ SystemZTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
if (Op->getNumValues() == 1)
return CC;
assert(Op->getNumValues() == 2 && "Expected a CC and non-CC result");
- return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), Op->getVTList(),
- Glued, CC);
+ return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), Op->getVTList(), Glued,
+ CC);
}
unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -3890,7 +3923,7 @@ static SDValue tryBuildVectorShuffle(SelectionDAG &DAG,
GS.addUndef();
} else {
GS.add(SDValue(), ResidueOps.size());
- ResidueOps.push_back(Op);
+ ResidueOps.push_back(BVN->getOperand(I));
}
}
@@ -3901,7 +3934,7 @@ static SDValue tryBuildVectorShuffle(SelectionDAG &DAG,
// Create the BUILD_VECTOR for the remaining elements, if any.
if (!ResidueOps.empty()) {
while (ResidueOps.size() < NumElements)
- ResidueOps.push_back(DAG.getUNDEF(VT.getVectorElementType()));
+ ResidueOps.push_back(DAG.getUNDEF(ResidueOps[0].getValueType()));
for (auto &Op : GS.Ops) {
if (!Op.getNode()) {
Op = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BVN), VT, ResidueOps);
@@ -4204,7 +4237,7 @@ SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue
SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
- unsigned UnpackHigh) const {
+ unsigned UnpackHigh) const {
SDValue PackedOp = Op.getOperand(0);
EVT OutVT = Op.getValueType();
EVT InVT = PackedOp.getValueType();
@@ -4566,9 +4599,9 @@ SDValue SystemZTargetLowering::combineExtract(SDLoc DL, EVT ResVT, EVT VecVT,
}
return Op;
} else if ((Opcode == ISD::SIGN_EXTEND_VECTOR_INREG ||
- Opcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
- Opcode == ISD::ANY_EXTEND_VECTOR_INREG) &&
- canTreatAsByteVector(Op.getValueType()) &&
+ Opcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
+ Opcode == ISD::ANY_EXTEND_VECTOR_INREG) &&
+ canTreatAsByteVector(Op.getValueType()) &&
canTreatAsByteVector(Op.getOperand(0).getValueType())) {
// Make sure that only the unextended bits are significant.
EVT ExtVT = Op.getValueType();
@@ -4579,14 +4612,14 @@ SDValue SystemZTargetLowering::combineExtract(SDLoc DL, EVT ResVT, EVT VecVT,
unsigned SubByte = Byte % ExtBytesPerElement;
unsigned MinSubByte = ExtBytesPerElement - OpBytesPerElement;
if (SubByte < MinSubByte ||
- SubByte + BytesPerElement > ExtBytesPerElement)
- break;
+ SubByte + BytesPerElement > ExtBytesPerElement)
+ break;
// Get the byte offset of the unextended element
Byte = Byte / ExtBytesPerElement * OpBytesPerElement;
// ...then add the byte offset relative to that element.
Byte += SubByte - MinSubByte;
if (Byte % BytesPerElement != 0)
- break;
+ break;
Op = Op.getOperand(0);
Index = Byte / BytesPerElement;
Force = true;
@@ -5611,6 +5644,31 @@ SystemZTargetLowering::emitTransactionBegin(MachineInstr *MI,
return MBB;
}
+MachineBasicBlock *
+SystemZTargetLowering::emitLoadAndTestCmp0(MachineInstr *MI,
+ MachineBasicBlock *MBB,
+ unsigned Opcode) const {
+ MachineFunction &MF = *MBB->getParent();
+ MachineRegisterInfo *MRI = &MF.getRegInfo();
+ const SystemZInstrInfo *TII =
+ static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+ DebugLoc DL = MI->getDebugLoc();
+
+ unsigned SrcReg = MI->getOperand(0).getReg();
+
+ // Create new virtual register of the same class as source.
+ const TargetRegisterClass *RC = MRI->getRegClass(SrcReg);
+ unsigned DstReg = MRI->createVirtualRegister(RC);
+
+ // Replace pseudo with a normal load-and-test that models the def as
+ // well.
+ BuildMI(*MBB, MI, DL, TII->get(Opcode), DstReg)
+ .addReg(SrcReg);
+ MI->eraseFromParent();
+
+ return MBB;
+}
+
MachineBasicBlock *SystemZTargetLowering::
EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const {
switch (MI->getOpcode()) {
@@ -5858,6 +5916,13 @@ EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const {
return emitTransactionBegin(MI, MBB, SystemZ::TBEGIN, true);
case SystemZ::TBEGINC:
return emitTransactionBegin(MI, MBB, SystemZ::TBEGINC, true);
+ case SystemZ::LTEBRCompare_VecPseudo:
+ return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTEBR);
+ case SystemZ::LTDBRCompare_VecPseudo:
+ return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTDBR);
+ case SystemZ::LTXBRCompare_VecPseudo:
+ return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTXBR);
+
default:
llvm_unreachable("Unexpected instr type to insert");
}
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 07ff25144581..391636e5467f 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -409,6 +409,20 @@ public:
return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
}
+ /// If a physical register, this returns the register that receives the
+ /// exception address on entry to an EH pad.
+ unsigned
+ getExceptionPointerRegister(const Constant *PersonalityFn) const override {
+ return SystemZ::R6D;
+ }
+
+ /// If a physical register, this returns the register that receives the
+ /// exception typeid on entry to a landing pad.
+ unsigned
+ getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
+ return SystemZ::R7D;
+ }
+
MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *BB) const
override;
@@ -481,7 +495,7 @@ private:
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
- unsigned UnpackHigh) const;
+ unsigned UnpackHigh) const;
SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
SDValue combineExtract(SDLoc DL, EVT ElemVT, EVT VecVT, SDValue OrigOp,
@@ -530,6 +544,10 @@ private:
MachineBasicBlock *MBB,
unsigned Opcode,
bool NoFloat) const;
+ MachineBasicBlock *emitLoadAndTestCmp0(MachineInstr *MI,
+ MachineBasicBlock *MBB,
+ unsigned Opcode) const;
+
};
} // end namespace llvm
diff --git a/lib/Target/SystemZ/SystemZInstrBuilder.h b/lib/Target/SystemZ/SystemZInstrBuilder.h
index 464f79a3bac9..5a1c874dfa36 100644
--- a/lib/Target/SystemZ/SystemZInstrBuilder.h
+++ b/lib/Target/SystemZ/SystemZInstrBuilder.h
@@ -35,11 +35,9 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI) {
if (MCID.mayStore())
Flags |= MachineMemOperand::MOStore;
int64_t Offset = 0;
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo(
- PseudoSourceValue::getFixedStack(FI), Offset),
- Flags, MFFrame->getObjectSize(FI),
- MFFrame->getObjectAlignment(FI));
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FI, Offset), Flags,
+ MFFrame->getObjectSize(FI), MFFrame->getObjectAlignment(FI));
return MIB.addFrameIndex(FI).addImm(Offset).addReg(0).addMemOperand(MMO);
}
diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td
index 27fbd7df2882..0cb267290cc1 100644
--- a/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/lib/Target/SystemZ/SystemZInstrFP.td
@@ -46,15 +46,28 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
defm LTDBR : LoadAndTestRRE<"ltdb", 0xB312, FP64>;
defm LTXBR : LoadAndTestRRE<"ltxb", 0xB342, FP128>;
}
-// Note that the comparison against zero operation is not available if we
-// have vector support, since load-and-test instructions will partially
-// clobber the target (vector) register.
+// Note that LTxBRCompare is not available if we have vector support,
+// since load-and-test instructions will partially clobber the target
+// (vector) register.
let Predicates = [FeatureNoVector] in {
defm : CompareZeroFP<LTEBRCompare, FP32>;
defm : CompareZeroFP<LTDBRCompare, FP64>;
defm : CompareZeroFP<LTXBRCompare, FP128>;
}
+// Use a normal load-and-test for compares against zero when vector
+// support is available (via a pseudo to simplify instruction selection).
+let Defs = [CC], usesCustomInserter = 1 in {
+ def LTEBRCompare_VecPseudo : Pseudo<(outs), (ins FP32:$R1, FP32:$R2), []>;
+ def LTDBRCompare_VecPseudo : Pseudo<(outs), (ins FP64:$R1, FP64:$R2), []>;
+ def LTXBRCompare_VecPseudo : Pseudo<(outs), (ins FP128:$R1, FP128:$R2), []>;
+}
+let Predicates = [FeatureVector] in {
+ defm : CompareZeroFP<LTEBRCompare_VecPseudo, FP32>;
+ defm : CompareZeroFP<LTDBRCompare_VecPseudo, FP64>;
+ defm : CompareZeroFP<LTXBRCompare_VecPseudo, FP128>;
+}
+
// Moves between 64-bit integer and floating-point registers.
def LGDR : UnaryRRE<"lgd", 0xB3CD, bitconvert, GR64, FP64>;
def LDGR : UnaryRRE<"ldg", 0xB3C1, bitconvert, FP64, GR64>;
@@ -238,26 +251,46 @@ let Predicates = [FeatureFPExtension] in {
// Unary arithmetic
//===----------------------------------------------------------------------===//
+// We prefer generic instructions during isel, because they do not
+// clobber CC and therefore give the scheduler more freedom. In cases
+// where the CC is actually useful, the SystemZElimCompare pass will
+// try to convert generic instructions into opcodes that also set CC.
+// Note that lcdf / lpdf / lndf only affect the sign bit, and can
+// therefore be used with fp32 as well. This could be done for fp128,
+// in which case the operands would have to be tied.
+
// Negation (Load Complement).
let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
- def LCEBR : UnaryRRE<"lceb", 0xB303, fneg, FP32, FP32>;
- def LCDBR : UnaryRRE<"lcdb", 0xB313, fneg, FP64, FP64>;
+ def LCEBR : UnaryRRE<"lceb", 0xB303, null_frag, FP32, FP32>;
+ def LCDBR : UnaryRRE<"lcdb", 0xB313, null_frag, FP64, FP64>;
def LCXBR : UnaryRRE<"lcxb", 0xB343, fneg, FP128, FP128>;
}
+// Generic form, which does not set CC.
+def LCDFR : UnaryRRE<"lcdf", 0xB373, fneg, FP64, FP64>;
+let isCodeGenOnly = 1 in
+ def LCDFR_32 : UnaryRRE<"lcdf", 0xB373, fneg, FP32, FP32>;
// Absolute value (Load Positive).
let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
- def LPEBR : UnaryRRE<"lpeb", 0xB300, fabs, FP32, FP32>;
- def LPDBR : UnaryRRE<"lpdb", 0xB310, fabs, FP64, FP64>;
+ def LPEBR : UnaryRRE<"lpeb", 0xB300, null_frag, FP32, FP32>;
+ def LPDBR : UnaryRRE<"lpdb", 0xB310, null_frag, FP64, FP64>;
def LPXBR : UnaryRRE<"lpxb", 0xB340, fabs, FP128, FP128>;
}
+// Generic form, which does not set CC.
+def LPDFR : UnaryRRE<"lpdf", 0xB370, fabs, FP64, FP64>;
+let isCodeGenOnly = 1 in
+ def LPDFR_32 : UnaryRRE<"lpdf", 0xB370, fabs, FP32, FP32>;
// Negative absolute value (Load Negative).
let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
- def LNEBR : UnaryRRE<"lneb", 0xB301, fnabs, FP32, FP32>;
- def LNDBR : UnaryRRE<"lndb", 0xB311, fnabs, FP64, FP64>;
+ def LNEBR : UnaryRRE<"lneb", 0xB301, null_frag, FP32, FP32>;
+ def LNDBR : UnaryRRE<"lndb", 0xB311, null_frag, FP64, FP64>;
def LNXBR : UnaryRRE<"lnxb", 0xB341, fnabs, FP128, FP128>;
}
+// Generic form, which does not set CC.
+def LNDFR : UnaryRRE<"lndf", 0xB371, fnabs, FP64, FP64>;
+let isCodeGenOnly = 1 in
+ def LNDFR_32 : UnaryRRE<"lndf", 0xB371, fnabs, FP32, FP32>;
// Square root.
def SQEBR : UnaryRRE<"sqeb", 0xB314, fsqrt, FP32, FP32>;
@@ -414,6 +447,6 @@ let Defs = [CC], CCValues = 0xF in {
// Peepholes
//===----------------------------------------------------------------------===//
-def : Pat<(f32 fpimmneg0), (LCEBR (LZER))>;
-def : Pat<(f64 fpimmneg0), (LCDBR (LZDR))>;
+def : Pat<(f32 fpimmneg0), (LCDFR_32 (LZER))>;
+def : Pat<(f64 fpimmneg0), (LCDFR (LZDR))>;
def : Pat<(f128 fpimmneg0), (LCXBR (LZXR))>;
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td
index 71eb9986499b..01f4cdec05cb 100644
--- a/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -2381,6 +2381,7 @@ multiclass StringRRE<string mnemonic, bits<16> opcode,
def "" : InstRRE<opcode, (outs GR64:$R1, GR64:$R2),
(ins GR64:$R1src, GR64:$R2src),
mnemonic#"\t$R1, $R2", []> {
+ let Uses = [R0L];
let Constraints = "$R1 = $R1src, $R2 = $R2src";
let DisableEncoding = "$R1src, $R2src";
}
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 5d4a34f7131c..e6b5fc8e6235 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -69,6 +69,11 @@ void SystemZInstrInfo::splitMove(MachineBasicBlock::iterator MI,
MachineOperand &LowOffsetOp = MI->getOperand(2);
LowOffsetOp.setImm(LowOffsetOp.getImm() + 8);
+ // Clear the kill flags for the base and index registers in the first
+ // instruction.
+ EarlierMI->getOperand(1).setIsKill(false);
+ EarlierMI->getOperand(3).setIsKill(false);
+
// Set the opcodes.
unsigned HighOpcode = getOpcodeForOffset(NewOpcode, HighOffsetOp.getImm());
unsigned LowOpcode = getOpcodeForOffset(NewOpcode, LowOffsetOp.getImm());
@@ -111,7 +116,7 @@ void SystemZInstrInfo::expandRIPseudo(MachineInstr *MI, unsigned LowOpcode,
}
// MI is a three-operand RIE-style pseudo instruction. Replace it with
-// LowOpcode3 if the registers are both low GR32s, otherwise use a move
+// LowOpcodeK if the registers are both low GR32s, otherwise use a move
// followed by HighOpcode or LowOpcode, depending on whether the target
// is a high or low GR32.
void SystemZInstrInfo::expandRIEPseudo(MachineInstr *MI, unsigned LowOpcode,
@@ -129,6 +134,7 @@ void SystemZInstrInfo::expandRIEPseudo(MachineInstr *MI, unsigned LowOpcode,
MI->getOperand(1).isKill());
MI->setDesc(get(DestIsHigh ? HighOpcode : LowOpcode));
MI->getOperand(1).setReg(DestReg);
+ MI->tieOperands(0, 1);
}
}
@@ -486,11 +492,8 @@ SystemZInstrInfo::optimizeCompareInstr(MachineInstr *Compare,
const MachineRegisterInfo *MRI) const {
assert(!SrcReg2 && "Only optimizing constant comparisons so far");
bool IsLogical = (Compare->getDesc().TSFlags & SystemZII::IsLogical) != 0;
- if (Value == 0 &&
- !IsLogical &&
- removeIPMBasedCompare(Compare, SrcReg, MRI, &RI))
- return true;
- return false;
+ return Value == 0 && !IsLogical &&
+ removeIPMBasedCompare(Compare, SrcReg, MRI, &RI);
}
// If Opcode is a move that has a conditional variant, return that variant,
@@ -505,16 +508,13 @@ static unsigned getConditionalMove(unsigned Opcode) {
bool SystemZInstrInfo::isPredicable(MachineInstr *MI) const {
unsigned Opcode = MI->getOpcode();
- if (STI.hasLoadStoreOnCond() &&
- getConditionalMove(Opcode))
- return true;
- return false;
+ return STI.hasLoadStoreOnCond() && getConditionalMove(Opcode);
}
bool SystemZInstrInfo::
isProfitableToIfCvt(MachineBasicBlock &MBB,
unsigned NumCycles, unsigned ExtraPredCycles,
- const BranchProbability &Probability) const {
+ BranchProbability Probability) const {
// For now only convert single instructions.
return NumCycles == 1;
}
@@ -524,7 +524,7 @@ isProfitableToIfCvt(MachineBasicBlock &TMBB,
unsigned NumCyclesT, unsigned ExtraPredCyclesT,
MachineBasicBlock &FMBB,
unsigned NumCyclesF, unsigned ExtraPredCyclesF,
- const BranchProbability &Probability) const {
+ BranchProbability Probability) const {
// For now avoid converting mutually-exclusive cases.
return false;
}
@@ -548,11 +548,10 @@ PredicateInstruction(MachineInstr *MI, ArrayRef<MachineOperand> Pred) const {
return false;
}
-void
-SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const {
+void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
// Split 128-bit GPR moves into two 64-bit moves. This handles ADDR128 too.
if (SystemZ::GR128BitRegClass.contains(DestReg, SrcReg)) {
copyPhysReg(MBB, MBBI, DL, RI.getSubReg(DestReg, SystemZ::subreg_h64),
@@ -590,13 +589,10 @@ SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
.addReg(SrcReg, getKillRegState(KillSrc));
}
-void
-SystemZInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned SrcReg, bool isKill,
- int FrameIdx,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const {
+void SystemZInstrInfo::storeRegToStackSlot(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+ bool isKill, int FrameIdx, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
// Callers may expect a single instruction, so keep 128-bit moves
@@ -604,15 +600,14 @@ SystemZInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
unsigned LoadOpcode, StoreOpcode;
getLoadStoreOpcodes(RC, LoadOpcode, StoreOpcode);
addFrameReference(BuildMI(MBB, MBBI, DL, get(StoreOpcode))
- .addReg(SrcReg, getKillRegState(isKill)), FrameIdx);
+ .addReg(SrcReg, getKillRegState(isKill)),
+ FrameIdx);
}
-void
-SystemZInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned DestReg, int FrameIdx,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const {
+void SystemZInstrInfo::loadRegFromStackSlot(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg,
+ int FrameIdx, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
// Callers may expect a single instruction, so keep 128-bit moves
@@ -681,7 +676,8 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
LiveVariables *LV) const {
MachineInstr *MI = MBBI;
MachineBasicBlock *MBB = MI->getParent();
- MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
unsigned Opcode = MI->getOpcode();
unsigned NumOps = MI->getNumOperands();
@@ -708,14 +704,19 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
}
int ThreeOperandOpcode = SystemZ::getThreeOperandOpcode(Opcode);
if (ThreeOperandOpcode >= 0) {
- MachineInstrBuilder MIB =
- BuildMI(*MBB, MBBI, MI->getDebugLoc(), get(ThreeOperandOpcode))
- .addOperand(Dest);
+ // Create three address instruction without adding the implicit
+ // operands. Those will instead be copied over from the original
+ // instruction by the loop below.
+ MachineInstrBuilder MIB(*MF,
+ MF->CreateMachineInstr(get(ThreeOperandOpcode),
+ MI->getDebugLoc(), /*NoImplicit=*/true));
+ MIB.addOperand(Dest);
// Keep the kill state, but drop the tied flag.
MIB.addReg(Src.getReg(), getKillRegState(Src.isKill()), Src.getSubReg());
// Keep the remaining operands as-is.
for (unsigned I = 2; I < NumOps; ++I)
MIB.addOperand(MI->getOperand(I));
+ MBB->insert(MI, MIB);
return finishConvertToThreeAddress(MI, MIB, LV);
}
}
@@ -1191,6 +1192,12 @@ unsigned SystemZInstrInfo::getLoadAndTest(unsigned Opcode) const {
case SystemZ::LER: return SystemZ::LTEBR;
case SystemZ::LDR: return SystemZ::LTDBR;
case SystemZ::LXR: return SystemZ::LTXBR;
+ case SystemZ::LCDFR: return SystemZ::LCDBR;
+ case SystemZ::LPDFR: return SystemZ::LPDBR;
+ case SystemZ::LNDFR: return SystemZ::LNDBR;
+ case SystemZ::LCDFR_32: return SystemZ::LCEBR;
+ case SystemZ::LPDFR_32: return SystemZ::LPEBR;
+ case SystemZ::LNDFR_32: return SystemZ::LNEBR;
// On zEC12 we prefer to use RISBGN. But if there is a chance to
// actually use the condition code, we may turn it back into RISGB.
// Note that RISBG is not really a "load-and-test" instruction,
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h
index 31c9db209585..d9094ba93658 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -159,12 +159,12 @@ public:
bool isPredicable(MachineInstr *MI) const override;
bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
unsigned ExtraPredCycles,
- const BranchProbability &Probability) const override;
+ BranchProbability Probability) const override;
bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
unsigned NumCyclesT, unsigned ExtraPredCyclesT,
MachineBasicBlock &FMBB,
unsigned NumCyclesF, unsigned ExtraPredCyclesF,
- const BranchProbability &Probability) const override;
+ BranchProbability Probability) const override;
bool PredicateInstruction(MachineInstr *MI,
ArrayRef<MachineOperand> Pred) const override;
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
index 820f30bc173d..b9f2eb5514a5 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -397,7 +397,7 @@ let mayLoad = 1, mayStore = 1 in
defm MVC : MemorySS<"mvc", 0xD2, z_mvc, z_mvc_loop>;
// String moves.
-let mayLoad = 1, mayStore = 1, Defs = [CC], Uses = [R0L] in
+let mayLoad = 1, mayStore = 1, Defs = [CC] in
defm MVST : StringRRE<"mvst", 0xB255, z_stpcpy>;
//===----------------------------------------------------------------------===//
@@ -424,7 +424,7 @@ let hasSideEffects = 0 in {
def LGFR : UnaryRRE<"lgf", 0xB914, sext32, GR64, GR32>;
}
let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in
- def LTGFR : UnaryRRE<"ltgf", 0xB912, null_frag, GR64, GR64>;
+ def LTGFR : UnaryRRE<"ltgf", 0xB912, null_frag, GR64, GR32>;
// Match 32-to-64-bit sign extensions in which the source is already
// in a 64-bit register.
@@ -490,7 +490,7 @@ def : Pat<(and GR64:$src, 0xffffffff),
def LLCMux : UnaryRXYPseudo<"llc", azextloadi8, GRX32, 1>,
Requires<[FeatureHighWord]>;
def LLC : UnaryRXY<"llc", 0xE394, azextloadi8, GR32, 1>;
-def LLCH : UnaryRXY<"llch", 0xE3C2, azextloadi8, GR32, 1>,
+def LLCH : UnaryRXY<"llch", 0xE3C2, azextloadi8, GRH32, 1>,
Requires<[FeatureHighWord]>;
// 32-bit extensions from 16-bit memory. LLHMux expands to LLH or LLHH,
@@ -498,7 +498,7 @@ def LLCH : UnaryRXY<"llch", 0xE3C2, azextloadi8, GR32, 1>,
def LLHMux : UnaryRXYPseudo<"llh", azextloadi16, GRX32, 2>,
Requires<[FeatureHighWord]>;
def LLH : UnaryRXY<"llh", 0xE395, azextloadi16, GR32, 2>;
-def LLHH : UnaryRXY<"llhh", 0xE3C6, azextloadi16, GR32, 2>,
+def LLHH : UnaryRXY<"llhh", 0xE3C6, azextloadi16, GRH32, 2>,
Requires<[FeatureHighWord]>;
def LLHRL : UnaryRILPC<"llhrl", 0xC42, aligned_azextloadi16, GR32>;
@@ -1147,7 +1147,7 @@ let Defs = [CC], CCValues = 0xE, IsLogical = 1 in {
def CLFIMux : CompareRIPseudo<z_ucmp, GRX32, uimm32>,
Requires<[FeatureHighWord]>;
def CLFI : CompareRIL<"clfi", 0xC2F, z_ucmp, GR32, uimm32>;
- def CLIH : CompareRIL<"clih", 0xCCF, z_ucmp, GR32, uimm32>,
+ def CLIH : CompareRIL<"clih", 0xCCF, z_ucmp, GRH32, uimm32>,
Requires<[FeatureHighWord]>;
def CLGFI : CompareRIL<"clgfi", 0xC2E, z_ucmp, GR64, imm64zx32>;
@@ -1185,7 +1185,7 @@ let mayLoad = 1, Defs = [CC] in
defm CLC : MemorySS<"clc", 0xD5, z_clc, z_clc_loop>;
// String comparison.
-let mayLoad = 1, Defs = [CC], Uses = [R0L] in
+let mayLoad = 1, Defs = [CC] in
defm CLST : StringRRE<"clst", 0xB25D, z_strcmp>;
// Test under mask.
@@ -1459,9 +1459,29 @@ let usesCustomInserter = 1 in {
}
// Search a block of memory for a character.
-let mayLoad = 1, Defs = [CC], Uses = [R0L] in
+let mayLoad = 1, Defs = [CC] in
defm SRST : StringRRE<"srst", 0xb25e, z_search_string>;
+// Other instructions for inline assembly
+let hasSideEffects = 1, Defs = [CC], mayStore = 1 in
+ def STCK : InstS<0xB205, (outs), (ins bdaddr12only:$BD2),
+ "stck\t$BD2",
+ []>;
+let hasSideEffects = 1, Defs = [CC], mayStore = 1 in
+ def STCKF : InstS<0xB27C, (outs), (ins bdaddr12only:$BD2),
+ "stckf\t$BD2",
+ []>;
+let hasSideEffects = 1, Defs = [CC], mayStore = 1 in
+ def STCKE : InstS<0xB278, (outs), (ins bdaddr12only:$BD2),
+ "stcke\t$BD2",
+ []>;
+let hasSideEffects = 1, Defs = [CC], mayStore = 1 in
+ def STFLE : InstS<0xB2B0, (outs), (ins bdaddr12only:$BD2),
+ "stfle\t$BD2",
+ []>;
+
+
+
//===----------------------------------------------------------------------===//
// Peepholes.
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp b/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp
index 00572d0b9d79..1a7c0d7f687a 100644
--- a/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp
+++ b/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp
@@ -1,4 +1,4 @@
-//== SystemZMachineFuctionInfo.cpp - SystemZ machine function info-*- C++ -*-=//
+//=== SystemZMachineFunctionInfo.cpp - SystemZ machine function info ------===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Target/SystemZ/SystemZMachineFunctionInfo.h b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
index 34fc36d6bf6c..f4a517bd54df 100644
--- a/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
+++ b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
@@ -1,4 +1,4 @@
-//==- SystemZMachineFuctionInfo.h - SystemZ machine function info -*- C++ -*-=//
+//=== SystemZMachineFunctionInfo.h - SystemZ machine function info -*- C++ -*-//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index dc7bd25d7ed5..6fd24e3df625 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -69,8 +69,8 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// Decompose the frame index into a base and offset.
int FrameIndex = MI->getOperand(FIOperandNum).getIndex();
- unsigned BasePtr = getFrameRegister(MF);
- int64_t Offset = (TFI->getFrameIndexOffset(MF, FrameIndex) +
+ unsigned BasePtr;
+ int64_t Offset = (TFI->getFrameIndexReference(MF, FrameIndex, BasePtr) +
MI->getOperand(FIOperandNum + 1).getImm());
// Special handling of dbg_value instructions.
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td
index 85aa0a62cc76..0d8b08b9cbdd 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -282,4 +282,5 @@ def v128any : TypedReg<untyped, VR128>;
// The 2-bit condition code field of the PSW. Every register named in an
// inline asm needs a class associated with it.
def CC : SystemZReg<"cc">;
-def CCRegs : RegisterClass<"SystemZ", [i32], 32, (add CC)>;
+let isAllocatable = 0 in
+ def CCRegs : RegisterClass<"SystemZ", [i32], 32, (add CC)>;
diff --git a/lib/Target/SystemZ/SystemZShortenInst.cpp b/lib/Target/SystemZ/SystemZShortenInst.cpp
index d1a17c5500d6..846edd51341a 100644
--- a/lib/Target/SystemZ/SystemZShortenInst.cpp
+++ b/lib/Target/SystemZ/SystemZShortenInst.cpp
@@ -16,6 +16,8 @@
#include "SystemZTargetMachine.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;
@@ -35,19 +37,16 @@ public:
bool runOnMachineFunction(MachineFunction &F) override;
private:
- bool shortenIIF(MachineInstr &MI, unsigned *GPRMap, unsigned LiveOther,
- unsigned LLIxL, unsigned LLIxH);
+ bool shortenIIF(MachineInstr &MI, unsigned LLIxL, unsigned LLIxH);
bool shortenOn0(MachineInstr &MI, unsigned Opcode);
bool shortenOn01(MachineInstr &MI, unsigned Opcode);
bool shortenOn001(MachineInstr &MI, unsigned Opcode);
+ bool shortenOn001AddCC(MachineInstr &MI, unsigned Opcode);
bool shortenFPConv(MachineInstr &MI, unsigned Opcode);
const SystemZInstrInfo *TII;
-
- // LowGPRs[I] has bit N set if LLVM register I includes the low
- // word of GPR N. HighGPRs is the same for the high word.
- unsigned LowGPRs[SystemZ::NUM_TARGET_REGS];
- unsigned HighGPRs[SystemZ::NUM_TARGET_REGS];
+ const TargetRegisterInfo *TRI;
+ LivePhysRegs LiveRegs;
};
char SystemZShortenInst::ID = 0;
@@ -58,33 +57,31 @@ FunctionPass *llvm::createSystemZShortenInstPass(SystemZTargetMachine &TM) {
}
SystemZShortenInst::SystemZShortenInst(const SystemZTargetMachine &tm)
- : MachineFunctionPass(ID), TII(nullptr), LowGPRs(), HighGPRs() {
- // Set up LowGPRs and HighGPRs.
- for (unsigned I = 0; I < 16; ++I) {
- LowGPRs[SystemZMC::GR32Regs[I]] |= 1 << I;
- LowGPRs[SystemZMC::GR64Regs[I]] |= 1 << I;
- HighGPRs[SystemZMC::GRH32Regs[I]] |= 1 << I;
- HighGPRs[SystemZMC::GR64Regs[I]] |= 1 << I;
- if (unsigned GR128 = SystemZMC::GR128Regs[I]) {
- LowGPRs[GR128] |= 3 << I;
- HighGPRs[GR128] |= 3 << I;
- }
- }
+ : MachineFunctionPass(ID), TII(nullptr) {}
+
+// Tie operands if MI has become a two-address instruction.
+static void tieOpsIfNeeded(MachineInstr &MI) {
+ if (MI.getDesc().getOperandConstraint(0, MCOI::TIED_TO) &&
+ !MI.getOperand(0).isTied())
+ MI.tieOperands(0, 1);
}
// MI loads one word of a GPR using an IIxF instruction and LLIxL and LLIxH
// are the halfword immediate loads for the same word. Try to use one of them
-// instead of IIxF. If MI loads the high word, GPRMap[X] is the set of high
-// words referenced by LLVM register X while LiveOther is the mask of low
-// words that are currently live, and vice versa.
-bool SystemZShortenInst::shortenIIF(MachineInstr &MI, unsigned *GPRMap,
- unsigned LiveOther, unsigned LLIxL,
- unsigned LLIxH) {
+// instead of IIxF.
+bool SystemZShortenInst::shortenIIF(MachineInstr &MI,
+ unsigned LLIxL, unsigned LLIxH) {
unsigned Reg = MI.getOperand(0).getReg();
- assert(Reg < SystemZ::NUM_TARGET_REGS && "Invalid register number");
- unsigned GPRs = GPRMap[Reg];
- assert(GPRs != 0 && "Register must be a GPR");
- if (GPRs & LiveOther)
+ // The new opcode will clear the other half of the GR64 reg, so
+ // cancel if that is live.
+ unsigned thisSubRegIdx = (SystemZ::GRH32BitRegClass.contains(Reg) ?
+ SystemZ::subreg_h32 : SystemZ::subreg_l32);
+ unsigned otherSubRegIdx = (thisSubRegIdx == SystemZ::subreg_l32 ?
+ SystemZ::subreg_h32 : SystemZ::subreg_l32);
+ unsigned GR64BitReg = TRI->getMatchingSuperReg(Reg, thisSubRegIdx,
+ &SystemZ::GR64BitRegClass);
+ unsigned OtherReg = TRI->getSubReg(GR64BitReg, otherSubRegIdx);
+ if (LiveRegs.contains(OtherReg))
return false;
uint64_t Imm = MI.getOperand(1).getImm();
@@ -123,12 +120,26 @@ bool SystemZShortenInst::shortenOn01(MachineInstr &MI, unsigned Opcode) {
}
// Change MI's opcode to Opcode if register operands 0, 1 and 2 have a
-// 4-bit encoding and if operands 0 and 1 are tied.
+// 4-bit encoding and if operands 0 and 1 are tied. Also ties op 0
+// with op 1, if MI becomes 2-address.
bool SystemZShortenInst::shortenOn001(MachineInstr &MI, unsigned Opcode) {
if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) < 16 &&
MI.getOperand(1).getReg() == MI.getOperand(0).getReg() &&
SystemZMC::getFirstReg(MI.getOperand(2).getReg()) < 16) {
MI.setDesc(TII->get(Opcode));
+ tieOpsIfNeeded(MI);
+ return true;
+ }
+ return false;
+}
+
+// Calls shortenOn001 if CC is not live. A CC def operand is added in
+// case of success.
+bool SystemZShortenInst::shortenOn001AddCC(MachineInstr &MI,
+ unsigned Opcode) {
+ if (!LiveRegs.contains(SystemZ::CC) && shortenOn001(MI, Opcode)) {
+ MachineInstrBuilder(*MI.getParent()->getParent(), &MI)
+ .addReg(SystemZ::CC, RegState::ImplicitDefine);
return true;
}
return false;
@@ -164,35 +175,24 @@ bool SystemZShortenInst::shortenFPConv(MachineInstr &MI, unsigned Opcode) {
bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) {
bool Changed = false;
- // Work out which words are live on exit from the block.
- unsigned LiveLow = 0;
- unsigned LiveHigh = 0;
- for (auto SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI) {
- for (auto LI = (*SI)->livein_begin(), LE = (*SI)->livein_end();
- LI != LE; ++LI) {
- unsigned Reg = *LI;
- assert(Reg < SystemZ::NUM_TARGET_REGS && "Invalid register number");
- LiveLow |= LowGPRs[Reg];
- LiveHigh |= HighGPRs[Reg];
- }
- }
+ // Set up the set of live registers at the end of MBB (live out)
+ LiveRegs.clear();
+ LiveRegs.addLiveOuts(&MBB);
// Iterate backwards through the block looking for instructions to change.
for (auto MBBI = MBB.rbegin(), MBBE = MBB.rend(); MBBI != MBBE; ++MBBI) {
MachineInstr &MI = *MBBI;
switch (MI.getOpcode()) {
case SystemZ::IILF:
- Changed |= shortenIIF(MI, LowGPRs, LiveHigh, SystemZ::LLILL,
- SystemZ::LLILH);
+ Changed |= shortenIIF(MI, SystemZ::LLILL, SystemZ::LLILH);
break;
case SystemZ::IIHF:
- Changed |= shortenIIF(MI, HighGPRs, LiveLow, SystemZ::LLIHL,
- SystemZ::LLIHH);
+ Changed |= shortenIIF(MI, SystemZ::LLIHL, SystemZ::LLIHH);
break;
case SystemZ::WFADB:
- Changed |= shortenOn001(MI, SystemZ::ADBR);
+ Changed |= shortenOn001AddCC(MI, SystemZ::ADBR);
break;
case SystemZ::WFDDB:
@@ -216,15 +216,15 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) {
break;
case SystemZ::WFLCDB:
- Changed |= shortenOn01(MI, SystemZ::LCDBR);
+ Changed |= shortenOn01(MI, SystemZ::LCDFR);
break;
case SystemZ::WFLNDB:
- Changed |= shortenOn01(MI, SystemZ::LNDBR);
+ Changed |= shortenOn01(MI, SystemZ::LNDFR);
break;
case SystemZ::WFLPDB:
- Changed |= shortenOn01(MI, SystemZ::LPDBR);
+ Changed |= shortenOn01(MI, SystemZ::LPDFR);
break;
case SystemZ::WFSQDB:
@@ -232,7 +232,7 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) {
break;
case SystemZ::WFSDB:
- Changed |= shortenOn001(MI, SystemZ::SDBR);
+ Changed |= shortenOn001AddCC(MI, SystemZ::SDBR);
break;
case SystemZ::WFCDB:
@@ -257,33 +257,17 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) {
break;
}
- unsigned UsedLow = 0;
- unsigned UsedHigh = 0;
- for (auto MOI = MI.operands_begin(), MOE = MI.operands_end();
- MOI != MOE; ++MOI) {
- MachineOperand &MO = *MOI;
- if (MO.isReg()) {
- if (unsigned Reg = MO.getReg()) {
- assert(Reg < SystemZ::NUM_TARGET_REGS && "Invalid register number");
- if (MO.isDef()) {
- LiveLow &= ~LowGPRs[Reg];
- LiveHigh &= ~HighGPRs[Reg];
- } else if (!MO.isUndef()) {
- UsedLow |= LowGPRs[Reg];
- UsedHigh |= HighGPRs[Reg];
- }
- }
- }
- }
- LiveLow |= UsedLow;
- LiveHigh |= UsedHigh;
+ LiveRegs.stepBackward(MI);
}
return Changed;
}
bool SystemZShortenInst::runOnMachineFunction(MachineFunction &F) {
- TII = static_cast<const SystemZInstrInfo *>(F.getSubtarget().getInstrInfo());
+ const SystemZSubtarget &ST = F.getSubtarget<SystemZSubtarget>();
+ TII = ST.getInstrInfo();
+ TRI = ST.getRegisterInfo();
+ LiveRegs.init(TRI);
bool Changed = false;
for (auto &MBB : F)
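The rewritten pass tracks liveness with LivePhysRegs instead of the hand-rolled GPR bitmasks. A minimal sketch of the backward-scan pattern it uses, assuming the pass members TRI and LiveRegs declared above:

  // Per-function setup (runOnMachineFunction): bind the register info once.
  LiveRegs.init(TRI);

  // Per-block scan (processBlock): seed with the block's live-out set, then
  // walk the instructions bottom-up, updating liveness after each one.
  LiveRegs.clear();
  LiveRegs.addLiveOuts(&MBB);
  for (auto MBBI = MBB.rbegin(), MBBE = MBB.rend(); MBBI != MBBE; ++MBBI) {
    MachineInstr &MI = *MBBI;
    if (LiveRegs.contains(SystemZ::CC)) {
      // CC is live at this point, so rewrites that clobber CC must be skipped.
    }
    LiveRegs.stepBackward(MI); // account for MI's defs and uses
  }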
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 00cbbd10a819..f305e85f6cfe 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -16,6 +16,7 @@
using namespace llvm;
+extern cl::opt<bool> MISchedPostRA;
extern "C" void LLVMInitializeSystemZTarget() {
// Register the target.
RegisterTargetMachine<SystemZTargetMachine> X(TheSystemZTarget);
@@ -32,7 +33,7 @@ static bool UsesVectorABI(StringRef CPU, StringRef FS) {
VectorABI = false;
SmallVector<StringRef, 3> Features;
- FS.split(Features, ",", -1, false /* KeepEmpty */);
+ FS.split(Features, ',', -1, false /* KeepEmpty */);
for (auto &Feature : Features) {
if (Feature == "vector" || Feature == "+vector")
VectorABI = true;
@@ -130,6 +131,13 @@ void SystemZPassConfig::addPreSched2() {
}
void SystemZPassConfig::addPreEmitPass() {
+
+ // Do instruction shortening before compare elimination because some
+ // vector instructions will be shortened into opcodes that compare
+ // elimination recognizes.
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createSystemZShortenInstPass(getSystemZTargetMachine()), false);
+
// We eliminate comparisons here rather than earlier because some
// transformations can change the set of available CC values and we
// generally want those transformations to have priority. This is
@@ -155,9 +163,17 @@ void SystemZPassConfig::addPreEmitPass() {
// preventing that would be a win or not.
if (getOptLevel() != CodeGenOpt::None)
addPass(createSystemZElimComparePass(getSystemZTargetMachine()), false);
- if (getOptLevel() != CodeGenOpt::None)
- addPass(createSystemZShortenInstPass(getSystemZTargetMachine()), false);
addPass(createSystemZLongBranchPass(getSystemZTargetMachine()));
+
+ // Do final scheduling after all other optimizations, to get an
+ // optimal input for the decoder (branch relaxation must happen
+ // after block placement).
+ if (getOptLevel() != CodeGenOpt::None) {
+ if (MISchedPostRA)
+ addPass(&PostMachineSchedulerID);
+ else
+ addPass(&PostRASchedulerID);
+ }
}
TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) {
@@ -165,7 +181,7 @@ TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) {
}
TargetIRAnalysis SystemZTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](Function &F) {
+ return TargetIRAnalysis([this](const Function &F) {
return TargetTransformInfo(SystemZTTIImpl(this, F));
});
}
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h
index 0a81e1f9fdf9..1a8f1f7f3aaa 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.h
+++ b/lib/Target/SystemZ/SystemZTargetMachine.h
@@ -43,6 +43,9 @@ public:
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
}
+
+ bool targetSchedulesPostRAScheduling() const override { return true; };
+
};
} // end namespace llvm
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 5a87df1976c3..5ff5b21f49b8 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -31,7 +31,7 @@ using namespace llvm;
//
//===----------------------------------------------------------------------===//
-unsigned SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+int SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -63,8 +63,8 @@ unsigned SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
return 4 * TTI::TCC_Basic;
}
-unsigned SystemZTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+int SystemZTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -181,8 +181,8 @@ unsigned SystemZTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
return SystemZTTIImpl::getIntImmCost(Imm, Ty);
}
-unsigned SystemZTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+int SystemZTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+ const APInt &Imm, Type *Ty) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 4b80973ed879..9ae736d8413a 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -28,7 +28,7 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
const SystemZTargetLowering *getTLI() const { return TLI; }
public:
- explicit SystemZTTIImpl(const SystemZTargetMachine *TM, Function &F)
+ explicit SystemZTTIImpl(const SystemZTargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
@@ -42,12 +42,11 @@ public:
/// \name Scalar TTI Implementations
/// @{
- unsigned getIntImmCost(const APInt &Imm, Type *Ty);
+ int getIntImmCost(const APInt &Imm, Type *Ty);
- unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty);
- unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
- Type *Ty);
+ int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+ int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+ Type *Ty);
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp
index 19b5e2a0f978..a0b0d8f24046 100644
--- a/lib/Target/TargetLoweringObjectFile.cpp
+++ b/lib/Target/TargetLoweringObjectFile.cpp
@@ -43,7 +43,6 @@ using namespace llvm;
void TargetLoweringObjectFile::Initialize(MCContext &ctx,
const TargetMachine &TM) {
Ctx = &ctx;
- DL = TM.getDataLayout();
InitMCObjectFileInfo(TM.getTargetTriple(), TM.getRelocationModel(),
TM.getCodeModel(), *Ctx);
}
@@ -107,7 +106,7 @@ MCSymbol *TargetLoweringObjectFile::getSymbolWithGlobalValueBase(
assert(!Suffix.empty());
SmallString<60> NameStr;
- NameStr += DL->getPrivateGlobalPrefix();
+ NameStr += GV->getParent()->getDataLayout().getPrivateGlobalPrefix();
TM.getNameWithPrefix(NameStr, GV, Mang);
NameStr.append(Suffix.begin(), Suffix.end());
return Ctx->getOrCreateSymbol(NameStr);
@@ -120,7 +119,7 @@ MCSymbol *TargetLoweringObjectFile::getCFIPersonalitySymbol(
}
void TargetLoweringObjectFile::emitPersonalityValue(MCStreamer &Streamer,
- const TargetMachine &TM,
+ const DataLayout &,
const MCSymbol *Sym) const {
}
@@ -170,14 +169,13 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV,
// If the initializer for the global contains something that requires a
// relocation, then we may have to drop this into a writable data section
// even though it is marked const.
- switch (C->getRelocationInfo()) {
- case Constant::NoRelocation:
+ if (!C->needsRelocation()) {
// If the global is required to have a unique address, it can't be put
// into a mergable section: just drop it into the general read-only
// section instead.
if (!GVar->hasUnnamedAddr())
return SectionKind::getReadOnly();
-
+
// If initializer is a null-terminated string, put it in a "cstring"
// section of the right width.
if (ArrayType *ATy = dyn_cast<ArrayType>(C->getType())) {
@@ -200,7 +198,7 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV,
// Otherwise, just drop it into a mergable constant section. If we have
// a section for this size, use it, otherwise use the arbitrary sized
// mergable section.
- switch (TM.getDataLayout()->getTypeAllocSize(C->getType())) {
+ switch (GV->getParent()->getDataLayout().getTypeAllocSize(C->getType())) {
case 4: return SectionKind::getMergeableConst4();
case 8: return SectionKind::getMergeableConst8();
case 16: return SectionKind::getMergeableConst16();
@@ -208,20 +206,7 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV,
return SectionKind::getReadOnly();
}
- case Constant::LocalRelocation:
- // In static relocation model, the linker will resolve all addresses, so
- // the relocation entries will actually be constants by the time the app
- // starts up. However, we can't put this into a mergable section, because
- // the linker doesn't take relocations into consideration when it tries to
- // merge entries in the section.
- if (ReloModel == Reloc::Static)
- return SectionKind::getReadOnly();
-
- // Otherwise, the dynamic linker needs to fix it up, put it in the
- // writable data.rel.local section.
- return SectionKind::getReadOnlyWithRelLocal();
-
- case Constant::GlobalRelocations:
+ } else {
// In static relocation model, the linker will resolve all addresses, so
// the relocation entries will actually be constants by the time the app
// starts up. However, we can't put this into a mergable section, because
@@ -242,17 +227,11 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV,
// globals together onto fewer pages, improving the locality of the dynamic
// linker.
if (ReloModel == Reloc::Static)
- return SectionKind::getDataNoRel();
-
- switch (C->getRelocationInfo()) {
- case Constant::NoRelocation:
- return SectionKind::getDataNoRel();
- case Constant::LocalRelocation:
- return SectionKind::getDataRelLocal();
- case Constant::GlobalRelocations:
- return SectionKind::getDataRel();
- }
- llvm_unreachable("Invalid relocation");
+ return SectionKind::getData();
+
+ if (C->needsRelocation())
+ return SectionKind::getData();
+ return SectionKind::getData();
}
/// This method computes the appropriate section to emit the specified global
@@ -273,7 +252,8 @@ TargetLoweringObjectFile::SectionForGlobal(const GlobalValue *GV,
MCSection *TargetLoweringObjectFile::getSectionForJumpTable(
const Function &F, Mangler &Mang, const TargetMachine &TM) const {
- return getSectionForConstant(SectionKind::getReadOnly(), /*C=*/nullptr);
+ return getSectionForConstant(F.getParent()->getDataLayout(),
+ SectionKind::getReadOnly(), /*C=*/nullptr);
}
bool TargetLoweringObjectFile::shouldPutJumpTableInFunctionSection(
@@ -296,9 +276,8 @@ bool TargetLoweringObjectFile::shouldPutJumpTableInFunctionSection(
/// Given a mergable constant with the specified size and relocation
/// information, return a section that it should be placed in.
-MCSection *
-TargetLoweringObjectFile::getSectionForConstant(SectionKind Kind,
- const Constant *C) const {
+MCSection *TargetLoweringObjectFile::getSectionForConstant(
+ const DataLayout &DL, SectionKind Kind, const Constant *C) const {
if (Kind.isReadOnly() && ReadOnlySection != nullptr)
return ReadOnlySection;
@@ -345,7 +324,7 @@ const MCExpr *TargetLoweringObjectFile::getDebugThreadLocalSymbol(const MCSymbol
}
void TargetLoweringObjectFile::getNameWithPrefix(
- SmallVectorImpl<char> &OutName, const GlobalValue *GV,
- bool CannotUsePrivateLabel, Mangler &Mang, const TargetMachine &TM) const {
- Mang.getNameWithPrefix(OutName, GV, CannotUsePrivateLabel);
+ SmallVectorImpl<char> &OutName, const GlobalValue *GV, Mangler &Mang,
+ const TargetMachine &TM) const {
+ Mang.getNameWithPrefix(OutName, GV, /*CannotUsePrivateLabel=*/false);
}
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index 83174c20c8e9..850c93cb21b8 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -150,24 +150,11 @@ void TargetMachine::setOptLevel(CodeGenOpt::Level Level) const {
}
TargetIRAnalysis TargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](Function &F) {
+ return TargetIRAnalysis([this](const Function &F) {
return TargetTransformInfo(F.getParent()->getDataLayout());
});
}
-static bool canUsePrivateLabel(const MCAsmInfo &AsmInfo,
- const MCSection &Section) {
- if (!AsmInfo.isSectionAtomizableBySymbols(Section))
- return true;
-
- // If it is not dead stripped, it is safe to use private labels.
- const MCSectionMachO &SMO = cast<MCSectionMachO>(Section);
- if (SMO.hasAttribute(MachO::S_ATTR_NO_DEAD_STRIP))
- return true;
-
- return false;
-}
-
void TargetMachine::getNameWithPrefix(SmallVectorImpl<char> &Name,
const GlobalValue *GV, Mangler &Mang,
bool MayAlwaysUsePrivate) const {
@@ -177,11 +164,8 @@ void TargetMachine::getNameWithPrefix(SmallVectorImpl<char> &Name,
Mang.getNameWithPrefix(Name, GV, false);
return;
}
- SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, *this);
const TargetLoweringObjectFile *TLOF = getObjFileLowering();
- const MCSection *TheSection = TLOF->SectionForGlobal(GV, GVKind, Mang, *this);
- bool CannotUsePrivateLabel = !canUsePrivateLabel(*AsmInfo, *TheSection);
- TLOF->getNameWithPrefix(Name, GV, CannotUsePrivateLabel, Mang, *this);
+ TLOF->getNameWithPrefix(Name, GV, Mang, *this);
}
MCSymbol *TargetMachine::getSymbol(const GlobalValue *GV, Mangler &Mang) const {
diff --git a/lib/Target/TargetMachineC.cpp b/lib/Target/TargetMachineC.cpp
index 719923558de4..f82566c37baa 100644
--- a/lib/Target/TargetMachineC.cpp
+++ b/lib/Target/TargetMachineC.cpp
@@ -32,17 +32,25 @@
using namespace llvm;
-inline TargetMachine *unwrap(LLVMTargetMachineRef P) {
- return reinterpret_cast<TargetMachine*>(P);
+namespace llvm {
+// Friend to the TargetMachine, access legacy API that are made private in C++
+struct C_API_PRIVATE_ACCESS {
+ static const DataLayout &getDataLayout(const TargetMachine &T) {
+ return T.getDataLayout();
+ }
+};
+}
+
+static TargetMachine *unwrap(LLVMTargetMachineRef P) {
+ return reinterpret_cast<TargetMachine *>(P);
}
-inline Target *unwrap(LLVMTargetRef P) {
+static Target *unwrap(LLVMTargetRef P) {
return reinterpret_cast<Target*>(P);
}
-inline LLVMTargetMachineRef wrap(const TargetMachine *P) {
- return
- reinterpret_cast<LLVMTargetMachineRef>(const_cast<TargetMachine*>(P));
+static LLVMTargetMachineRef wrap(const TargetMachine *P) {
+ return reinterpret_cast<LLVMTargetMachineRef>(const_cast<TargetMachine *>(P));
}
-inline LLVMTargetRef wrap(const Target * P) {
+static LLVMTargetRef wrap(const Target * P) {
return reinterpret_cast<LLVMTargetRef>(const_cast<Target*>(P));
}
@@ -69,16 +77,16 @@ LLVMTargetRef LLVMGetTargetFromName(const char *Name) {
LLVMBool LLVMGetTargetFromTriple(const char* TripleStr, LLVMTargetRef *T,
char **ErrorMessage) {
std::string Error;
-
+
*T = wrap(TargetRegistry::lookupTarget(TripleStr, Error));
-
+
if (!*T) {
if (ErrorMessage)
*ErrorMessage = strdup(Error.c_str());
return 1;
}
-
+
return 0;
}
@@ -145,10 +153,7 @@ LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T,
CM, OL));
}
-
-void LLVMDisposeTargetMachine(LLVMTargetMachineRef T) {
- delete unwrap(T);
-}
+void LLVMDisposeTargetMachine(LLVMTargetMachineRef T) { delete unwrap(T); }
LLVMTargetRef LLVMGetTargetMachineTarget(LLVMTargetMachineRef T) {
const Target* target = &(unwrap(T)->getTarget());
@@ -170,8 +175,9 @@ char* LLVMGetTargetMachineFeatureString(LLVMTargetMachineRef T) {
return strdup(StringRep.c_str());
}
+/** Deprecated: use LLVMGetDataLayout(LLVMModuleRef M) instead. */
LLVMTargetDataRef LLVMGetTargetMachineData(LLVMTargetMachineRef T) {
- return wrap(unwrap(T)->getDataLayout());
+ return wrap(&C_API_PRIVATE_ACCESS::getDataLayout(*unwrap(T)));
}
void LLVMSetTargetMachineAsmVerbosity(LLVMTargetMachineRef T,
@@ -190,14 +196,7 @@ static LLVMBool LLVMTargetMachineEmit(LLVMTargetMachineRef T, LLVMModuleRef M,
std::string error;
- const DataLayout *td = TM->getDataLayout();
-
- if (!td) {
- error = "No DataLayout in TargetMachine";
- *ErrorMessage = strdup(error.c_str());
- return true;
- }
- Mod->setDataLayout(*td);
+ Mod->setDataLayout(TM->createDataLayout());
TargetMachine::CodeGenFileType ft;
switch (codegen) {
@@ -239,7 +238,6 @@ LLVMBool LLVMTargetMachineEmitToMemoryBuffer(LLVMTargetMachineRef T,
SmallString<0> CodeString;
raw_svector_ostream OStream(CodeString);
bool Result = LLVMTargetMachineEmit(T, M, OStream, codegen, ErrorMessage);
- OStream.flush();
StringRef Data = OStream.str();
*OutMemBuf =
diff --git a/lib/Target/TargetRecip.cpp b/lib/Target/TargetRecip.cpp
index 42bc487fe6d8..d41b6436928b 100644
--- a/lib/Target/TargetRecip.cpp
+++ b/lib/Target/TargetRecip.cpp
@@ -26,7 +26,7 @@ using namespace llvm;
// the key strings for queries and command-line inputs.
// In addition, the command-line interface recognizes the global parameters
// "all", "none", and "default".
-static const char *RecipOps[] = {
+static const char *const RecipOps[] = {
"divd",
"divf",
"vec-divd",
@@ -46,7 +46,7 @@ TargetRecip::TargetRecip() {
RecipMap.insert(std::make_pair(RecipOps[i], RecipParams()));
}
-static bool parseRefinementStep(const StringRef &In, size_t &Position,
+static bool parseRefinementStep(StringRef In, size_t &Position,
uint8_t &Value) {
const char RefStepToken = ':';
Position = In.find(RefStepToken);
@@ -175,7 +175,7 @@ TargetRecip::TargetRecip(const std::vector<std::string> &Args) :
parseIndividualParams(Args);
}
-bool TargetRecip::isEnabled(const StringRef &Key) const {
+bool TargetRecip::isEnabled(StringRef Key) const {
ConstRecipIter Iter = RecipMap.find(Key);
assert(Iter != RecipMap.end() && "Unknown name for reciprocal map");
assert(Iter->second.Enabled != Uninitialized &&
@@ -183,7 +183,7 @@ bool TargetRecip::isEnabled(const StringRef &Key) const {
return Iter->second.Enabled;
}
-unsigned TargetRecip::getRefinementSteps(const StringRef &Key) const {
+unsigned TargetRecip::getRefinementSteps(StringRef Key) const {
ConstRecipIter Iter = RecipMap.find(Key);
assert(Iter != RecipMap.end() && "Unknown name for reciprocal map");
assert(Iter->second.RefinementSteps != Uninitialized &&
@@ -192,7 +192,7 @@ unsigned TargetRecip::getRefinementSteps(const StringRef &Key) const {
}
/// Custom settings (previously initialized values) override target defaults.
-void TargetRecip::setDefaults(const StringRef &Key, bool Enable,
+void TargetRecip::setDefaults(StringRef Key, bool Enable,
unsigned RefSteps) {
if (Key == "all") {
for (auto &KV : RecipMap) {
@@ -213,7 +213,7 @@ void TargetRecip::setDefaults(const StringRef &Key, bool Enable,
bool TargetRecip::operator==(const TargetRecip &Other) const {
for (const auto &KV : RecipMap) {
- const StringRef &Op = KV.first;
+ StringRef Op = KV.first;
const RecipParams &RP = KV.second;
const RecipParams &OtherRP = Other.RecipMap.find(Op)->second;
if (RP.RefinementSteps != OtherRP.RefinementSteps)
diff --git a/lib/Target/WebAssembly/CMakeLists.txt b/lib/Target/WebAssembly/CMakeLists.txt
index 25de9eee0831..284a7d91bc62 100644
--- a/lib/Target/WebAssembly/CMakeLists.txt
+++ b/lib/Target/WebAssembly/CMakeLists.txt
@@ -1,20 +1,39 @@
set(LLVM_TARGET_DEFINITIONS WebAssembly.td)
+tablegen(LLVM WebAssemblyGenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM WebAssemblyGenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM WebAssemblyGenFastISel.inc -gen-fast-isel)
+tablegen(LLVM WebAssemblyGenInstrInfo.inc -gen-instr-info)
tablegen(LLVM WebAssemblyGenMCCodeEmitter.inc -gen-emitter)
tablegen(LLVM WebAssemblyGenRegisterInfo.inc -gen-register-info)
tablegen(LLVM WebAssemblyGenSubtargetInfo.inc -gen-subtarget)
add_public_tablegen_target(WebAssemblyCommonTableGen)
add_llvm_target(WebAssemblyCodeGen
+ Relooper.cpp
+ WebAssemblyArgumentMove.cpp
+ WebAssemblyAsmPrinter.cpp
+ WebAssemblyCFGStackify.cpp
+ WebAssemblyFastISel.cpp
WebAssemblyFrameLowering.cpp
- WebAssemblyInstrInfo.cpp
WebAssemblyISelDAGToDAG.cpp
WebAssemblyISelLowering.cpp
+ WebAssemblyInstrInfo.cpp
+ WebAssemblyLowerBrUnless.cpp
WebAssemblyMachineFunctionInfo.cpp
+ WebAssemblyMCInstLower.cpp
+ WebAssemblyOptimizeReturned.cpp
+ WebAssemblyPeephole.cpp
+ WebAssemblyPEI.cpp
WebAssemblyRegisterInfo.cpp
+ WebAssemblyRegColoring.cpp
+ WebAssemblyRegNumbering.cpp
+ WebAssemblyRegStackify.cpp
WebAssemblySelectionDAGInfo.cpp
+ WebAssemblyStoreResults.cpp
WebAssemblySubtarget.cpp
WebAssemblyTargetMachine.cpp
+ WebAssemblyTargetObjectFile.cpp
WebAssemblyTargetTransformInfo.cpp
)
diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
index fbb985aaafbb..7ce3a00ae360 100644
--- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
+++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
@@ -13,7 +13,9 @@
//===----------------------------------------------------------------------===//
#include "InstPrinter/WebAssemblyInstPrinter.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "WebAssembly.h"
+#include "WebAssemblyMachineFunctionInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -21,11 +23,13 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
-#include <cctype>
+#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;
#define DEBUG_TYPE "asm-printer"
+#include "WebAssemblyGenAsmWriter.inc"
+
WebAssemblyInstPrinter::WebAssemblyInstPrinter(const MCAsmInfo &MAI,
const MCInstrInfo &MII,
const MCRegisterInfo &MRI)
@@ -33,11 +37,93 @@ WebAssemblyInstPrinter::WebAssemblyInstPrinter(const MCAsmInfo &MAI,
void WebAssemblyInstPrinter::printRegName(raw_ostream &OS,
unsigned RegNo) const {
- llvm_unreachable("TODO: implement printRegName");
+ assert(RegNo != WebAssemblyFunctionInfo::UnusedReg);
+ // Note that there's an implicit get_local/set_local here!
+ OS << "$" << RegNo;
}
void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
StringRef Annot,
- const MCSubtargetInfo &STI) {
- llvm_unreachable("TODO: implement printInst");
+ const MCSubtargetInfo & /*STI*/) {
+ // Print the instruction (this uses the AsmStrings from the .td files).
+ printInstruction(MI, OS);
+
+ // Print any additional variadic operands.
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ if (Desc.isVariadic())
+ for (auto i = Desc.getNumOperands(), e = MI->getNumOperands(); i < e; ++i) {
+ if (i != 0)
+ OS << ", ";
+ printOperand(MI, i, OS);
+ }
+
+ // Print any added annotation.
+ printAnnotation(OS, Annot);
+}
+
+static std::string toString(const APFloat &FP) {
+ static const size_t BufBytes = 128;
+ char buf[BufBytes];
+ if (FP.isNaN())
+ assert((FP.bitwiseIsEqual(APFloat::getQNaN(FP.getSemantics())) ||
+ FP.bitwiseIsEqual(
+ APFloat::getQNaN(FP.getSemantics(), /*Negative=*/true))) &&
+ "convertToHexString handles neither SNaN nor NaN payloads");
+ // Use C99's hexadecimal floating-point representation.
+ auto Written = FP.convertToHexString(
+ buf, /*hexDigits=*/0, /*upperCase=*/false, APFloat::rmNearestTiesToEven);
+ (void)Written;
+ assert(Written != 0);
+ assert(Written < BufBytes);
+ return buf;
+}
+
+void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isReg()) {
+ unsigned WAReg = Op.getReg();
+ if (int(WAReg) >= 0)
+ printRegName(O, WAReg);
+ else if (OpNo >= MII.get(MI->getOpcode()).getNumDefs())
+ O << "$pop" << (WAReg & INT32_MAX);
+ else if (WAReg != WebAssemblyFunctionInfo::UnusedReg)
+ O << "$push" << (WAReg & INT32_MAX);
+ else
+ O << "$discard";
+ // Add a '=' suffix if this is a def.
+ if (OpNo < MII.get(MI->getOpcode()).getNumDefs())
+ O << '=';
+ } else if (Op.isImm()) {
+ switch (MI->getOpcode()) {
+ case WebAssembly::PARAM:
+ case WebAssembly::RESULT:
+ case WebAssembly::LOCAL:
+ O << WebAssembly::TypeToString(MVT::SimpleValueType(Op.getImm()));
+ break;
+ default:
+ O << Op.getImm();
+ break;
+ }
+ } else if (Op.isFPImm())
+ O << toString(APFloat(Op.getFPImm()));
+ else {
+ assert(Op.isExpr() && "unknown operand kind in printOperand");
+ Op.getExpr()->print(O, &MAI);
+ }
+}
+
+const char *llvm::WebAssembly::TypeToString(MVT Ty) {
+ switch (Ty.SimpleTy) {
+ case MVT::i32:
+ return "i32";
+ case MVT::i64:
+ return "i64";
+ case MVT::f32:
+ return "f32";
+ case MVT::f64:
+ return "f64";
+ default:
+ llvm_unreachable("unsupported type");
+ }
}
diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
index 70fcef214ce2..39a16f59fd78 100644
--- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
+++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
@@ -16,14 +16,13 @@
#define LLVM_LIB_TARGET_WEBASSEMBLY_INSTPRINTER_WEBASSEMBLYINSTPRINTER_H
#include "llvm/MC/MCInstPrinter.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/CodeGen/MachineValueType.h"
namespace llvm {
-class MCOperand;
class MCSubtargetInfo;
-class WebAssemblyInstPrinter : public MCInstPrinter {
+class WebAssemblyInstPrinter final : public MCInstPrinter {
public:
WebAssemblyInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
const MCRegisterInfo &MRI);
@@ -31,8 +30,21 @@ public:
void printRegName(raw_ostream &OS, unsigned RegNo) const override;
void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot,
const MCSubtargetInfo &STI) override;
+
+  // Used by tblgen code.
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+ // Autogenerated by tblgen.
+ void printInstruction(const MCInst *MI, raw_ostream &O);
+ static const char *getRegisterName(unsigned RegNo);
};
+namespace WebAssembly {
+
+const char *TypeToString(MVT Ty);
+
+} // end namespace WebAssembly
+
} // end namespace llvm
#endif
diff --git a/lib/Target/WebAssembly/LLVMBuild.txt b/lib/Target/WebAssembly/LLVMBuild.txt
index 04ef9c4e4bcf..9c4d6dcb35a3 100644
--- a/lib/Target/WebAssembly/LLVMBuild.txt
+++ b/lib/Target/WebAssembly/LLVMBuild.txt
@@ -28,5 +28,5 @@ has_asmprinter = 1
type = Library
name = WebAssemblyCodeGen
parent = WebAssembly
-required_libraries = Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target WebAssemblyDesc WebAssemblyInfo
+required_libraries = Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target WebAssemblyAsmPrinter WebAssemblyDesc WebAssemblyInfo
add_to_library_groups = WebAssembly
diff --git a/lib/Target/WebAssembly/MCTargetDesc/CMakeLists.txt b/lib/Target/WebAssembly/MCTargetDesc/CMakeLists.txt
index ccc0f0d7ccbc..c8d1d821861a 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/WebAssembly/MCTargetDesc/CMakeLists.txt
@@ -1,4 +1,7 @@
add_llvm_library(LLVMWebAssemblyDesc
+ WebAssemblyAsmBackend.cpp
+ WebAssemblyELFObjectWriter.cpp
WebAssemblyMCAsmInfo.cpp
+ WebAssemblyMCCodeEmitter.cpp
WebAssemblyMCTargetDesc.cpp
)
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
new file mode 100644
index 000000000000..b158ccb46f99
--- /dev/null
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
@@ -0,0 +1,103 @@
+//===-- WebAssemblyAsmBackend.cpp - WebAssembly Assembler Backend ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements the WebAssemblyAsmBackend class.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+class WebAssemblyAsmBackend final : public MCAsmBackend {
+ bool Is64Bit;
+
+public:
+ explicit WebAssemblyAsmBackend(bool Is64Bit)
+ : MCAsmBackend(), Is64Bit(Is64Bit) {}
+ ~WebAssemblyAsmBackend() override {}
+
+ void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ uint64_t Value, bool IsPCRel) const override;
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
+
+ // No instruction requires relaxation
+ bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const override {
+ return false;
+ }
+
+ unsigned getNumFixupKinds() const override {
+ // We currently just use the generic fixups in MCFixup.h and don't have any
+ // target-specific fixups.
+ return 0;
+ }
+
+ bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+
+ void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {}
+
+ bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+};
+
+bool WebAssemblyAsmBackend::writeNopData(uint64_t Count,
+ MCObjectWriter *OW) const {
+ if (Count == 0)
+ return true;
+
+ // FIXME: Do something.
+ return false;
+}
+
+void WebAssemblyAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+ unsigned DataSize, uint64_t Value,
+ bool IsPCRel) const {
+ const MCFixupKindInfo &Info = getFixupKindInfo(Fixup.getKind());
+ unsigned NumBytes = RoundUpToAlignment(Info.TargetSize, 8);
+ if (!Value)
+ return; // Doesn't change encoding.
+
+ // Shift the value into position.
+ Value <<= Info.TargetOffset;
+
+ unsigned Offset = Fixup.getOffset();
+ assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+
+ // For each byte of the fragment that the fixup touches, mask in the
+ // bits from the fixup value.
+ for (unsigned i = 0; i != NumBytes; ++i)
+ Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+}
+
+MCObjectWriter *
+WebAssemblyAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
+ return createWebAssemblyELFObjectWriter(OS, Is64Bit, 0);
+}
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createWebAssemblyAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ const Triple &TT,
+ StringRef CPU) {
+ return new WebAssemblyAsmBackend(TT.isArch64Bit());
+}
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp
new file mode 100644
index 000000000000..c47a3d9094e5
--- /dev/null
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp
@@ -0,0 +1,54 @@
+//===-- WebAssemblyELFObjectWriter.cpp - WebAssembly ELF Writer -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file handles ELF-specific object emission, converting LLVM's
+/// internal fixups into the appropriate relocations.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+namespace {
+class WebAssemblyELFObjectWriter final : public MCELFObjectTargetWriter {
+public:
+ WebAssemblyELFObjectWriter(bool Is64Bit, uint8_t OSABI);
+
+protected:
+ unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+ bool IsPCRel) const override;
+};
+} // end anonymous namespace
+
+// FIXME: Use EM_NONE as a temporary hack. Should we decide to pursue ELF
+// writing seriously, we should email generic-abi@googlegroups.com and ask
+// for our own ELF code.
+WebAssemblyELFObjectWriter::WebAssemblyELFObjectWriter(bool Is64Bit,
+ uint8_t OSABI)
+ : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_NONE,
+ /*HasRelocationAddend=*/true) {}
+
+unsigned WebAssemblyELFObjectWriter::GetRelocType(const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const {
+ // FIXME: Do we need our own relocs?
+ return Fixup.getKind();
+}
+
+MCObjectWriter *llvm::createWebAssemblyELFObjectWriter(raw_pwrite_stream &OS,
+ bool Is64Bit,
+ uint8_t OSABI) {
+ MCELFObjectTargetWriter *MOTW =
+ new WebAssemblyELFObjectWriter(Is64Bit, OSABI);
+ return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/true);
+}
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
index 55346f71c6fc..d2617796ca99 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
@@ -23,7 +23,7 @@ using namespace llvm;
WebAssemblyMCAsmInfo::~WebAssemblyMCAsmInfo() {}
WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) {
- PointerSize = CalleeSaveStackSlotSize = T.isArch64Bit();
+ PointerSize = CalleeSaveStackSlotSize = T.isArch64Bit() ? 8 : 4;
// TODO: What should MaxInstLength be?
@@ -41,9 +41,6 @@ WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) {
COMMDirectiveAlignmentIsInBytes = false;
LCOMMDirectiveAlignmentType = LCOMM::Log2Alignment;
- HasDotTypeDotSizeDirective = false;
- HasSingleParameterDotFile = false;
-
SupportsDebugInformation = true;
// For now, WebAssembly does not support exceptions.
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h
index d2b8fb7748fc..2dcf2cd3c892 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h
@@ -15,13 +15,13 @@
#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCASMINFO_H
#define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCASMINFO_H
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
namespace llvm {
class Triple;
-class WebAssemblyMCAsmInfo final : public MCAsmInfo {
+class WebAssemblyMCAsmInfo final : public MCAsmInfoELF {
public:
explicit WebAssemblyMCAsmInfo(const Triple &T);
~WebAssemblyMCAsmInfo() override;
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
new file mode 100644
index 000000000000..7c6c79eb5db2
--- /dev/null
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
@@ -0,0 +1,100 @@
+//=- WebAssemblyMCCodeEmitter.cpp - Convert WebAssembly code to machine code -//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements the WebAssemblyMCCodeEmitter class.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+namespace {
+class WebAssemblyMCCodeEmitter final : public MCCodeEmitter {
+ const MCRegisterInfo &MRI;
+
+public:
+ WebAssemblyMCCodeEmitter(const MCInstrInfo &, const MCRegisterInfo &mri,
+ MCContext &)
+ : MRI(mri) {}
+
+ ~WebAssemblyMCCodeEmitter() override {}
+
+ /// TableGen'erated function for getting the binary encoding for an
+ /// instruction.
+ uint64_t getBinaryCodeForInstr(const MCInst &MI,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// Return binary encoding of operand. If the machine operand requires
+ /// relocation, record the relocation and return zero.
+ unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ uint64_t getMemoryOpValue(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+};
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx) {
+ return new WebAssemblyMCCodeEmitter(MCII, MRI, Ctx);
+}
+
+unsigned WebAssemblyMCCodeEmitter::getMachineOpValue(
+ const MCInst &MI, const MCOperand &MO, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ if (MO.isReg())
+ return MRI.getEncodingValue(MO.getReg());
+ if (MO.isImm())
+ return static_cast<unsigned>(MO.getImm());
+
+ assert(MO.isExpr());
+
+ assert(MO.getExpr()->getKind() == MCExpr::SymbolRef);
+
+ assert(false && "FIXME: not implemented yet");
+
+ return 0;
+}
+
+void WebAssemblyMCCodeEmitter::encodeInstruction(
+ const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ assert(false && "FIXME: not implemented yet");
+}
+
+// Encode WebAssembly Memory Operand
+uint64_t
+WebAssemblyMCCodeEmitter::getMemoryOpValue(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ assert(false && "FIXME: not implemented yet");
+ return 0;
+}
+
+#include "WebAssemblyGenMCCodeEmitter.inc"
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
index 224aa773a80e..14cd295353d5 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
@@ -26,25 +26,40 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-mc-target-desc"
+#define GET_INSTRINFO_MC_DESC
+#include "WebAssemblyGenInstrInfo.inc"
+
#define GET_SUBTARGETINFO_MC_DESC
#include "WebAssemblyGenSubtargetInfo.inc"
#define GET_REGINFO_MC_DESC
#include "WebAssemblyGenRegisterInfo.inc"
-static MCAsmInfo *createWebAssemblyMCAsmInfo(const MCRegisterInfo &MRI,
+static MCAsmInfo *createWebAssemblyMCAsmInfo(const MCRegisterInfo & /*MRI*/,
const Triple &TT) {
- MCAsmInfo *MAI = new WebAssemblyMCAsmInfo(TT);
- return MAI;
+ return new WebAssemblyMCAsmInfo(TT);
+}
+
+static MCInstrInfo *createWebAssemblyMCInstrInfo() {
+ MCInstrInfo *X = new MCInstrInfo();
+ InitWebAssemblyMCInstrInfo(X);
+ return X;
+}
+
+static MCStreamer *createWebAssemblyMCStreamer(const Triple &T, MCContext &Ctx,
+ MCAsmBackend &MAB,
+ raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter,
+ bool RelaxAll) {
+ return createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll);
}
static MCInstPrinter *
-createWebAssemblyMCInstPrinter(const Triple &T, unsigned SyntaxVariant,
+createWebAssemblyMCInstPrinter(const Triple & /*T*/, unsigned SyntaxVariant,
const MCAsmInfo &MAI, const MCInstrInfo &MII,
const MCRegisterInfo &MRI) {
- if (SyntaxVariant == 0 || SyntaxVariant == 1)
- return new WebAssemblyInstPrinter(MAI, MII, MRI);
- return nullptr;
+ assert(SyntaxVariant == 0);
+ return new WebAssemblyInstPrinter(MAI, MII, MRI);
}
// Force static initialization.
@@ -53,7 +68,19 @@ extern "C" void LLVMInitializeWebAssemblyTargetMC() {
// Register the MC asm info.
RegisterMCAsmInfoFn X(*T, createWebAssemblyMCAsmInfo);
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(*T, createWebAssemblyMCInstrInfo);
+
+ // Register the object streamer
+ TargetRegistry::RegisterELFStreamer(*T, createWebAssemblyMCStreamer);
+
// Register the MCInstPrinter.
TargetRegistry::RegisterMCInstPrinter(*T, createWebAssemblyMCInstPrinter);
+
+ // Register the MC code emitter
+ TargetRegistry::RegisterMCCodeEmitter(*T, createWebAssemblyMCCodeEmitter);
+
+ // Register the ASM Backend
+ TargetRegistry::RegisterMCAsmBackend(*T, createWebAssemblyAsmBackend);
}
}
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index eebf5b72f62b..e78f73e3da95 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -16,7 +16,6 @@
#define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCTARGETDESC_H
#include "llvm/Support/DataTypes.h"
-#include <string>
namespace llvm {
@@ -34,13 +33,21 @@ class StringRef;
class Target;
class Triple;
class raw_ostream;
+class raw_pwrite_stream;
extern Target TheWebAssemblyTarget32;
extern Target TheWebAssemblyTarget64;
+MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx);
+
MCAsmBackend *createWebAssemblyAsmBackend(const Target &T,
const MCRegisterInfo &MRI,
- StringRef TT, StringRef CPU);
+ const Triple &TT, StringRef CPU);
+
+MCObjectWriter *createWebAssemblyELFObjectWriter(raw_pwrite_stream &OS,
+ bool Is64Bit, uint8_t OSABI);
} // end namespace llvm
@@ -50,6 +57,11 @@ MCAsmBackend *createWebAssemblyAsmBackend(const Target &T,
#define GET_REGINFO_ENUM
#include "WebAssemblyGenRegisterInfo.inc"
+// Defines symbolic names for the WebAssembly instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "WebAssemblyGenInstrInfo.inc"
+
#define GET_SUBTARGETINFO_ENUM
#include "WebAssemblyGenSubtargetInfo.inc"
diff --git a/lib/Target/WebAssembly/Makefile b/lib/Target/WebAssembly/Makefile
index f102d73f6e86..ccf63f0be554 100644
--- a/lib/Target/WebAssembly/Makefile
+++ b/lib/Target/WebAssembly/Makefile
@@ -12,8 +12,14 @@ LIBRARYNAME = LLVMWebAssemblyCodeGen
TARGET = WebAssembly
# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = WebAssemblyGenRegisterInfo.inc WebAssemblyGenSubtargetInfo.inc \
- WebAssemblyGenMCCodeEmitter.inc
+BUILT_SOURCES = \
+ WebAssemblyGenAsmWriter.inc \
+ WebAssemblyGenDAGISel.inc \
+ WebAssemblyGenFastISel.inc \
+ WebAssemblyGenInstrInfo.inc \
+ WebAssemblyGenMCCodeEmitter.inc \
+ WebAssemblyGenRegisterInfo.inc \
+ WebAssemblyGenSubtargetInfo.inc
DIRS = InstPrinter TargetInfo MCTargetDesc
diff --git a/lib/Target/WebAssembly/README.txt b/lib/Target/WebAssembly/README.txt
index 63e02c455895..b97ea454165c 100644
--- a/lib/Target/WebAssembly/README.txt
+++ b/lib/Target/WebAssembly/README.txt
@@ -12,6 +12,16 @@ binary encoding of WebAssembly itself:
* https://github.com/WebAssembly/design/blob/master/AstSemantics.md
* https://github.com/WebAssembly/design/blob/master/BinaryEncoding.md
+The backend is built, tested and archived on the following waterfall:
+ https://build.chromium.org/p/client.wasm.llvm/console
+
+The backend's bringup is done using the GCC torture test suite first, since it
+doesn't require C library support. Current known failures are listed in
+known_gcc_test_failures.txt; all other tests should pass, and the waterfall
+will turn red if they don't. Once most of these pass, further testing will use
+LLVM's own test suite. The tests can be run locally using:
+ github.com/WebAssembly/experimental/blob/master/buildbot/torture_test.py
+
Interesting work that remains to be done:
* Write a pass to restructurize irreducible control flow. This needs to be done
before register allocation to be efficient, because it may duplicate basic
@@ -19,8 +29,60 @@ Interesting work that remains to be done:
level. Note that LLVM's GPU code has such a pass, but it linearizes control
flow (e.g. both sides of branches execute and are masked) which is undesirable
for WebAssembly.
-* Basic relooper to expose control flow as an AST.
-* Figure out how to properly use MC for virtual ISAs. This may require some
- refactoring of MC.
+
+//===---------------------------------------------------------------------===//
+
+set_local instructions have a return value. We should (a) model this,
+and (b) write optimizations which take advantage of it. Keep in mind that
+many set_local instructions are implicit!
+
+//===---------------------------------------------------------------------===//
+
+Br, br_if, and tableswitch instructions can support having a value on the
+expression stack across the jump (sometimes). We should (a) model this, and
+(b) extend the stackifier to utilize it.
+
+//===---------------------------------------------------------------------===//
+
+The min/max operators aren't exactly a<b?a:b because of NaN and negative zero
+behavior. The ARM target has the same kind of min/max instructions and has
+implemented optimizations for them; we should do similar optimizations for
+WebAssembly.
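+
+As a rough sketch of the issue (viaSelect is a hypothetical helper, not part
+of the backend), the select form differs from the expected min on the special
+cases:
+
+  double viaSelect(double a, double b) { return a < b ? a : b; }
+  // viaSelect(-0.0, +0.0) yields +0.0, whereas min is expected to return -0.0
+  // viaSelect(NaN, 1.0) yields 1.0, whereas the wasm min is expected to
+  // propagate the NaN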
+
+//===---------------------------------------------------------------------===//
+
+AArch64 runs SeparateConstOffsetFromGEPPass, followed by EarlyCSE and LICM.
+Would these be useful to run for WebAssembly too? Also, it has an option to
+run SimplifyCFG after running the AtomicExpand pass. Would this be useful for
+us too?
+
+//===---------------------------------------------------------------------===//
+
+When is it profitable to set isAsCheapAsAMove on instructions in WebAssembly?
+
+//===---------------------------------------------------------------------===//
+
+Register stackification uses the EXPR_STACK physical register to impose
+ordering dependencies on instructions with stack operands. This is pessimistic;
+we should consider alternate ways to model stack dependencies.
+
+//===---------------------------------------------------------------------===//
+
+Lots of things could be done in WebAssemblyTargetTransformInfo.cpp. Similarly,
+there are numerous optimization-related hooks that can be overridden in
+WebAssemblyTargetLowering.
+
+//===---------------------------------------------------------------------===//
+
+Instead of the OptimizeReturned pass, we should consider preserving the
+"returned" attribute through to MachineInstrs and extending the StoreResults
+pass to do this optimization on calls too. That would also let the
+WebAssemblyPeephole pass clean up dead defs for such calls, as it does for
+stores.
+
+//===---------------------------------------------------------------------===//
+
+Memset/memcpy/memmove should be marked with the "returned" attribute somehow,
+even when they are translated through intrinsics.
//===---------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/Relooper.cpp b/lib/Target/WebAssembly/Relooper.cpp
new file mode 100644
index 000000000000..9b718ef094aa
--- /dev/null
+++ b/lib/Target/WebAssembly/Relooper.cpp
@@ -0,0 +1,984 @@
+//===-- Relooper.cpp - Top-level interface for WebAssembly ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This implements the Relooper algorithm. This implementation includes
+/// optimizations added since the original academic paper [1] was published.
+///
+/// [1] Alon Zakai. 2011. Emscripten: an LLVM-to-JavaScript compiler. In
+/// Proceedings of the ACM international conference companion on Object
+/// oriented programming systems languages and applications companion
+/// (SPLASH '11). ACM, New York, NY, USA, 301-312. DOI=10.1145/2048147.2048224
+/// http://doi.acm.org/10.1145/2048147.2048224
+///
+//===-------------------------------------------------------------------===//
+
+#include "Relooper.h"
+#include "WebAssembly.h"
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <cstring>
+#include <cstdlib>
+#include <functional>
+#include <list>
+#include <stack>
+#include <string>
+
+#define DEBUG_TYPE "relooper"
+
+using namespace llvm;
+using namespace Relooper;
+
+static cl::opt<int> RelooperSplittingFactor(
+ "relooper-splitting-factor",
+ cl::desc(
+ "How much to discount code size when deciding whether to split a node"),
+ cl::init(5));
+
+static cl::opt<unsigned> RelooperMultipleSwitchThreshold(
+ "relooper-multiple-switch-threshold",
+ cl::desc(
+ "How many entries to allow in a multiple before we use a switch"),
+ cl::init(10));
+
+static cl::opt<unsigned> RelooperNestingLimit(
+ "relooper-nesting-limit",
+ cl::desc(
+ "How much nesting is acceptable"),
+ cl::init(20));
+
+
+namespace {
+///
+/// Implements the relooper algorithm for a function's blocks.
+///
+/// Implementation details: The Relooper instance has
+/// ownership of the blocks and shapes, and frees them when done.
+///
+struct RelooperAlgorithm {
+ std::deque<Block *> Blocks;
+ std::deque<Shape *> Shapes;
+ Shape *Root;
+ bool MinSize;
+ int BlockIdCounter;
+ int ShapeIdCounter;
+
+ RelooperAlgorithm();
+ ~RelooperAlgorithm();
+
+ void AddBlock(Block *New, int Id = -1);
+
+ // Calculates the shapes
+ void Calculate(Block *Entry);
+
+ // Sets us to try to minimize size
+ void SetMinSize(bool MinSize_) { MinSize = MinSize_; }
+};
+
+struct RelooperAnalysis final : public FunctionPass {
+ static char ID;
+ RelooperAnalysis() : FunctionPass(ID) {}
+ const char *getPassName() const override { return "relooper"; }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+ bool runOnFunction(Function &F) override;
+};
+}
+
+// RelooperAnalysis
+
+char RelooperAnalysis::ID = 0;
+FunctionPass *llvm::createWebAssemblyRelooper() {
+ return new RelooperAnalysis();
+}
+
+bool RelooperAnalysis::runOnFunction(Function &F) {
+ DEBUG(dbgs() << "Relooping function '" << F.getName() << "'\n");
+ RelooperAlgorithm R;
+ // FIXME: remove duplication between relooper's and LLVM's BBs.
+ std::map<const BasicBlock *, Block *> BB2B;
+ std::map<const Block *, const BasicBlock *> B2BB;
+ for (const BasicBlock &BB : F) {
+ // FIXME: getName is wrong here, Code is meant to represent amount of code.
+ // FIXME: use BranchVarInit for switch.
+ Block *B = new Block(BB.getName().str().data(), /*BranchVarInit=*/nullptr);
+ R.AddBlock(B);
+ assert(BB2B.find(&BB) == BB2B.end() && "Inserting the same block twice");
+ assert(B2BB.find(B) == B2BB.end() && "Inserting the same block twice");
+ BB2B[&BB] = B;
+ B2BB[B] = &BB;
+ }
+ for (Block *B : R.Blocks) {
+ const BasicBlock *BB = B2BB[B];
+ for (const BasicBlock *Successor : successors(BB))
+ // FIXME: add branch's Condition and Code below.
+ B->AddBranchTo(BB2B[Successor], /*Condition=*/nullptr, /*Code=*/nullptr);
+ }
+ R.Calculate(BB2B[&F.getEntryBlock()]);
+ return false; // Analysis passes don't modify anything.
+}
+
+// Helpers
+
+typedef MapVector<Block *, BlockSet> BlockBlockSetMap;
+typedef std::list<Block *> BlockList;
+
+template <class T, class U>
+static bool contains(const T &container, const U &contained) {
+ return container.count(contained);
+}
+
+
+// Branch
+
+Branch::Branch(const char *ConditionInit, const char *CodeInit)
+ : Ancestor(nullptr), Labeled(true) {
+ // FIXME: move from char* to LLVM data structures
+ Condition = ConditionInit ? strdup(ConditionInit) : nullptr;
+ Code = CodeInit ? strdup(CodeInit) : nullptr;
+}
+
+Branch::~Branch() {
+ // FIXME: move from char* to LLVM data structures
+ free(static_cast<void *>(const_cast<char *>(Condition)));
+ free(static_cast<void *>(const_cast<char *>(Code)));
+}
+
+// Block
+
+Block::Block(const char *CodeInit, const char *BranchVarInit)
+ : Parent(nullptr), Id(-1), IsCheckedMultipleEntry(false) {
+ // FIXME: move from char* to LLVM data structures
+ Code = strdup(CodeInit);
+ BranchVar = BranchVarInit ? strdup(BranchVarInit) : nullptr;
+}
+
+Block::~Block() {
+ // FIXME: move from char* to LLVM data structures
+ free(static_cast<void *>(const_cast<char *>(Code)));
+ free(static_cast<void *>(const_cast<char *>(BranchVar)));
+}
+
+void Block::AddBranchTo(Block *Target, const char *Condition,
+ const char *Code) {
+ assert(!contains(BranchesOut, Target) &&
+ "cannot add more than one branch to the same target");
+ BranchesOut[Target] = make_unique<Branch>(Condition, Code);
+}
+
+// Relooper
+
+RelooperAlgorithm::RelooperAlgorithm()
+ : Root(nullptr), MinSize(false), BlockIdCounter(1),
+ ShapeIdCounter(0) { // block ID 0 is reserved for clearings
+}
+
+RelooperAlgorithm::~RelooperAlgorithm() {
+ for (auto Curr : Blocks)
+ delete Curr;
+ for (auto Curr : Shapes)
+ delete Curr;
+}
+
+void RelooperAlgorithm::AddBlock(Block *New, int Id) {
+ New->Id = Id == -1 ? BlockIdCounter++ : Id;
+ Blocks.push_back(New);
+}
+
+struct RelooperRecursor {
+ RelooperAlgorithm *Parent;
+ RelooperRecursor(RelooperAlgorithm *ParentInit) : Parent(ParentInit) {}
+};
+
+void RelooperAlgorithm::Calculate(Block *Entry) {
+ // Scan and optimize the input
+ struct PreOptimizer : public RelooperRecursor {
+ PreOptimizer(RelooperAlgorithm *Parent) : RelooperRecursor(Parent) {}
+ BlockSet Live;
+
+ void FindLive(Block *Root) {
+ BlockList ToInvestigate;
+ ToInvestigate.push_back(Root);
+ while (!ToInvestigate.empty()) {
+ Block *Curr = ToInvestigate.front();
+ ToInvestigate.pop_front();
+ if (contains(Live, Curr))
+ continue;
+ Live.insert(Curr);
+ for (const auto &iter : Curr->BranchesOut)
+ ToInvestigate.push_back(iter.first);
+ }
+ }
+
+ // If a block has multiple entries but no exits, and it is small enough, it
+ // is useful to split it. A common example is a C++ function where
+ // everything ends up at a final exit block and does some RAII cleanup.
+ // Without splitting, we will be forced to introduce labelled loops to
+    // allow reaching the final block.
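+    // For example (purely illustrative): if blocks A and B both branch to a
+    // small cleanup block X that has no exits, X is duplicated so that A and
+    // B each branch to their own copy, and no labelled loop is needed.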
+ void SplitDeadEnds() {
+ unsigned TotalCodeSize = 0;
+ for (const auto &Curr : Live) {
+ TotalCodeSize += strlen(Curr->Code);
+ }
+ BlockSet Splits;
+ BlockSet Removed;
+ for (const auto &Original : Live) {
+ if (Original->BranchesIn.size() <= 1 ||
+ !Original->BranchesOut.empty())
+ continue; // only dead ends, for now
+ if (contains(Original->BranchesOut, Original))
+ continue; // cannot split a looping node
+ if (strlen(Original->Code) * (Original->BranchesIn.size() - 1) >
+ TotalCodeSize / RelooperSplittingFactor)
+ continue; // if splitting increases raw code size by a significant
+ // amount, abort
+ // Split the node (for simplicity, we replace all the blocks, even
+ // though we could have reused the original)
+ DEBUG(dbgs() << " Splitting '" << Original->Code << "'\n");
+ for (const auto &Prior : Original->BranchesIn) {
+ Block *Split = new Block(Original->Code, Original->BranchVar);
+ Parent->AddBlock(Split, Original->Id);
+ Split->BranchesIn.insert(Prior);
+ std::unique_ptr<Branch> Details;
+ Details.swap(Prior->BranchesOut[Original]);
+ Prior->BranchesOut[Split] = make_unique<Branch>(Details->Condition,
+ Details->Code);
+ for (const auto &iter : Original->BranchesOut) {
+ Block *Post = iter.first;
+ Branch *Details = iter.second.get();
+ Split->BranchesOut[Post] = make_unique<Branch>(Details->Condition,
+ Details->Code);
+ Post->BranchesIn.insert(Split);
+ }
+ Splits.insert(Split);
+ Removed.insert(Original);
+ }
+ for (const auto &iter : Original->BranchesOut) {
+ Block *Post = iter.first;
+ Post->BranchesIn.remove(Original);
+ }
+ }
+ for (const auto &iter : Splits)
+ Live.insert(iter);
+ for (const auto &iter : Removed)
+ Live.remove(iter);
+ }
+ };
+ PreOptimizer Pre(this);
+ Pre.FindLive(Entry);
+
+ // Add incoming branches from live blocks, ignoring dead code
+ for (unsigned i = 0; i < Blocks.size(); i++) {
+ Block *Curr = Blocks[i];
+ if (!contains(Pre.Live, Curr))
+ continue;
+ for (const auto &iter : Curr->BranchesOut)
+ iter.first->BranchesIn.insert(Curr);
+ }
+
+ if (!MinSize)
+ Pre.SplitDeadEnds();
+
+ // Recursively process the graph
+
+ struct Analyzer : public RelooperRecursor {
+ Analyzer(RelooperAlgorithm *Parent) : RelooperRecursor(Parent) {}
+
+ // Add a shape to the list of shapes in this Relooper calculation
+ void Notice(Shape *New) {
+ New->Id = Parent->ShapeIdCounter++;
+ Parent->Shapes.push_back(New);
+ }
+
+ // Create a list of entries from a block. If LimitTo is provided, only
+ // results in that set will appear
+ void GetBlocksOut(Block *Source, BlockSet &Entries,
+ BlockSet *LimitTo = nullptr) {
+ for (const auto &iter : Source->BranchesOut)
+ if (!LimitTo || contains(*LimitTo, iter.first))
+ Entries.insert(iter.first);
+ }
+
+ // Converts/processes all branchings to a specific target
+ void Solipsize(Block *Target, Branch::FlowType Type, Shape *Ancestor,
+ BlockSet &From) {
+ DEBUG(dbgs() << " Solipsize '" << Target->Code << "' type " << Type
+ << "\n");
+ for (auto iter = Target->BranchesIn.begin();
+ iter != Target->BranchesIn.end();) {
+ Block *Prior = *iter;
+ if (!contains(From, Prior)) {
+ iter++;
+ continue;
+ }
+ std::unique_ptr<Branch> PriorOut;
+ PriorOut.swap(Prior->BranchesOut[Target]);
+ PriorOut->Ancestor = Ancestor;
+ PriorOut->Type = Type;
+ if (MultipleShape *Multiple = dyn_cast<MultipleShape>(Ancestor))
+ Multiple->Breaks++; // We are breaking out of this Multiple, so need a
+ // loop
+ iter++; // carefully increment iter before erasing
+ Target->BranchesIn.remove(Prior);
+ Target->ProcessedBranchesIn.insert(Prior);
+ Prior->ProcessedBranchesOut[Target].swap(PriorOut);
+ }
+ }
+
+ Shape *MakeSimple(BlockSet &Blocks, Block *Inner, BlockSet &NextEntries) {
+ DEBUG(dbgs() << " MakeSimple inner block '" << Inner->Code << "'\n");
+ SimpleShape *Simple = new SimpleShape;
+ Notice(Simple);
+ Simple->Inner = Inner;
+ Inner->Parent = Simple;
+ if (Blocks.size() > 1) {
+ Blocks.remove(Inner);
+ GetBlocksOut(Inner, NextEntries, &Blocks);
+ BlockSet JustInner;
+ JustInner.insert(Inner);
+ for (const auto &iter : NextEntries)
+ Solipsize(iter, Branch::Direct, Simple, JustInner);
+ }
+ return Simple;
+ }
+
+ Shape *MakeLoop(BlockSet &Blocks, BlockSet &Entries,
+ BlockSet &NextEntries) {
+      // Find the inner blocks in this loop. Proceed backwards from the
+      // entries until you reach a seen block, collecting as you go.
+ BlockSet InnerBlocks;
+ BlockSet Queue = Entries;
+ while (!Queue.empty()) {
+ Block *Curr = *(Queue.begin());
+ Queue.remove(*Queue.begin());
+ if (!contains(InnerBlocks, Curr)) {
+ // This element is new, mark it as inner and remove from outer
+ InnerBlocks.insert(Curr);
+ Blocks.remove(Curr);
+ // Add the elements prior to it
+ for (const auto &iter : Curr->BranchesIn)
+ Queue.insert(iter);
+ }
+ }
+ assert(!InnerBlocks.empty());
+
+ for (const auto &Curr : InnerBlocks) {
+ for (const auto &iter : Curr->BranchesOut) {
+ Block *Possible = iter.first;
+ if (!contains(InnerBlocks, Possible))
+ NextEntries.insert(Possible);
+ }
+ }
+
+ LoopShape *Loop = new LoopShape();
+ Notice(Loop);
+
+ // Solipsize the loop, replacing with break/continue and marking branches
+ // as Processed (will not affect later calculations)
+ // A. Branches to the loop entries become a continue to this shape
+ for (const auto &iter : Entries)
+ Solipsize(iter, Branch::Continue, Loop, InnerBlocks);
+ // B. Branches to outside the loop (a next entry) become breaks on this
+ // shape
+ for (const auto &iter : NextEntries)
+ Solipsize(iter, Branch::Break, Loop, InnerBlocks);
+ // Finish up
+ Shape *Inner = Process(InnerBlocks, Entries, nullptr);
+ Loop->Inner = Inner;
+ return Loop;
+ }
+
+ // For each entry, find the independent group reachable by it. The
+ // independent group is the entry itself, plus all the blocks it can
+ // reach that cannot be directly reached by another entry. Note that we
+ // ignore directly reaching the entry itself by another entry.
+ // @param Ignore - previous blocks that are irrelevant
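+    // For example (hypothetical graph): with entries {A, D} and edges A->B,
+    // B->C, and D->C, the independent groups are {A, B} and {D}; C is
+    // reachable from both entries, so it belongs to neither group.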
+ void FindIndependentGroups(BlockSet &Entries,
+ BlockBlockSetMap &IndependentGroups,
+ BlockSet *Ignore = nullptr) {
+ typedef std::map<Block *, Block *> BlockBlockMap;
+
+ struct HelperClass {
+ BlockBlockSetMap &IndependentGroups;
+ BlockBlockMap Ownership; // For each block, which entry it belongs to.
+ // We have reached it from there.
+
+ HelperClass(BlockBlockSetMap &IndependentGroupsInit)
+ : IndependentGroups(IndependentGroupsInit) {}
+ void InvalidateWithChildren(Block *New) {
+ // Being in the list means you need to be invalidated
+ BlockList ToInvalidate;
+ ToInvalidate.push_back(New);
+ while (!ToInvalidate.empty()) {
+ Block *Invalidatee = ToInvalidate.front();
+ ToInvalidate.pop_front();
+ Block *Owner = Ownership[Invalidatee];
+ // Owner may have been invalidated, do not add to
+ // IndependentGroups!
+ if (contains(IndependentGroups, Owner))
+ IndependentGroups[Owner].remove(Invalidatee);
+ if (Ownership[Invalidatee]) { // may have been seen before and
+ // invalidated already
+ Ownership[Invalidatee] = nullptr;
+ for (const auto &iter : Invalidatee->BranchesOut) {
+ Block *Target = iter.first;
+ BlockBlockMap::iterator Known = Ownership.find(Target);
+ if (Known != Ownership.end()) {
+ Block *TargetOwner = Known->second;
+ if (TargetOwner)
+ ToInvalidate.push_back(Target);
+ }
+ }
+ }
+ }
+ }
+ };
+ HelperClass Helper(IndependentGroups);
+
+ // We flow out from each of the entries, simultaneously.
+ // When we reach a new block, we add it as belonging to the one we got to
+ // it from.
+ // If we reach a new block that is already marked as belonging to someone,
+ // it is reachable by two entries and is not valid for any of them.
+ // Remove it and all it can reach that have been visited.
+
+ // Being in the queue means we just added this item, and
+ // we need to add its children
+ BlockList Queue;
+ for (const auto &Entry : Entries) {
+ Helper.Ownership[Entry] = Entry;
+ IndependentGroups[Entry].insert(Entry);
+ Queue.push_back(Entry);
+ }
+ while (!Queue.empty()) {
+ Block *Curr = Queue.front();
+ Queue.pop_front();
+ Block *Owner = Helper.Ownership[Curr]; // Curr must be in the ownership
+ // map if we are in the queue
+ if (!Owner)
+ continue; // we have been invalidated meanwhile after being reached
+ // from two entries
+ // Add all children
+ for (const auto &iter : Curr->BranchesOut) {
+ Block *New = iter.first;
+ BlockBlockMap::iterator Known = Helper.Ownership.find(New);
+ if (Known == Helper.Ownership.end()) {
+ // New node. Add it, and put it in the queue
+ Helper.Ownership[New] = Owner;
+ IndependentGroups[Owner].insert(New);
+ Queue.push_back(New);
+ continue;
+ }
+ Block *NewOwner = Known->second;
+ if (!NewOwner)
+ continue; // We reached an invalidated node
+ if (NewOwner != Owner)
+ // Invalidate this and all reachable that we have seen - we reached
+ // this from two locations
+ Helper.InvalidateWithChildren(New);
+ // otherwise, we have the same owner, so do nothing
+ }
+ }
+
+      // Having processed all the interesting blocks, we remain with just one
+      // potential issue: if a->b, and a was invalidated, but b was later
+      // reached by someone else, we must invalidate b. To check for this, we
+      // go over all elements in the independent groups; if an element has a
+      // parent which does *not* have the same owner, we must remove it and
+      // all its children.
+
+ for (const auto &iter : Entries) {
+ BlockSet &CurrGroup = IndependentGroups[iter];
+ BlockList ToInvalidate;
+ for (const auto &iter : CurrGroup) {
+ Block *Child = iter;
+ for (const auto &iter : Child->BranchesIn) {
+ Block *Parent = iter;
+ if (Ignore && contains(*Ignore, Parent))
+ continue;
+ if (Helper.Ownership[Parent] != Helper.Ownership[Child])
+ ToInvalidate.push_back(Child);
+ }
+ }
+ while (!ToInvalidate.empty()) {
+ Block *Invalidatee = ToInvalidate.front();
+ ToInvalidate.pop_front();
+ Helper.InvalidateWithChildren(Invalidatee);
+ }
+ }
+
+ // Remove empty groups
+ for (const auto &iter : Entries)
+ if (IndependentGroups[iter].empty())
+ IndependentGroups.erase(iter);
+ }
+
+ Shape *MakeMultiple(BlockSet &Blocks, BlockSet &Entries,
+ BlockBlockSetMap &IndependentGroups, Shape *Prev,
+ BlockSet &NextEntries) {
+ bool Fused = isa<SimpleShape>(Prev);
+ MultipleShape *Multiple = new MultipleShape();
+ Notice(Multiple);
+ BlockSet CurrEntries;
+ for (auto &iter : IndependentGroups) {
+ Block *CurrEntry = iter.first;
+ BlockSet &CurrBlocks = iter.second;
+ // Create inner block
+ CurrEntries.clear();
+ CurrEntries.insert(CurrEntry);
+ for (const auto &CurrInner : CurrBlocks) {
+ // Remove the block from the remaining blocks
+ Blocks.remove(CurrInner);
+ // Find new next entries and fix branches to them
+ for (auto iter = CurrInner->BranchesOut.begin();
+ iter != CurrInner->BranchesOut.end();) {
+ Block *CurrTarget = iter->first;
+ auto Next = iter;
+ Next++;
+ if (!contains(CurrBlocks, CurrTarget)) {
+ NextEntries.insert(CurrTarget);
+ Solipsize(CurrTarget, Branch::Break, Multiple, CurrBlocks);
+ }
+ iter = Next; // increment carefully because Solipsize can remove us
+ }
+ }
+ Multiple->InnerMap[CurrEntry->Id] =
+ Process(CurrBlocks, CurrEntries, nullptr);
+ // If we are not fused, then our entries will actually be checked
+ if (!Fused)
+ CurrEntry->IsCheckedMultipleEntry = true;
+ }
+ // Add entries not handled as next entries, they are deferred
+ for (const auto &Entry : Entries)
+ if (!contains(IndependentGroups, Entry))
+ NextEntries.insert(Entry);
+ // The multiple has been created, we can decide how to implement it
+ if (Multiple->InnerMap.size() >= RelooperMultipleSwitchThreshold) {
+ Multiple->UseSwitch = true;
+ Multiple->Breaks++; // switch captures breaks
+ }
+ return Multiple;
+ }
+
+    // Main function.
+    // Processes a set of blocks with specified entries and returns a shape.
+    // The Make* functions receive a NextEntries set. If they fill it with
+    // data, those are the entries for the ->Next block on them, and the
+    // blocks are what remains in Blocks (which Make* modify). In this way
+    // we avoid recursing on Next (imagine a long chain of Simples; if we
+    // recursed we could blow the stack).
+ Shape *Process(BlockSet &Blocks, BlockSet &InitialEntries, Shape *Prev) {
+ BlockSet *Entries = &InitialEntries;
+ BlockSet TempEntries[2];
+ int CurrTempIndex = 0;
+ BlockSet *NextEntries;
+ Shape *Ret = nullptr;
+
+ auto Make = [&](Shape *Temp) {
+ if (Prev)
+ Prev->Next = Temp;
+ if (!Ret)
+ Ret = Temp;
+ Prev = Temp;
+ Entries = NextEntries;
+ };
+
+ while (1) {
+ CurrTempIndex = 1 - CurrTempIndex;
+ NextEntries = &TempEntries[CurrTempIndex];
+ NextEntries->clear();
+
+ if (Entries->empty())
+ return Ret;
+ if (Entries->size() == 1) {
+ Block *Curr = *(Entries->begin());
+ if (Curr->BranchesIn.empty()) {
+ // One entry, no looping ==> Simple
+ Make(MakeSimple(Blocks, Curr, *NextEntries));
+ if (NextEntries->empty())
+ return Ret;
+ continue;
+ }
+ // One entry, looping ==> Loop
+ Make(MakeLoop(Blocks, *Entries, *NextEntries));
+ if (NextEntries->empty())
+ return Ret;
+ continue;
+ }
+
+        // More than one entry: try to eliminate, through a Multiple, groups
+        // of independent blocks reachable from one or more entries. It is
+        // important to remove through multiples as opposed to looping, since
+        // the former is more performant.
+ BlockBlockSetMap IndependentGroups;
+ FindIndependentGroups(*Entries, IndependentGroups);
+
+ if (!IndependentGroups.empty()) {
+ // We can handle a group in a multiple if its entry cannot be reached
+ // by another group.
+ // Note that it might be reachable by itself - a loop. But that is
+ // fine, we will create a loop inside the multiple block (which
+ // is the performant order to do it).
+ for (auto iter = IndependentGroups.begin();
+ iter != IndependentGroups.end();) {
+ Block *Entry = iter->first;
+ BlockSet &Group = iter->second;
+ auto curr = iter++; // iterate carefully, we may delete
+ for (BlockSet::iterator iterBranch = Entry->BranchesIn.begin();
+ iterBranch != Entry->BranchesIn.end(); iterBranch++) {
+ Block *Origin = *iterBranch;
+ if (!contains(Group, Origin)) {
+ // Reached from outside the group, so we cannot handle this
+ IndependentGroups.erase(curr);
+ break;
+ }
+ }
+ }
+
+ // As an optimization, if we have 2 independent groups, and one is a
+ // small dead end, we can handle only that dead end.
+ // The other then becomes a Next - without nesting in the code and
+ // recursion in the analysis.
+ // TODO: if the larger is the only dead end, handle that too
+ // TODO: handle >2 groups
+          // TODO: handle not just dead ends, but also groups that do not
+          // branch to the NextEntries. However, we must be careful there
+          // since we create a Next, and that Next can prevent eliminating a
+          // break (since we no longer naturally reach the same place), which
+          // may necessitate a one-time loop, which makes the unnesting
+          // pointless.
+ if (IndependentGroups.size() == 2) {
+ // Find the smaller one
+ auto iter = IndependentGroups.begin();
+ Block *SmallEntry = iter->first;
+ auto SmallSize = iter->second.size();
+ iter++;
+ Block *LargeEntry = iter->first;
+ auto LargeSize = iter->second.size();
+ if (SmallSize != LargeSize) { // ignore the case where they are
+ // identical - keep things symmetrical
+ // there
+ if (SmallSize > LargeSize) {
+ Block *Temp = SmallEntry;
+ SmallEntry = LargeEntry;
+ LargeEntry = Temp; // Note: we did not flip the Sizes too, they
+ // are now invalid. TODO: use the smaller
+ // size as a limit?
+ }
+ // Check if dead end
+ bool DeadEnd = true;
+ BlockSet &SmallGroup = IndependentGroups[SmallEntry];
+ for (const auto &Curr : SmallGroup) {
+ for (const auto &iter : Curr->BranchesOut) {
+ Block *Target = iter.first;
+ if (!contains(SmallGroup, Target)) {
+ DeadEnd = false;
+ break;
+ }
+ }
+ if (!DeadEnd)
+ break;
+ }
+ if (DeadEnd)
+ IndependentGroups.erase(LargeEntry);
+ }
+ }
+
+ if (!IndependentGroups.empty())
+ // Some groups removable ==> Multiple
+ Make(MakeMultiple(Blocks, *Entries, IndependentGroups, Prev,
+ *NextEntries));
+ if (NextEntries->empty())
+ return Ret;
+ continue;
+ }
+ // No independent groups, must be loopable ==> Loop
+ Make(MakeLoop(Blocks, *Entries, *NextEntries));
+ if (NextEntries->empty())
+ return Ret;
+ continue;
+ }
+ }
+ };
+
+ // Main
+
+ BlockSet AllBlocks;
+ for (const auto &Curr : Pre.Live) {
+ AllBlocks.insert(Curr);
+ }
+
+ BlockSet Entries;
+ Entries.insert(Entry);
+ Root = Analyzer(this).Process(AllBlocks, Entries, nullptr);
+ assert(Root);
+
+ ///
+ /// Relooper post-optimizer
+ ///
+ struct PostOptimizer {
+ RelooperAlgorithm *Parent;
+ std::stack<Shape *> LoopStack;
+
+ PostOptimizer(RelooperAlgorithm *ParentInit) : Parent(ParentInit) {}
+
+ void ShapeSwitch(Shape* var,
+ std::function<void (SimpleShape*)> simple,
+ std::function<void (MultipleShape*)> multiple,
+ std::function<void (LoopShape*)> loop) {
+ switch (var->getKind()) {
+ case Shape::SK_Simple: {
+ simple(cast<SimpleShape>(var));
+ break;
+ }
+ case Shape::SK_Multiple: {
+ multiple(cast<MultipleShape>(var));
+ break;
+ }
+ case Shape::SK_Loop: {
+ loop(cast<LoopShape>(var));
+ break;
+ }
+ }
+ }
+
+ // Find the blocks that natural control flow can get us directly to, or
+ // through a multiple that we ignore
+    void FollowNaturalFlow(Shape *S, BlockSet &Out) {
+      // Natural may be null on the initial call and when a Next is null.
+      if (!S)
+        return;
+      ShapeSwitch(S, [&](SimpleShape* Simple) {
+ Out.insert(Simple->Inner);
+ }, [&](MultipleShape* Multiple) {
+ for (const auto &iter : Multiple->InnerMap) {
+ FollowNaturalFlow(iter.second, Out);
+ }
+ FollowNaturalFlow(Multiple->Next, Out);
+ }, [&](LoopShape* Loop) {
+ FollowNaturalFlow(Loop->Inner, Out);
+ });
+ }
+
+ void FindNaturals(Shape *Root, Shape *Otherwise = nullptr) {
+ if (Root->Next) {
+ Root->Natural = Root->Next;
+ FindNaturals(Root->Next, Otherwise);
+ } else {
+ Root->Natural = Otherwise;
+ }
+
+ ShapeSwitch(Root, [](SimpleShape* Simple) {
+ }, [&](MultipleShape* Multiple) {
+ for (const auto &iter : Multiple->InnerMap) {
+ FindNaturals(iter.second, Root->Natural);
+ }
+ }, [&](LoopShape* Loop){
+ FindNaturals(Loop->Inner, Loop->Inner);
+ });
+ }
+
+ // Remove unneeded breaks and continues.
+ // A flow operation is trivially unneeded if the shape we naturally get to
+ // by normal code execution is the same as the flow forces us to.
+ void RemoveUnneededFlows(Shape *Root, Shape *Natural = nullptr,
+ LoopShape *LastLoop = nullptr,
+ unsigned Depth = 0) {
+ BlockSet NaturalBlocks;
+ FollowNaturalFlow(Natural, NaturalBlocks);
+ Shape *Next = Root;
+ while (Next) {
+ Root = Next;
+ Next = nullptr;
+ ShapeSwitch(
+ Root,
+ [&](SimpleShape* Simple) {
+ if (Simple->Inner->BranchVar)
+ LastLoop =
+ nullptr; // a switch clears out the loop (TODO: only for
+ // breaks, not continue)
+
+ if (Simple->Next) {
+ if (!Simple->Inner->BranchVar &&
+ Simple->Inner->ProcessedBranchesOut.size() == 2 &&
+ Depth < RelooperNestingLimit) {
+ // If there is a next block, we already know at Simple
+ // creation time to make direct branches, and we can do
+ // nothing more in general. But, we try to optimize the
+ // case of a break and a direct: This would normally be
+ // if (break?) { break; } ..
+ // but if we make sure to nest the else, we can save the
+ // break,
+ // if (!break?) { .. }
+ // This is also better because the more canonical nested
+ // form is easier to further optimize later. The
+ // downside is more nesting, which adds to size in builds with
+ // whitespace.
+ // Note that we avoid switches, as it complicates control flow
+ // and is not relevant for the common case we optimize here.
+ bool Found = false;
+ bool Abort = false;
+ for (const auto &iter : Simple->Inner->ProcessedBranchesOut) {
+ Block *Target = iter.first;
+ Branch *Details = iter.second.get();
+ if (Details->Type == Branch::Break) {
+ Found = true;
+ if (!contains(NaturalBlocks, Target))
+ Abort = true;
+ } else if (Details->Type != Branch::Direct)
+ Abort = true;
+ }
+ if (Found && !Abort) {
+ for (const auto &iter : Simple->Inner->ProcessedBranchesOut) {
+ Branch *Details = iter.second.get();
+ if (Details->Type == Branch::Break) {
+ Details->Type = Branch::Direct;
+ if (MultipleShape *Multiple =
+ dyn_cast<MultipleShape>(Details->Ancestor))
+ Multiple->Breaks--;
+ } else {
+ assert(Details->Type == Branch::Direct);
+ Details->Type = Branch::Nested;
+ }
+ }
+ }
+ Depth++; // this optimization increases depth, for us and all
+ // our next chain (i.e., until this call returns)
+ }
+ Next = Simple->Next;
+ } else {
+ // If there is no next then Natural is where we will
+ // go to by doing nothing, so we can potentially optimize some
+ // branches to direct.
+ for (const auto &iter : Simple->Inner->ProcessedBranchesOut) {
+ Block *Target = iter.first;
+ Branch *Details = iter.second.get();
+ if (Details->Type != Branch::Direct &&
+ contains(NaturalBlocks,
+ Target)) { // note: cannot handle split blocks
+ Details->Type = Branch::Direct;
+ if (MultipleShape *Multiple =
+ dyn_cast<MultipleShape>(Details->Ancestor))
+ Multiple->Breaks--;
+ } else if (Details->Type == Branch::Break && LastLoop &&
+ LastLoop->Natural == Details->Ancestor->Natural) {
+ // it is important to simplify breaks, as simpler breaks
+ // enable other optimizations
+ Details->Labeled = false;
+ if (MultipleShape *Multiple =
+ dyn_cast<MultipleShape>(Details->Ancestor))
+ Multiple->Breaks--;
+ }
+ }
+ }
+ }, [&](MultipleShape* Multiple)
+ {
+ for (const auto &iter : Multiple->InnerMap) {
+ RemoveUnneededFlows(iter.second, Multiple->Next,
+ Multiple->Breaks ? nullptr : LastLoop,
+ Depth + 1);
+ }
+ Next = Multiple->Next;
+ }, [&](LoopShape* Loop)
+ {
+ RemoveUnneededFlows(Loop->Inner, Loop->Inner, Loop, Depth + 1);
+ Next = Loop->Next;
+ });
+ }
+ }
+
+ // After we know which loops exist, we can calculate which need to be
+ // labeled
+ void FindLabeledLoops(Shape *Root) {
+ Shape *Next = Root;
+ while (Next) {
+ Root = Next;
+ Next = nullptr;
+
+ ShapeSwitch(
+ Root,
+ [&](SimpleShape *Simple) {
+            // Root->Next may be null for the last shape in a chain.
+            MultipleShape *Fused =
+                dyn_cast_or_null<MultipleShape>(Root->Next);
+ // If we are fusing a Multiple with a loop into this Simple, then
+ // visit it now
+ if (Fused && Fused->Breaks)
+ LoopStack.push(Fused);
+ if (Simple->Inner->BranchVar)
+ LoopStack.push(nullptr); // a switch means breaks are now useless,
+ // push a dummy
+ if (Fused) {
+ if (Fused->UseSwitch)
+ LoopStack.push(nullptr); // a switch means breaks are now
+ // useless, push a dummy
+ for (const auto &iter : Fused->InnerMap) {
+ FindLabeledLoops(iter.second);
+ }
+ }
+ for (const auto &iter : Simple->Inner->ProcessedBranchesOut) {
+ Branch *Details = iter.second.get();
+ if (Details->Type == Branch::Break ||
+ Details->Type == Branch::Continue) {
+ assert(!LoopStack.empty());
+ if (Details->Ancestor != LoopStack.top() && Details->Labeled) {
+ if (MultipleShape *Multiple =
+ dyn_cast<MultipleShape>(Details->Ancestor)) {
+ Multiple->Labeled = true;
+ } else {
+ LoopShape *Loop = cast<LoopShape>(Details->Ancestor);
+ Loop->Labeled = true;
+ }
+ } else {
+ Details->Labeled = false;
+ }
+ }
+ if (Fused && Fused->UseSwitch)
+ LoopStack.pop();
+ if (Simple->Inner->BranchVar)
+ LoopStack.pop();
+ if (Fused && Fused->Breaks)
+ LoopStack.pop();
+ if (Fused)
+ Next = Fused->Next;
+ else
+ Next = Root->Next;
+ }
+ }
+ , [&](MultipleShape* Multiple) {
+ if (Multiple->Breaks)
+ LoopStack.push(Multiple);
+ for (const auto &iter : Multiple->InnerMap)
+ FindLabeledLoops(iter.second);
+ if (Multiple->Breaks)
+ LoopStack.pop();
+ Next = Root->Next;
+ }
+ , [&](LoopShape* Loop) {
+ LoopStack.push(Loop);
+ FindLabeledLoops(Loop->Inner);
+ LoopStack.pop();
+ Next = Root->Next;
+ });
+ }
+ }
+
+ void Process(Shape * Root) {
+ FindNaturals(Root);
+ RemoveUnneededFlows(Root);
+ FindLabeledLoops(Root);
+ }
+ };
+
+ PostOptimizer(this).Process(Root);
+}
diff --git a/lib/Target/WebAssembly/Relooper.h b/lib/Target/WebAssembly/Relooper.h
new file mode 100644
index 000000000000..7c564de82f34
--- /dev/null
+++ b/lib/Target/WebAssembly/Relooper.h
@@ -0,0 +1,186 @@
+//===-- Relooper.h - Top-level interface for WebAssembly ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===-------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This defines an optimized C++ implementation of the Relooper
+/// algorithm, originally developed as part of Emscripten, which
+/// generates a structured AST from arbitrary control flow.
+///
+//===-------------------------------------------------------------------===//
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Support/Casting.h"
+
+#include <cassert>
+#include <cstdarg>
+#include <cstdio>
+#include <deque>
+#include <list>
+#include <map>
+#include <memory>
+#include <set>
+
+namespace llvm {
+
+namespace Relooper {
+
+struct Block;
+struct Shape;
+
+///
+/// Info about a branching from one block to another
+///
+struct Branch {
+ enum FlowType {
+ Direct = 0, // We will directly reach the right location through other
+ // means, no need for continue or break
+ Break = 1,
+ Continue = 2,
+    Nested = 3 // This code is directly reached, but we must be careful to
+               // ensure it is nested in an if - it is not reached
+               // unconditionally; other code paths exist alongside it, and we
+               // need to make sure they do not intertwine
+ };
+ Shape
+ *Ancestor; // If not nullptr, this shape is the relevant one for purposes
+ // of getting to the target block. We break or continue on it
+ Branch::FlowType
+ Type; // If Ancestor is not nullptr, this says whether to break or
+ // continue
+ bool Labeled; // If a break or continue, whether we need to use a label
+ const char *Condition; // The condition for which we branch. For example,
+ // "my_var == 1". Conditions are checked one by one.
+ // One of the conditions should have nullptr as the
+ // condition, in which case it is the default
+ // FIXME: move from char* to LLVM data structures
+ const char *Code; // If provided, code that is run right before the branch is
+ // taken. This is useful for phis
+ // FIXME: move from char* to LLVM data structures
+
+ Branch(const char *ConditionInit, const char *CodeInit = nullptr);
+ ~Branch();
+};
+
+typedef SetVector<Block *> BlockSet;
+typedef MapVector<Block *, Branch *> BlockBranchMap;
+typedef MapVector<Block *, std::unique_ptr<Branch>> OwningBlockBranchMap;
+
+///
+/// Represents a basic block of code - some instructions that end with a
+/// control flow modifier (a branch, return or throw).
+///
+struct Block {
+ // Branches become processed after we finish the shape relevant to them. For
+ // example, when we recreate a loop, branches to the loop start become
+ // continues and are now processed. When we calculate what shape to generate
+ // from a set of blocks, we ignore processed branches. Blocks own the Branch
+ // objects they use, and destroy them when done.
+ OwningBlockBranchMap BranchesOut;
+ BlockSet BranchesIn;
+ OwningBlockBranchMap ProcessedBranchesOut;
+ BlockSet ProcessedBranchesIn;
+ Shape *Parent; // The shape we are directly inside
+ int Id; // A unique identifier, defined when added to relooper. Note that this
+ // uniquely identifies a *logical* block - if we split it, the two
+ // instances have the same content *and* the same Id
+ const char *Code; // The string representation of the code in this block.
+ // Owning pointer (we copy the input)
+ // FIXME: move from char* to LLVM data structures
+ const char *BranchVar; // A variable whose value determines where we go; if
+ // this is not nullptr, emit a switch on that variable
+ // FIXME: move from char* to LLVM data structures
+ bool IsCheckedMultipleEntry; // If true, we are a multiple entry, so reaching
+ // us requires setting the label variable
+
+ Block(const char *CodeInit, const char *BranchVarInit);
+ ~Block();
+
+ void AddBranchTo(Block *Target, const char *Condition,
+ const char *Code = nullptr);
+};
+
+///
+/// Represents a structured control flow shape
+///
+struct Shape {
+ int Id; // A unique identifier. Used to identify loops, labels are Lx where x
+ // is the Id. Defined when added to relooper
+ Shape *Next; // The shape that will appear in the code right after this one
+ Shape *Natural; // The shape that control flow gets to naturally (if there is
+ // Next, then this is Next)
+
+ /// Discriminator for LLVM-style RTTI (dyn_cast<> et al.)
+ enum ShapeKind { SK_Simple, SK_Multiple, SK_Loop };
+
+private:
+ ShapeKind Kind;
+
+public:
+ ShapeKind getKind() const { return Kind; }
+
+ Shape(ShapeKind KindInit) : Id(-1), Next(nullptr), Kind(KindInit) {}
+};
+
+///
+/// Simple: No control flow at all, just instructions.
+///
+struct SimpleShape : public Shape {
+ Block *Inner;
+
+ SimpleShape() : Shape(SK_Simple), Inner(nullptr) {}
+
+ static bool classof(const Shape *S) { return S->getKind() == SK_Simple; }
+};
+
+///
+/// A shape that may be implemented with a labeled loop.
+///
+struct LabeledShape : public Shape {
+ bool Labeled; // If we have a loop, whether it needs to be labeled
+
+ LabeledShape(ShapeKind KindInit) : Shape(KindInit), Labeled(false) {}
+};
+
+// Blocks with the same id were split and are identical, so we just care about
+// ids in Multiple entries
+typedef std::map<int, Shape *> IdShapeMap;
+
+///
+/// Multiple: A shape with more than one entry. If the next block to
+/// be entered is among them, we run it and continue to
+/// the next shape, otherwise we continue immediately to the
+/// next shape.
+///
+struct MultipleShape : public LabeledShape {
+ IdShapeMap InnerMap; // entry block ID -> shape
+  int Breaks; // If we have branches on us, we need a loop (or a switch). This
+              // is a counter of requirements; if we optimize it to 0, the
+              // loop is unneeded
+ bool UseSwitch; // Whether to switch on label as opposed to an if-else chain
+
+ MultipleShape() : LabeledShape(SK_Multiple), Breaks(0), UseSwitch(false) {}
+
+ static bool classof(const Shape *S) { return S->getKind() == SK_Multiple; }
+};
+
+///
+/// Loop: An infinite loop.
+///
+struct LoopShape : public LabeledShape {
+ Shape *Inner;
+
+ LoopShape() : LabeledShape(SK_Loop), Inner(nullptr) {}
+
+ static bool classof(const Shape *S) { return S->getKind() == SK_Loop; }
+};
+
+} // namespace Relooper
+
+} // namespace llvm
diff --git a/lib/Target/WebAssembly/WebAssembly.h b/lib/Target/WebAssembly/WebAssembly.h
index 3ff19d46f437..e972da5af74f 100644
--- a/lib/Target/WebAssembly/WebAssembly.h
+++ b/lib/Target/WebAssembly/WebAssembly.h
@@ -23,8 +23,22 @@ namespace llvm {
class WebAssemblyTargetMachine;
class FunctionPass;
+FunctionPass *createWebAssemblyOptimizeReturned();
+
FunctionPass *createWebAssemblyISelDag(WebAssemblyTargetMachine &TM,
CodeGenOpt::Level OptLevel);
+FunctionPass *createWebAssemblyArgumentMove();
+
+FunctionPass *createWebAssemblyStoreResults();
+FunctionPass *createWebAssemblyRegStackify();
+FunctionPass *createWebAssemblyRegColoring();
+FunctionPass *createWebAssemblyPEI();
+FunctionPass *createWebAssemblyCFGStackify();
+FunctionPass *createWebAssemblyLowerBrUnless();
+FunctionPass *createWebAssemblyRegNumbering();
+FunctionPass *createWebAssemblyPeephole();
+
+FunctionPass *createWebAssemblyRelooper();
} // end namespace llvm
diff --git a/lib/Target/WebAssembly/WebAssembly.td b/lib/Target/WebAssembly/WebAssembly.td
index a123bf6f66b6..551ad9345154 100644
--- a/lib/Target/WebAssembly/WebAssembly.td
+++ b/lib/Target/WebAssembly/WebAssembly.td
@@ -6,10 +6,11 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// This is a target description file for the WebAssembly architecture, which is
-// also known as "wasm".
-//
+///
+/// \file
+/// \brief This is a target description file for the WebAssembly architecture,
+/// which is also known as "wasm".
+///
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
@@ -50,6 +51,9 @@ def WebAssemblyInstrInfo : InstrInfo;
// Minimal Viable Product.
def : ProcessorModel<"mvp", NoSchedModel, []>;
+// Generic processor: latest stable version.
+def : ProcessorModel<"generic", NoSchedModel, []>;
+
// Latest and greatest experimental version of WebAssembly. Bugs included!
def : ProcessorModel<"bleeding-edge", NoSchedModel, [FeatureSIMD128]>;
diff --git a/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp b/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
new file mode 100644
index 000000000000..3893c408cf63
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
@@ -0,0 +1,110 @@
+//===-- WebAssemblyArgumentMove.cpp - Argument instruction moving ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file moves ARGUMENT instructions after ScheduleDAG scheduling.
+///
+/// Arguments are really live-in registers, however, since we use virtual
+/// registers and LLVM doesn't support live-in virtual registers, we're
+/// currently making do with ARGUMENT instructions which are placed at the top
+/// of the entry block. The trick is to get them to *stay* at the top of the
+/// entry block.
+///
+/// The ARGUMENTS physical register keeps these instructions pinned in place
+/// during liveness-aware CodeGen passes, however one thing which does not
+/// respect this is the ScheduleDAG scheduler. This pass is therefore run
+/// immediately after that.
+///
+/// This is all hopefully a temporary measure until we find a better way to
+/// describe the live-in nature of arguments.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-argument-move"
+
+namespace {
+class WebAssemblyArgumentMove final : public MachineFunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyArgumentMove() : MachineFunctionPass(ID) {}
+
+ const char *getPassName() const override {
+ return "WebAssembly Argument Move";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreserved<MachineBlockFrequencyInfo>();
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+char WebAssemblyArgumentMove::ID = 0;
+FunctionPass *llvm::createWebAssemblyArgumentMove() {
+ return new WebAssemblyArgumentMove();
+}
+
+/// Test whether the given instruction is an ARGUMENT.
+static bool IsArgument(const MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ case WebAssembly::ARGUMENT_I32:
+ case WebAssembly::ARGUMENT_I64:
+ case WebAssembly::ARGUMENT_F32:
+ case WebAssembly::ARGUMENT_F64:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool WebAssemblyArgumentMove::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG({
+ dbgs() << "********** Argument Move **********\n"
+ << "********** Function: " << MF.getName() << '\n';
+ });
+
+ bool Changed = false;
+ MachineBasicBlock &EntryMBB = MF.front();
+ MachineBasicBlock::iterator InsertPt = EntryMBB.end();
+
+ // Look for the first NonArg instruction.
+ for (auto MII = EntryMBB.begin(), MIE = EntryMBB.end(); MII != MIE; ++MII) {
+ MachineInstr *MI = MII;
+ if (!IsArgument(MI)) {
+ InsertPt = MII;
+ break;
+ }
+ }
+
+ // Now move any argument instructions later in the block
+ // to before our first NonArg instruction.
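+  // For example (illustrative only): if scheduling produced
+  //   %vreg1 = CONST_I32 0
+  //   %vreg0 = ARGUMENT_I32 0
+  // the ARGUMENT is moved back above the CONST so that all ARGUMENT
+  // instructions stay contiguous at the top of the entry block.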
+ for (auto I = InsertPt, E = EntryMBB.end(); I != E; ++I) {
+ MachineInstr *MI = I;
+ if (IsArgument(MI)) {
+ EntryMBB.insert(InsertPt, MI->removeFromParent());
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
new file mode 100644
index 000000000000..0d2b4d9debb9
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -0,0 +1,285 @@
+//===-- WebAssemblyAsmPrinter.cpp - WebAssembly LLVM assembly writer ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains a printer that converts from our internal
+/// representation of machine-dependent LLVM code to the WebAssembly assembly
+/// language.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "InstPrinter/WebAssemblyInstPrinter.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMCInstLower.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblyRegisterInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+namespace {
+
+class WebAssemblyAsmPrinter final : public AsmPrinter {
+ const MachineRegisterInfo *MRI;
+ const WebAssemblyFunctionInfo *MFI;
+
+public:
+ WebAssemblyAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)), MRI(nullptr), MFI(nullptr) {}
+
+private:
+ const char *getPassName() const override {
+ return "WebAssembly Assembly Printer";
+ }
+
+ //===------------------------------------------------------------------===//
+ // MachineFunctionPass Implementation.
+ //===------------------------------------------------------------------===//
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ MRI = &MF.getRegInfo();
+ MFI = MF.getInfo<WebAssemblyFunctionInfo>();
+ return AsmPrinter::runOnMachineFunction(MF);
+ }
+
+ //===------------------------------------------------------------------===//
+ // AsmPrinter Implementation.
+ //===------------------------------------------------------------------===//
+
+ void EmitJumpTableInfo() override;
+ void EmitConstantPool() override;
+ void EmitFunctionBodyStart() override;
+ void EmitInstruction(const MachineInstr *MI) override;
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) override;
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) override;
+
+ MVT getRegType(unsigned RegNo) const;
+ const char *toString(MVT VT) const;
+ std::string regToString(const MachineOperand &MO);
+};
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// Helpers.
+//===----------------------------------------------------------------------===//
+
+MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const {
+ const TargetRegisterClass *TRC =
+ TargetRegisterInfo::isVirtualRegister(RegNo) ?
+ MRI->getRegClass(RegNo) :
+ MRI->getTargetRegisterInfo()->getMinimalPhysRegClass(RegNo);
+ for (MVT T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64})
+ if (TRC->hasType(T))
+ return T;
+ DEBUG(errs() << "Unknown type for register number: " << RegNo);
+ llvm_unreachable("Unknown register type");
+ return MVT::Other;
+}
+
+std::string WebAssemblyAsmPrinter::regToString(const MachineOperand &MO) {
+ unsigned RegNo = MO.getReg();
+ assert(TargetRegisterInfo::isVirtualRegister(RegNo) &&
+ "Unlowered physical register encountered during assembly printing");
+ assert(!MFI->isVRegStackified(RegNo));
+ unsigned WAReg = MFI->getWAReg(RegNo);
+ assert(WAReg != WebAssemblyFunctionInfo::UnusedReg);
+ return '$' + utostr(WAReg);
+}
+
+const char *WebAssemblyAsmPrinter::toString(MVT VT) const {
+ return WebAssembly::TypeToString(VT);
+}
+
+//===----------------------------------------------------------------------===//
+// WebAssemblyAsmPrinter Implementation.
+//===----------------------------------------------------------------------===//
+
+void WebAssemblyAsmPrinter::EmitConstantPool() {
+ assert(MF->getConstantPool()->getConstants().empty() &&
+ "WebAssembly disables constant pools");
+}
+
+void WebAssemblyAsmPrinter::EmitJumpTableInfo() {
+ // Nothing to do; jump tables are incorporated into the instruction stream.
+}
+
+static void ComputeLegalValueVTs(const Function &F, const TargetMachine &TM,
+ Type *Ty, SmallVectorImpl<MVT> &ValueVTs) {
+ const DataLayout &DL(F.getParent()->getDataLayout());
+ const WebAssemblyTargetLowering &TLI =
+ *TM.getSubtarget<WebAssemblySubtarget>(F).getTargetLowering();
+ SmallVector<EVT, 4> VTs;
+ ComputeValueVTs(TLI, DL, Ty, VTs);
+
+ for (EVT VT : VTs) {
+ unsigned NumRegs = TLI.getNumRegisters(F.getContext(), VT);
+ MVT RegisterVT = TLI.getRegisterType(F.getContext(), VT);
+ for (unsigned i = 0; i != NumRegs; ++i)
+ ValueVTs.push_back(RegisterVT);
+ }
+}
+
+void WebAssemblyAsmPrinter::EmitFunctionBodyStart() {
+ if (!MFI->getParams().empty()) {
+ MCInst Param;
+ Param.setOpcode(WebAssembly::PARAM);
+ for (MVT VT : MFI->getParams())
+ Param.addOperand(MCOperand::createImm(VT.SimpleTy));
+ EmitToStreamer(*OutStreamer, Param);
+ }
+
+ SmallVector<MVT, 4> ResultVTs;
+ const Function &F(*MF->getFunction());
+ ComputeLegalValueVTs(F, TM, F.getReturnType(), ResultVTs);
+  // If the return type needs to be legalized, it will get converted into
+ // passing a pointer.
+ if (ResultVTs.size() == 1) {
+ MCInst Result;
+ Result.setOpcode(WebAssembly::RESULT);
+ Result.addOperand(MCOperand::createImm(ResultVTs.front().SimpleTy));
+ EmitToStreamer(*OutStreamer, Result);
+ }
+
+ bool AnyWARegs = false;
+ MCInst Local;
+ Local.setOpcode(WebAssembly::LOCAL);
+ for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) {
+ unsigned VReg = TargetRegisterInfo::index2VirtReg(Idx);
+ unsigned WAReg = MFI->getWAReg(VReg);
+ // Don't declare unused registers.
+ if (WAReg == WebAssemblyFunctionInfo::UnusedReg)
+ continue;
+ // Don't redeclare parameters.
+ if (WAReg < MFI->getParams().size())
+ continue;
+ // Don't declare stackified registers.
+ if (int(WAReg) < 0)
+ continue;
+ Local.addOperand(MCOperand::createImm(getRegType(VReg).SimpleTy));
+ AnyWARegs = true;
+ }
+ auto &PhysRegs = MFI->getPhysRegs();
+ for (unsigned PReg = 0; PReg < PhysRegs.size(); ++PReg) {
+ if (PhysRegs[PReg] == -1U)
+ continue;
+ Local.addOperand(MCOperand::createImm(getRegType(PReg).SimpleTy));
+ AnyWARegs = true;
+ }
+ if (AnyWARegs)
+ EmitToStreamer(*OutStreamer, Local);
+
+ AsmPrinter::EmitFunctionBodyStart();
+}
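A standalone sketch, not part of the imported sources, of the filtering that EmitFunctionBodyStart applies when declaring locals above: a local is declared only for a register that is used, is not a parameter, and is not stackified. The constant and the needsLocalDecl helper are hypothetical names mirroring WebAssemblyFunctionInfo's encoding as used in the loop above.

#include <cstdint>

// Hypothetical encoding mirroring the checks above: ~0u means unused, values
// below NumParams are parameters, and "negative" values mean stackified.
constexpr uint32_t UnusedReg = ~0u;

static bool needsLocalDecl(uint32_t WAReg, uint32_t NumParams) {
  if (WAReg == UnusedReg)   // Don't declare unused registers.
    return false;
  if (WAReg < NumParams)    // Don't redeclare parameters.
    return false;
  if (int32_t(WAReg) < 0)   // Don't declare stackified registers.
    return false;
  return true;
}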
+
+void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ DEBUG(dbgs() << "EmitInstruction: " << *MI << '\n');
+
+ switch (MI->getOpcode()) {
+ case WebAssembly::ARGUMENT_I32:
+ case WebAssembly::ARGUMENT_I64:
+ case WebAssembly::ARGUMENT_F32:
+ case WebAssembly::ARGUMENT_F64:
+ // These represent values which are live into the function entry, so there's
+ // no instruction to emit.
+ break;
+ case WebAssembly::LOOP_END:
+    // This is a no-op; it exists only to tell AsmPrinter.cpp that there's a
+    // fallthrough which nevertheless requires a label for its destination here.
+ break;
+ default: {
+ WebAssemblyMCInstLower MCInstLowering(OutContext, *this);
+ MCInst TmpInst;
+ MCInstLowering.Lower(MI, TmpInst);
+ EmitToStreamer(*OutStreamer, TmpInst);
+ break;
+ }
+ }
+}
+
+bool WebAssemblyAsmPrinter::PrintAsmOperand(const MachineInstr *MI,
+ unsigned OpNo, unsigned AsmVariant,
+ const char *ExtraCode,
+ raw_ostream &OS) {
+ if (AsmVariant != 0)
+ report_fatal_error("There are no defined alternate asm variants");
+
+ // First try the generic code, which knows about modifiers like 'c' and 'n'.
+ if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, OS))
+ return false;
+
+ if (!ExtraCode) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ switch (MO.getType()) {
+ case MachineOperand::MO_Immediate:
+ OS << MO.getImm();
+ return false;
+ case MachineOperand::MO_Register:
+ OS << regToString(MO);
+ return false;
+ case MachineOperand::MO_GlobalAddress:
+ getSymbol(MO.getGlobal())->print(OS, MAI);
+ printOffset(MO.getOffset(), OS);
+ return false;
+ case MachineOperand::MO_ExternalSymbol:
+ GetExternalSymbolSymbol(MO.getSymbolName())->print(OS, MAI);
+ printOffset(MO.getOffset(), OS);
+ return false;
+ case MachineOperand::MO_MachineBasicBlock:
+ MO.getMBB()->getSymbol()->print(OS, MAI);
+ return false;
+ default:
+ break;
+ }
+ }
+
+ return true;
+}
+
+bool WebAssemblyAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode,
+ raw_ostream &OS) {
+ if (AsmVariant != 0)
+ report_fatal_error("There are no defined alternate asm variants");
+
+ if (!ExtraCode) {
+ // TODO: For now, we just hard-code 0 as the constant offset; teach
+ // SelectInlineAsmMemoryOperand how to do address mode matching.
+ OS << "0(" + regToString(MI->getOperand(OpNo)) + ')';
+ return false;
+ }
+
+ return AsmPrinter::PrintAsmMemoryOperand(MI, OpNo, AsmVariant, ExtraCode, OS);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeWebAssemblyAsmPrinter() {
+ RegisterAsmPrinter<WebAssemblyAsmPrinter> X(TheWebAssemblyTarget32);
+ RegisterAsmPrinter<WebAssemblyAsmPrinter> Y(TheWebAssemblyTarget64);
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
new file mode 100644
index 000000000000..e9671ee07e69
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -0,0 +1,468 @@
+//===-- WebAssemblyCFGStackify.cpp - CFG Stackification -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a CFG stacking pass.
+///
+/// This pass reorders the blocks in a function to put them into a reverse
+/// post-order [0], with special care to keep the order as similar as possible
+/// to the original order, and to keep loops contiguous even in the case of
+/// split backedges.
+///
+/// Then, it inserts BLOCK and LOOP markers to mark the start of scopes, since
+/// scope boundaries serve as the labels for WebAssembly's control transfers.
+///
+/// This is sufficient to convert arbitrary CFGs into a form that works on
+/// WebAssembly, provided that all loops are single-entry.
+///
+/// [0] https://en.wikipedia.org/wiki/Depth-first_search#Vertex_orderings
+///
+//===----------------------------------------------------------------------===//
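A standalone sketch of the reverse post-order that the pass below sorts toward, not part of the imported sources. The real pass walks MachineBasicBlocks with extra tie-breaking to keep loops contiguous; this just shows the basic ordering on a small adjacency-list graph with hypothetical block numbers.

#include <algorithm>
#include <vector>

// Post-order DFS over a small CFG given as adjacency lists; reversing the
// result yields a reverse post-order.
static void postOrder(const std::vector<std::vector<int>> &Succs, int BB,
                      std::vector<bool> &Seen, std::vector<int> &Out) {
  Seen[BB] = true;
  for (int S : Succs[BB])
    if (!Seen[S])
      postOrder(Succs, S, Seen, Out);
  Out.push_back(BB);
}

int main() {
  // A diamond: 0 -> {1, 2}, 1 -> {3}, 2 -> {3}, 3 -> {}.
  std::vector<std::vector<int>> Succs = {{1, 2}, {3}, {3}, {}};
  std::vector<bool> Seen(Succs.size(), false);
  std::vector<int> RPO;
  postOrder(Succs, 0, Seen, RPO);
  std::reverse(RPO.begin(), RPO.end()); // e.g. 0, 2, 1, 3
  return 0;
}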
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-cfg-stackify"
+
+namespace {
+class WebAssemblyCFGStackify final : public MachineFunctionPass {
+ const char *getPassName() const override {
+ return "WebAssembly CFG Stackify";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyCFGStackify() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyCFGStackify::ID = 0;
+FunctionPass *llvm::createWebAssemblyCFGStackify() {
+ return new WebAssemblyCFGStackify();
+}
+
+static void EliminateMultipleEntryLoops(MachineFunction &MF,
+ const MachineLoopInfo &MLI) {
+ SmallPtrSet<MachineBasicBlock *, 8> InSet;
+ for (scc_iterator<MachineFunction *> I = scc_begin(&MF), E = scc_end(&MF);
+ I != E; ++I) {
+ const std::vector<MachineBasicBlock *> &CurrentSCC = *I;
+
+ // Skip trivial SCCs.
+ if (CurrentSCC.size() == 1)
+ continue;
+
+ InSet.insert(CurrentSCC.begin(), CurrentSCC.end());
+ MachineBasicBlock *Header = nullptr;
+ for (MachineBasicBlock *MBB : CurrentSCC) {
+ for (MachineBasicBlock *Pred : MBB->predecessors()) {
+ if (InSet.count(Pred))
+ continue;
+ if (!Header) {
+ Header = MBB;
+ break;
+ }
+ // TODO: Implement multiple-entry loops.
+ report_fatal_error("multiple-entry loops are not supported yet");
+ }
+ }
+ assert(MLI.isLoopHeader(Header));
+
+ InSet.clear();
+ }
+}
+
+namespace {
+/// Post-order traversal stack entry.
+struct POStackEntry {
+ MachineBasicBlock *MBB;
+ SmallVector<MachineBasicBlock *, 0> Succs;
+
+ POStackEntry(MachineBasicBlock *MBB, MachineFunction &MF,
+ const MachineLoopInfo &MLI);
+};
+} // end anonymous namespace
+
+static bool LoopContains(const MachineLoop *Loop,
+ const MachineBasicBlock *MBB) {
+ return Loop ? Loop->contains(MBB) : true;
+}
+
+POStackEntry::POStackEntry(MachineBasicBlock *MBB, MachineFunction &MF,
+ const MachineLoopInfo &MLI)
+ : MBB(MBB), Succs(MBB->successors()) {
+ // RPO is not a unique form, since at every basic block with multiple
+ // successors, the DFS has to pick which order to visit the successors in.
+ // Sort them strategically (see below).
+ MachineLoop *Loop = MLI.getLoopFor(MBB);
+ MachineFunction::iterator Next = next(MachineFunction::iterator(MBB));
+ MachineBasicBlock *LayoutSucc = Next == MF.end() ? nullptr : &*Next;
+ std::stable_sort(
+ Succs.begin(), Succs.end(),
+ [=, &MLI](const MachineBasicBlock *A, const MachineBasicBlock *B) {
+ if (A == B)
+ return false;
+
+ // Keep loops contiguous by preferring the block that's in the same
+ // loop.
+ bool LoopContainsA = LoopContains(Loop, A);
+ bool LoopContainsB = LoopContains(Loop, B);
+ if (LoopContainsA && !LoopContainsB)
+ return true;
+ if (!LoopContainsA && LoopContainsB)
+ return false;
+
+ // Minimize perturbation by preferring the block which is the immediate
+ // layout successor.
+ if (A == LayoutSucc)
+ return true;
+ if (B == LayoutSucc)
+ return false;
+
+ // TODO: More sophisticated orderings may be profitable here.
+
+ return false;
+ });
+}
+
+/// Return the "bottom" block of a loop. This differs from
+/// MachineLoop::getBottomBlock in that it works even if the loop is
+/// discontiguous.
+static MachineBasicBlock *LoopBottom(const MachineLoop *Loop) {
+ MachineBasicBlock *Bottom = Loop->getHeader();
+ for (MachineBasicBlock *MBB : Loop->blocks())
+ if (MBB->getNumber() > Bottom->getNumber())
+ Bottom = MBB;
+ return Bottom;
+}
+
+/// Sort the blocks in RPO, taking special care to make sure that loops are
+/// contiguous even in the case of split backedges.
+///
+/// TODO: Determine whether RPO is actually worthwhile, or whether we should
+/// move to just a stable-topological-sort-based approach that would preserve
+/// more of the original order.
+static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI) {
+ // Note that we do our own RPO rather than using
+ // "llvm/ADT/PostOrderIterator.h" because we want control over the order that
+ // successors are visited in (see above). Also, we can sort the blocks in the
+ // MachineFunction as we go.
+ SmallPtrSet<MachineBasicBlock *, 16> Visited;
+ SmallVector<POStackEntry, 16> Stack;
+
+ MachineBasicBlock *EntryBlock = &*MF.begin();
+ Visited.insert(EntryBlock);
+ Stack.push_back(POStackEntry(EntryBlock, MF, MLI));
+
+ for (;;) {
+ POStackEntry &Entry = Stack.back();
+ SmallVectorImpl<MachineBasicBlock *> &Succs = Entry.Succs;
+ if (!Succs.empty()) {
+ MachineBasicBlock *Succ = Succs.pop_back_val();
+ if (Visited.insert(Succ).second)
+ Stack.push_back(POStackEntry(Succ, MF, MLI));
+ continue;
+ }
+
+ // Put the block in its position in the MachineFunction.
+ MachineBasicBlock &MBB = *Entry.MBB;
+ MBB.moveBefore(&*MF.begin());
+
+ // Branch instructions may utilize a fallthrough, so update them if a
+ // fallthrough has been added or removed.
+ if (!MBB.empty() && MBB.back().isTerminator() && !MBB.back().isBranch() &&
+ !MBB.back().isBarrier())
+ report_fatal_error(
+ "Non-branch terminator with fallthrough cannot yet be rewritten");
+ if (MBB.empty() || !MBB.back().isTerminator() || MBB.back().isBranch())
+ MBB.updateTerminator();
+
+ Stack.pop_back();
+ if (Stack.empty())
+ break;
+ }
+
+ // Now that we've sorted the blocks in RPO, renumber them.
+ MF.RenumberBlocks();
+
+#ifndef NDEBUG
+ SmallSetVector<MachineLoop *, 8> OnStack;
+
+ // Insert a sentinel representing the degenerate loop that starts at the
+ // function entry block and includes the entire function as a "loop" that
+ // executes once.
+ OnStack.insert(nullptr);
+
+ for (auto &MBB : MF) {
+ assert(MBB.getNumber() >= 0 && "Renumbered blocks should be non-negative.");
+
+ MachineLoop *Loop = MLI.getLoopFor(&MBB);
+ if (Loop && &MBB == Loop->getHeader()) {
+ // Loop header. The loop predecessor should be sorted above, and the other
+ // predecessors should be backedges below.
+ for (auto Pred : MBB.predecessors())
+ assert(
+ (Pred->getNumber() < MBB.getNumber() || Loop->contains(Pred)) &&
+ "Loop header predecessors must be loop predecessors or backedges");
+ assert(OnStack.insert(Loop) && "Loops should be declared at most once.");
+ } else {
+ // Not a loop header. All predecessors should be sorted above.
+ for (auto Pred : MBB.predecessors())
+ assert(Pred->getNumber() < MBB.getNumber() &&
+ "Non-loop-header predecessors should be topologically sorted");
+ assert(OnStack.count(MLI.getLoopFor(&MBB)) &&
+ "Blocks must be nested in their loops");
+ }
+ while (OnStack.size() > 1 && &MBB == LoopBottom(OnStack.back()))
+ OnStack.pop_back();
+ }
+ assert(OnStack.pop_back_val() == nullptr &&
+ "The function entry block shouldn't actually be a loop header");
+ assert(OnStack.empty() &&
+ "Control flow stack pushes and pops should be balanced.");
+#endif
+}
+
+/// Test whether Pred has any terminators explicitly branching to MBB, as
+/// opposed to falling through. Note that it's possible (e.g. in unoptimized
+/// code) for a branch instruction to both branch to a block and fallthrough
+/// to it, so we check the actual branch operands to see if there are any
+/// explicit mentions.
+static bool ExplicitlyBranchesTo(MachineBasicBlock *Pred, MachineBasicBlock *MBB) {
+ for (MachineInstr &MI : Pred->terminators())
+ for (MachineOperand &MO : MI.explicit_operands())
+ if (MO.isMBB() && MO.getMBB() == MBB)
+ return true;
+ return false;
+}
+
+/// Insert a BLOCK marker for branches to MBB (if needed).
+static void PlaceBlockMarker(MachineBasicBlock &MBB, MachineFunction &MF,
+ SmallVectorImpl<MachineBasicBlock *> &ScopeTops,
+ const WebAssemblyInstrInfo &TII,
+ const MachineLoopInfo &MLI,
+ MachineDominatorTree &MDT) {
+ // First compute the nearest common dominator of all forward non-fallthrough
+ // predecessors so that we minimize the time that the BLOCK is on the stack,
+ // which reduces overall stack height.
+ MachineBasicBlock *Header = nullptr;
+ bool IsBranchedTo = false;
+ int MBBNumber = MBB.getNumber();
+ for (MachineBasicBlock *Pred : MBB.predecessors())
+ if (Pred->getNumber() < MBBNumber) {
+ Header = Header ? MDT.findNearestCommonDominator(Header, Pred) : Pred;
+ if (ExplicitlyBranchesTo(Pred, &MBB))
+ IsBranchedTo = true;
+ }
+ if (!Header)
+ return;
+ if (!IsBranchedTo)
+ return;
+
+ assert(&MBB != &MF.front() && "Header blocks shouldn't have predecessors");
+ MachineBasicBlock *LayoutPred = &*prev(MachineFunction::iterator(&MBB));
+
+ // If the nearest common dominator is inside a more deeply nested context,
+ // walk out to the nearest scope which isn't more deeply nested.
+ for (MachineFunction::iterator I(LayoutPred), E(Header); I != E; --I) {
+ if (MachineBasicBlock *ScopeTop = ScopeTops[I->getNumber()]) {
+ if (ScopeTop->getNumber() > Header->getNumber()) {
+ // Skip over an intervening scope.
+ I = next(MachineFunction::iterator(ScopeTop));
+ } else {
+ // We found a scope level at an appropriate depth.
+ Header = ScopeTop;
+ break;
+ }
+ }
+ }
+
+  // If there's a loop which contains Header and ends just before MBB, we can
+  // reuse its label instead of inserting a new BLOCK.
+ for (MachineLoop *Loop = MLI.getLoopFor(LayoutPred);
+ Loop && Loop->contains(LayoutPred); Loop = Loop->getParentLoop())
+ if (Loop && LoopBottom(Loop) == LayoutPred && Loop->contains(Header))
+ return;
+
+ // Decide where in Header to put the BLOCK.
+ MachineBasicBlock::iterator InsertPos;
+ MachineLoop *HeaderLoop = MLI.getLoopFor(Header);
+ if (HeaderLoop && MBB.getNumber() > LoopBottom(HeaderLoop)->getNumber()) {
+ // Header is the header of a loop that does not lexically contain MBB, so
+ // the BLOCK needs to be above the LOOP.
+ InsertPos = Header->begin();
+ } else {
+ // Otherwise, insert the BLOCK as late in Header as we can, but before the
+ // beginning of the local expression tree and any nested BLOCKs.
+ InsertPos = Header->getFirstTerminator();
+ while (InsertPos != Header->begin() &&
+ prev(InsertPos)->definesRegister(WebAssembly::EXPR_STACK) &&
+ prev(InsertPos)->getOpcode() != WebAssembly::LOOP)
+ --InsertPos;
+ }
+
+ // Add the BLOCK.
+ BuildMI(*Header, InsertPos, DebugLoc(), TII.get(WebAssembly::BLOCK))
+ .addMBB(&MBB);
+
+ // Track the farthest-spanning scope that ends at this point.
+ int Number = MBB.getNumber();
+ if (!ScopeTops[Number] ||
+ ScopeTops[Number]->getNumber() > Header->getNumber())
+ ScopeTops[Number] = Header;
+}
+
+/// Insert a LOOP marker for a loop starting at MBB (if it's a loop header).
+static void PlaceLoopMarker(MachineBasicBlock &MBB, MachineFunction &MF,
+ SmallVectorImpl<MachineBasicBlock *> &ScopeTops,
+ const WebAssemblyInstrInfo &TII,
+ const MachineLoopInfo &MLI) {
+ MachineLoop *Loop = MLI.getLoopFor(&MBB);
+ if (!Loop || Loop->getHeader() != &MBB)
+ return;
+
+  // The operand of a LOOP is the first block after the loop. If the loop ends
+  // at the bottom of the function, insert a dummy block at the end.
+ MachineBasicBlock *Bottom = LoopBottom(Loop);
+ auto Iter = next(MachineFunction::iterator(Bottom));
+ if (Iter == MF.end()) {
+ MachineBasicBlock *Label = MF.CreateMachineBasicBlock();
+ // Give it a fake predecessor so that AsmPrinter prints its label.
+ Label->addSuccessor(Label);
+ MF.push_back(Label);
+ Iter = next(MachineFunction::iterator(Bottom));
+ }
+ MachineBasicBlock *AfterLoop = &*Iter;
+ BuildMI(MBB, MBB.begin(), DebugLoc(), TII.get(WebAssembly::LOOP))
+ .addMBB(AfterLoop);
+
+ // Emit a special no-op telling the asm printer that we need a label to close
+ // the loop scope, even though the destination is only reachable by
+ // fallthrough.
+ if (!Bottom->back().isBarrier())
+ BuildMI(*Bottom, Bottom->end(), DebugLoc(), TII.get(WebAssembly::LOOP_END));
+
+ assert((!ScopeTops[AfterLoop->getNumber()] ||
+ ScopeTops[AfterLoop->getNumber()]->getNumber() < MBB.getNumber()) &&
+ "With RPO we should visit the outer-most loop for a block first.");
+ if (!ScopeTops[AfterLoop->getNumber()])
+ ScopeTops[AfterLoop->getNumber()] = &MBB;
+}
+
+/// Insert LOOP and BLOCK markers at appropriate places.
+static void PlaceMarkers(MachineFunction &MF, const MachineLoopInfo &MLI,
+ const WebAssemblyInstrInfo &TII,
+ MachineDominatorTree &MDT) {
+ // For each block whose label represents the end of a scope, record the block
+ // which holds the beginning of the scope. This will allow us to quickly skip
+ // over scoped regions when walking blocks. We allocate one more than the
+// number of blocks in the function to accommodate the fake block we may
+// insert at the end.
+ SmallVector<MachineBasicBlock *, 8> ScopeTops(MF.getNumBlockIDs() + 1);
+
+ for (auto &MBB : MF) {
+ // Place the LOOP for MBB if MBB is the header of a loop.
+ PlaceLoopMarker(MBB, MF, ScopeTops, TII, MLI);
+
+ // Place the BLOCK for MBB if MBB is branched to from above.
+ PlaceBlockMarker(MBB, MF, ScopeTops, TII, MLI, MDT);
+ }
+}
+
+#ifndef NDEBUG
+static bool
+IsOnStack(const SmallVectorImpl<std::pair<MachineBasicBlock *, bool>> &Stack,
+ const MachineBasicBlock *MBB) {
+ for (const auto &Pair : Stack)
+ if (Pair.first == MBB)
+ return true;
+ return false;
+}
+#endif
+
+bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "********** CFG Stackifying **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
+
+ const auto &MLI = getAnalysis<MachineLoopInfo>();
+ auto &MDT = getAnalysis<MachineDominatorTree>();
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+
+ // RPO sorting needs all loops to be single-entry.
+ EliminateMultipleEntryLoops(MF, MLI);
+
+ // Sort the blocks in RPO, with contiguous loops.
+ SortBlocks(MF, MLI);
+
+ // Place the BLOCK and LOOP markers to indicate the beginnings of scopes.
+ PlaceMarkers(MF, MLI, TII, MDT);
+
+#ifndef NDEBUG
+ // Verify that block and loop beginnings and endings are in LIFO order, and
+ // that all references to blocks are to blocks on the stack at the point of
+ // the reference.
+ SmallVector<std::pair<MachineBasicBlock *, bool>, 0> Stack;
+ for (auto &MBB : MF) {
+ while (!Stack.empty() && Stack.back().first == &MBB)
+ if (Stack.back().second) {
+ assert(Stack.size() >= 2);
+ Stack.pop_back();
+ Stack.pop_back();
+ } else {
+ assert(Stack.size() >= 1);
+ Stack.pop_back();
+ }
+ for (auto &MI : MBB)
+ switch (MI.getOpcode()) {
+ case WebAssembly::LOOP:
+ Stack.push_back(std::make_pair(&MBB, false));
+ Stack.push_back(std::make_pair(MI.getOperand(0).getMBB(), true));
+ break;
+ case WebAssembly::BLOCK:
+ Stack.push_back(std::make_pair(MI.getOperand(0).getMBB(), false));
+ break;
+ default:
+      // Verify that all referenced blocks are in scope. A reference to a
+      // block with a negative number is invalid, but can happen with inline
+      // asm, so rather than asserting on it here, we let CodeGen properly
+      // fail on it.
+ for (const MachineOperand &MO : MI.explicit_operands())
+ if (MO.isMBB() && MO.getMBB()->getNumber() >= 0)
+ assert(IsOnStack(Stack, MO.getMBB()));
+ break;
+ }
+ }
+ assert(Stack.empty());
+#endif
+
+ return true;
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
new file mode 100644
index 000000000000..1b761b1a9d73
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -0,0 +1,81 @@
+//===-- WebAssemblyFastISel.cpp - WebAssembly FastISel implementation -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file defines the WebAssembly-specific support for the FastISel
+/// class. Some of the target-specific code is generated by tablegen in the file
+/// WebAssemblyGenFastISel.inc, which is #included here.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblySubtarget.h"
+#include "WebAssemblyTargetMachine.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-fastisel"
+
+namespace {
+
+class WebAssemblyFastISel final : public FastISel {
+ /// Keep a pointer to the WebAssemblySubtarget around so that we can make the
+ /// right decision when generating code for different targets.
+ const WebAssemblySubtarget *Subtarget;
+ LLVMContext *Context;
+
+ // Call handling routines.
+private:
+public:
+ // Backend specific FastISel code.
+ WebAssemblyFastISel(FunctionLoweringInfo &FuncInfo,
+ const TargetLibraryInfo *LibInfo)
+ : FastISel(FuncInfo, LibInfo, /*SkipTargetIndependentISel=*/true) {
+ Subtarget = &FuncInfo.MF->getSubtarget<WebAssemblySubtarget>();
+ Context = &FuncInfo.Fn->getContext();
+ }
+
+ bool fastSelectInstruction(const Instruction *I) override;
+
+#include "WebAssemblyGenFastISel.inc"
+};
+
+} // end anonymous namespace
+
+bool WebAssemblyFastISel::fastSelectInstruction(const Instruction *I) {
+ switch (I->getOpcode()) {
+ default:
+ break;
+ // TODO: add fast-isel selection cases here...
+ }
+
+ // Fall back to target-independent instruction selection.
+ return selectOperator(I, I->getOpcode());
+}
+
+FastISel *WebAssembly::createFastISel(FunctionLoweringInfo &FuncInfo,
+ const TargetLibraryInfo *LibInfo) {
+ return new WebAssemblyFastISel(FuncInfo, LibInfo);
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
index e4ca82e963c2..0eefd57f1f2c 100644
--- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
@@ -35,11 +35,20 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-frame-info"
// TODO: Implement a red zone?
+// TODO: wasm64
+// TODO: Prolog/epilog should be stackified too. This pass runs after register
+// stackification, so we'll have to do it manually.
+// TODO: Emit TargetOpcode::CFI_INSTRUCTION instructions
/// Return true if the specified function should have a dedicated frame pointer
/// register.
bool WebAssemblyFrameLowering::hasFP(const MachineFunction &MF) const {
- llvm_unreachable("TODO: implement hasFP");
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const auto *RegInfo =
+ MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
+ return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() ||
+ MFI->hasStackMap() || MFI->hasPatchPoint() ||
+ RegInfo->needsStackRealignment(MF);
}
/// Under normal circumstances, when a frame pointer is not required, we reserve
@@ -52,23 +61,115 @@ bool WebAssemblyFrameLowering::hasReservedCallFrame(
return !MF.getFrameInfo()->hasVarSizedObjects();
}
+
+/// Adjust the stack pointer by a constant amount.
+static void adjustStackPointer(unsigned StackSize,
+ bool AdjustUp,
+ MachineFunction& MF,
+ MachineBasicBlock& MBB,
+ const TargetInstrInfo* TII,
+ MachineBasicBlock::iterator InsertPt,
+ const DebugLoc& DL) {
+ auto &MRI = MF.getRegInfo();
+ unsigned SPReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
+ auto *SPSymbol = MF.createExternalSymbolName("__stack_pointer");
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), SPReg)
+ .addExternalSymbol(SPSymbol);
+ // This MachinePointerInfo should reference __stack_pointer as well but
+ // doesn't because MachinePointerInfo() takes a GV which we don't have for
+ // __stack_pointer. TODO: check if PseudoSourceValue::ExternalSymbolCallEntry
+// is appropriate instead. (Likewise for emitEpilogue below.)
+ auto *LoadMMO = new MachineMemOperand(MachinePointerInfo(),
+ MachineMemOperand::MOLoad, 4, 4);
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::LOAD_I32), SPReg)
+ .addImm(0)
+ .addReg(SPReg)
+ .addMemOperand(LoadMMO);
+ // Add/Subtract the frame size
+ unsigned OffsetReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
+ .addImm(StackSize);
+ BuildMI(MBB, InsertPt, DL,
+ TII->get(AdjustUp ? WebAssembly::ADD_I32 : WebAssembly::SUB_I32),
+ WebAssembly::SP32)
+ .addReg(SPReg)
+ .addReg(OffsetReg);
+ // The SP32 register now has the new stacktop. Also write it back to memory.
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
+ .addExternalSymbol(SPSymbol);
+ auto *MMO = new MachineMemOperand(MachinePointerInfo(),
+ MachineMemOperand::MOStore, 4, 4);
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::STORE_I32), WebAssembly::SP32)
+ .addImm(0)
+ .addReg(OffsetReg)
+ .addReg(WebAssembly::SP32)
+ .addMemOperand(MMO);
+}
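A standalone sketch, not part of the imported sources, of the net effect of the instruction sequence built above: a read-modify-write of a stack pointer kept in linear memory. The global variable and the adjustStackPointerModel helper are hypothetical scalar stand-ins for __stack_pointer and the emitted CONST/LOAD/ADD-or-SUB/CONST/STORE sequence.

#include <cstdint>

// Hypothetical stand-in for the __stack_pointer global in linear memory.
static uint32_t StackPointerMem = 0x10000;

// Load the stack pointer, adjust it by StackSize, write it back, and return
// the new value (which the real code leaves in SP32).
static uint32_t adjustStackPointerModel(uint32_t StackSize, bool AdjustUp) {
  uint32_t SP = StackPointerMem;                    // LOAD_I32
  SP = AdjustUp ? SP + StackSize : SP - StackSize;  // ADD_I32 / SUB_I32
  StackPointerMem = SP;                             // STORE_I32
  return SP;
}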
+
void WebAssemblyFrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
- llvm_unreachable("TODO: implement eliminateCallFramePseudoInstr");
+ const auto *TII =
+ static_cast<const WebAssemblyInstrInfo*>(MF.getSubtarget().getInstrInfo());
+ DebugLoc DL = I->getDebugLoc();
+ unsigned Opc = I->getOpcode();
+ bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
+ unsigned Amount = I->getOperand(0).getImm();
+ if (Amount)
+ adjustStackPointer(Amount, IsDestroy, MF, MBB,
+ TII, I, DL);
+ MBB.erase(I);
}
void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- llvm_unreachable("TODO: implement emitPrologue");
+ // TODO: Do ".setMIFlag(MachineInstr::FrameSetup)" on emitted instructions
+ auto *MFI = MF.getFrameInfo();
+ assert(MFI->getCalleeSavedInfo().empty() &&
+ "WebAssembly should not have callee-saved registers");
+ assert(!hasFP(MF) && "Functions needing frame pointers not yet supported");
+ uint64_t StackSize = MFI->getStackSize();
+ if (!StackSize && (!MFI->adjustsStack() || MFI->getMaxCallFrameSize() == 0))
+ return;
+
+ const auto *TII = MF.getSubtarget().getInstrInfo();
+
+ auto InsertPt = MBB.begin();
+ DebugLoc DL;
+
+ adjustStackPointer(StackSize, false, MF, MBB, TII, InsertPt, DL);
}
void WebAssemblyFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- llvm_unreachable("TODO: implement emitEpilogue");
-}
+ uint64_t StackSize = MF.getFrameInfo()->getStackSize();
+ if (!StackSize)
+ return;
+ const auto *TII = MF.getSubtarget().getInstrInfo();
+ auto &MRI = MF.getRegInfo();
+ unsigned OffsetReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
+ auto InsertPt = MBB.getFirstTerminator();
+ DebugLoc DL;
+
+ if (InsertPt != MBB.end()) {
+ DL = InsertPt->getDebugLoc();
+ }
-void WebAssemblyFrameLowering::processFunctionBeforeCalleeSavedScan(
- MachineFunction &MF, RegScavenger *RS) const {
- llvm_unreachable("TODO: implement processFunctionBeforeCalleeSavedScan");
+  // Restore the stack pointer. Without FP, its entry value is SP32 + StackSize.
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
+ .addImm(StackSize);
+ auto *SPSymbol = MF.createExternalSymbolName("__stack_pointer");
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::ADD_I32), WebAssembly::SP32)
+ .addReg(WebAssembly::SP32)
+ .addReg(OffsetReg);
+ // Re-use OffsetReg to hold the address of the stacktop
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
+ .addExternalSymbol(SPSymbol);
+ auto *MMO = new MachineMemOperand(MachinePointerInfo(),
+ MachineMemOperand::MOStore, 4, 4);
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::STORE_I32), WebAssembly::SP32)
+ .addImm(0)
+ .addReg(OffsetReg)
+ .addReg(WebAssembly::SP32)
+ .addMemOperand(MMO);
}
diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
index 0b112d02c0bf..5f4708fe77ed 100644
--- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
+++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
@@ -38,9 +38,6 @@ public:
bool hasFP(const MachineFunction &MF) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
-
- void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const override;
};
} // end namespace llvm
diff --git a/lib/Target/WebAssembly/WebAssemblyISD.def b/lib/Target/WebAssembly/WebAssemblyISD.def
new file mode 100644
index 000000000000..3a03fa55b220
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyISD.def
@@ -0,0 +1,25 @@
+//- WebAssemblyISD.def - WebAssembly ISD ---------------------------*- C++ -*-//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file describes the various WebAssembly ISD node types.
+///
+//===----------------------------------------------------------------------===//
+
+// NOTE: NO INCLUDE GUARD DESIRED!
+
+HANDLE_NODETYPE(CALL1)
+HANDLE_NODETYPE(CALL0)
+HANDLE_NODETYPE(RETURN)
+HANDLE_NODETYPE(ARGUMENT)
+HANDLE_NODETYPE(Wrapper)
+HANDLE_NODETYPE(BR_IF)
+HANDLE_NODETYPE(TABLESWITCH)
+
+// add memory opcodes starting at ISD::FIRST_TARGET_MEMORY_OPCODE here...
diff --git a/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
index 518ef332a6c7..8390f797c43e 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
@@ -56,13 +56,68 @@ public:
SDNode *Select(SDNode *Node) override;
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+ std::vector<SDValue> &OutOps) override;
+
+// Include the pieces autogenerated from the target description.
+#include "WebAssemblyGenDAGISel.inc"
+
private:
// add select functions here...
};
} // end anonymous namespace
SDNode *WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
- llvm_unreachable("TODO: implement Select");
+ // Dump information about the Node being selected.
+ DEBUG(errs() << "Selecting: ");
+ DEBUG(Node->dump(CurDAG));
+ DEBUG(errs() << "\n");
+
+  // If this node is already a machine opcode, it has already been selected.
+ if (Node->isMachineOpcode()) {
+ DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+ Node->setNodeId(-1);
+ return nullptr;
+ }
+
+  // Handle the few cases that need custom selection.
+ SDNode *ResNode = nullptr;
+ EVT VT = Node->getValueType(0);
+
+ switch (Node->getOpcode()) {
+ default:
+ break;
+ // If we need WebAssembly-specific selection, it would go here.
+ (void)VT;
+ }
+
+ // Select the default instruction.
+ ResNode = SelectCode(Node);
+
+ DEBUG(errs() << "=> ");
+ if (ResNode == nullptr || ResNode == Node)
+ DEBUG(Node->dump(CurDAG));
+ else
+ DEBUG(ResNode->dump(CurDAG));
+ DEBUG(errs() << "\n");
+
+ return ResNode;
+}
+
+bool WebAssemblyDAGToDAGISel::SelectInlineAsmMemoryOperand(
+ const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
+ switch (ConstraintID) {
+ case InlineAsm::Constraint_i:
+ case InlineAsm::Constraint_m:
+    // We only support simple memory operands that have a single address
+    // operand and need no special handling.
+ OutOps.push_back(Op);
+ return false;
+ default:
+ break;
+ }
+
+ return true;
}
/// This pass converts a legalized DAG into a WebAssembly-specific DAG, ready
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 4184eb6dc5a6..7a89f788c1ad 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -17,10 +17,13 @@
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "WebAssemblyTargetMachine.h"
-#include "WebAssemblyTargetObjectFile.h"
#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/CommandLine.h"
@@ -32,14 +35,254 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-lower"
+namespace {
+// Diagnostic information for unimplemented or unsupported feature reporting.
+// TODO: This code is copied from BPF and AMDGPU; consider factoring it out
+// and sharing code.
+class DiagnosticInfoUnsupported final : public DiagnosticInfo {
+private:
+ // Debug location where this diagnostic is triggered.
+ DebugLoc DLoc;
+ const Twine &Description;
+ const Function &Fn;
+ SDValue Value;
+
+ static int KindID;
+
+ static int getKindID() {
+ if (KindID == 0)
+ KindID = llvm::getNextAvailablePluginDiagnosticKind();
+ return KindID;
+ }
+
+public:
+ DiagnosticInfoUnsupported(SDLoc DLoc, const Function &Fn, const Twine &Desc,
+ SDValue Value)
+ : DiagnosticInfo(getKindID(), DS_Error), DLoc(DLoc.getDebugLoc()),
+ Description(Desc), Fn(Fn), Value(Value) {}
+
+ void print(DiagnosticPrinter &DP) const override {
+ std::string Str;
+ raw_string_ostream OS(Str);
+
+ if (DLoc) {
+ auto DIL = DLoc.get();
+ StringRef Filename = DIL->getFilename();
+ unsigned Line = DIL->getLine();
+ unsigned Column = DIL->getColumn();
+ OS << Filename << ':' << Line << ':' << Column << ' ';
+ }
+
+ OS << "in function " << Fn.getName() << ' ' << *Fn.getFunctionType() << '\n'
+ << Description;
+ if (Value)
+ Value->print(OS);
+ OS << '\n';
+ OS.flush();
+ DP << Str;
+ }
+
+ static bool classof(const DiagnosticInfo *DI) {
+ return DI->getKind() == getKindID();
+ }
+};
+
+int DiagnosticInfoUnsupported::KindID = 0;
+} // end anonymous namespace
+
WebAssemblyTargetLowering::WebAssemblyTargetLowering(
const TargetMachine &TM, const WebAssemblySubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
+ auto MVTPtr = Subtarget->hasAddr64() ? MVT::i64 : MVT::i32;
+
+ // Booleans always contain 0 or 1.
+ setBooleanContents(ZeroOrOneBooleanContent);
// WebAssembly does not produce floating-point exceptions on normal floating
// point operations.
setHasFloatingPointExceptions(false);
// We don't know the microarchitecture here, so just reduce register pressure.
setSchedulingPreference(Sched::RegPressure);
+ // Tell ISel that we have a stack pointer.
+ setStackPointerRegisterToSaveRestore(
+ Subtarget->hasAddr64() ? WebAssembly::SP64 : WebAssembly::SP32);
+ // Set up the register classes.
+ addRegisterClass(MVT::i32, &WebAssembly::I32RegClass);
+ addRegisterClass(MVT::i64, &WebAssembly::I64RegClass);
+ addRegisterClass(MVT::f32, &WebAssembly::F32RegClass);
+ addRegisterClass(MVT::f64, &WebAssembly::F64RegClass);
+ // Compute derived properties from the register classes.
+ computeRegisterProperties(Subtarget->getRegisterInfo());
+
+ setOperationAction(ISD::GlobalAddress, MVTPtr, Custom);
+ setOperationAction(ISD::ExternalSymbol, MVTPtr, Custom);
+ setOperationAction(ISD::JumpTable, MVTPtr, Custom);
+
+ // Take the default expansion for va_arg, va_copy, and va_end. There is no
+  // default action for va_start, so we custom-lower it.
+ setOperationAction(ISD::VASTART, MVT::Other, Custom);
+ setOperationAction(ISD::VAARG, MVT::Other, Expand);
+ setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
+
+ for (auto T : {MVT::f32, MVT::f64}) {
+ // Don't expand the floating-point types to constant pools.
+ setOperationAction(ISD::ConstantFP, T, Legal);
+ // Expand floating-point comparisons.
+ for (auto CC : {ISD::SETO, ISD::SETUO, ISD::SETUEQ, ISD::SETONE,
+ ISD::SETULT, ISD::SETULE, ISD::SETUGT, ISD::SETUGE})
+ setCondCodeAction(CC, T, Expand);
+ // Expand floating-point library function operators.
+ for (auto Op : {ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOWI, ISD::FPOW,
+ ISD::FREM, ISD::FMA})
+ setOperationAction(Op, T, Expand);
+    // Mark as Legal the supported floating-point library function operators
+    // that would otherwise default to expand.
+ for (auto Op :
+ {ISD::FCEIL, ISD::FFLOOR, ISD::FTRUNC, ISD::FNEARBYINT, ISD::FRINT})
+ setOperationAction(Op, T, Legal);
+ // Support minnan and maxnan, which otherwise default to expand.
+ setOperationAction(ISD::FMINNAN, T, Legal);
+ setOperationAction(ISD::FMAXNAN, T, Legal);
+ }
+
+ for (auto T : {MVT::i32, MVT::i64}) {
+ // Expand unavailable integer operations.
+ for (auto Op :
+ {ISD::BSWAP, ISD::ROTL, ISD::ROTR, ISD::SMUL_LOHI, ISD::UMUL_LOHI,
+ ISD::MULHS, ISD::MULHU, ISD::SDIVREM, ISD::UDIVREM, ISD::SHL_PARTS,
+ ISD::SRA_PARTS, ISD::SRL_PARTS, ISD::ADDC, ISD::ADDE, ISD::SUBC,
+ ISD::SUBE}) {
+ setOperationAction(Op, T, Expand);
+ }
+ }
+
+ // As a special case, these operators use the type to mean the type to
+ // sign-extend from.
+ for (auto T : {MVT::i1, MVT::i8, MVT::i16, MVT::i32})
+ setOperationAction(ISD::SIGN_EXTEND_INREG, T, Expand);
+
+ // Dynamic stack allocation: use the default expansion.
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVTPtr, Expand);
+
+ setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
+
+ // Expand these forms; we pattern-match the forms that we can handle in isel.
+ for (auto T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64})
+ for (auto Op : {ISD::BR_CC, ISD::SELECT_CC})
+ setOperationAction(Op, T, Expand);
+
+ // We have custom switch handling.
+ setOperationAction(ISD::BR_JT, MVT::Other, Custom);
+
+ // WebAssembly doesn't have:
+ // - Floating-point extending loads.
+ // - Floating-point truncating stores.
+ // - i1 extending loads.
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+ for (auto T : MVT::integer_valuetypes())
+ for (auto Ext : {ISD::EXTLOAD, ISD::ZEXTLOAD, ISD::SEXTLOAD})
+ setLoadExtAction(Ext, T, MVT::i1, Promote);
+
+  // Trap lowers to the wasm unreachable instruction.
+ setOperationAction(ISD::TRAP, MVT::Other, Legal);
+}
+
+FastISel *WebAssemblyTargetLowering::createFastISel(
+ FunctionLoweringInfo &FuncInfo, const TargetLibraryInfo *LibInfo) const {
+ return WebAssembly::createFastISel(FuncInfo, LibInfo);
+}
+
+bool WebAssemblyTargetLowering::isOffsetFoldingLegal(
+ const GlobalAddressSDNode * /*GA*/) const {
+ // All offsets can be folded.
+ return true;
+}
+
+MVT WebAssemblyTargetLowering::getScalarShiftAmountTy(const DataLayout & /*DL*/,
+ EVT VT) const {
+ unsigned BitWidth = NextPowerOf2(VT.getSizeInBits() - 1);
+ if (BitWidth > 1 && BitWidth < 8)
+ BitWidth = 8;
+
+ if (BitWidth > 64) {
+ BitWidth = 64;
+ assert(BitWidth >= Log2_32_Ceil(VT.getSizeInBits()) &&
+ "64-bit shift counts ought to be enough for anyone");
+ }
+
+ MVT Result = MVT::getIntegerVT(BitWidth);
+ assert(Result != MVT::INVALID_SIMPLE_VALUE_TYPE &&
+ "Unable to represent scalar shift amount type");
+ return Result;
+}
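A standalone sketch, not part of the imported sources, of the width arithmetic in getScalarShiftAmountTy above: round (Bits - 1) up to a power of two, then clamp into [8, 64]. The nextPowerOf2 and shiftAmountBits names are hypothetical; nextPowerOf2 is a local stand-in for LLVM's NextPowerOf2().

#include <cassert>
#include <cstdint>

// Next power of two strictly greater than A (for A > 0).
static uint64_t nextPowerOf2(uint64_t A) {
  A |= (A >> 1);
  A |= (A >> 2);
  A |= (A >> 4);
  A |= (A >> 8);
  A |= (A >> 16);
  A |= (A >> 32);
  return A + 1;
}

// Mirrors the computation above on a plain bit count.
static unsigned shiftAmountBits(unsigned Bits) {
  uint64_t Width = nextPowerOf2(Bits - 1);
  if (Width > 1 && Width < 8)
    Width = 8;
  if (Width > 64)
    Width = 64;
  return unsigned(Width);
}

int main() {
  assert(shiftAmountBits(7) == 8);
  assert(shiftAmountBits(32) == 32);
  assert(shiftAmountBits(64) == 64);
  return 0;
}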
+
+const char *
+WebAssemblyTargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch (static_cast<WebAssemblyISD::NodeType>(Opcode)) {
+ case WebAssemblyISD::FIRST_NUMBER:
+ break;
+#define HANDLE_NODETYPE(NODE) \
+ case WebAssemblyISD::NODE: \
+ return "WebAssemblyISD::" #NODE;
+#include "WebAssemblyISD.def"
+#undef HANDLE_NODETYPE
+ }
+ return nullptr;
+}
+
+std::pair<unsigned, const TargetRegisterClass *>
+WebAssemblyTargetLowering::getRegForInlineAsmConstraint(
+ const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
+ // First, see if this is a constraint that directly corresponds to a
+ // WebAssembly register class.
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 'r':
+ assert(VT != MVT::iPTR && "Pointer MVT not expected here");
+ if (VT.isInteger() && !VT.isVector()) {
+ if (VT.getSizeInBits() <= 32)
+ return std::make_pair(0U, &WebAssembly::I32RegClass);
+ if (VT.getSizeInBits() <= 64)
+ return std::make_pair(0U, &WebAssembly::I64RegClass);
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+bool WebAssemblyTargetLowering::isCheapToSpeculateCttz() const {
+ // Assume ctz is a relatively cheap operation.
+ return true;
+}
+
+bool WebAssemblyTargetLowering::isCheapToSpeculateCtlz() const {
+ // Assume clz is a relatively cheap operation.
+ return true;
+}
+
+bool WebAssemblyTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM,
+ Type *Ty,
+ unsigned AS) const {
+  // WebAssembly offsets are added as unsigned without wrapping. The
+  // isLegalAddressingMode hook gives us no way to determine whether wrapping
+  // could occur, so we approximate this by accepting only non-negative offsets.
+ if (AM.BaseOffs < 0)
+ return false;
+
+ // WebAssembly has no scale register operands.
+ if (AM.Scale != 0)
+ return false;
+
+ // Everything else is legal.
+ return true;
}
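A minimal sketch, not part of the imported sources, of the addressing modes the hook above accepts: a base register plus a non-negative constant offset, with no scaled index. The isLegalWasmAddrMode name is hypothetical.

#include <cstdint>

// Mirrors the two checks above: reject negative offsets (wasm offsets are
// unsigned and added without wrapping) and reject any scaled index register.
static bool isLegalWasmAddrMode(int64_t BaseOffs, int64_t Scale) {
  return BaseOffs >= 0 && Scale == 0;
}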
//===----------------------------------------------------------------------===//
@@ -50,16 +293,359 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// Lowering Code
//===----------------------------------------------------------------------===//
+static void fail(SDLoc DL, SelectionDAG &DAG, const char *msg) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ DAG.getContext()->diagnose(
+ DiagnosticInfoUnsupported(DL, *MF.getFunction(), msg, SDValue()));
+}
+
+// Test whether the given calling convention is supported.
+static bool CallingConvSupported(CallingConv::ID CallConv) {
+ // We currently support the language-independent target-independent
+ // conventions. We don't yet have a way to annotate calls with properties like
+ // "cold", and we don't have any call-clobbered registers, so these are mostly
+ // all handled the same.
+ return CallConv == CallingConv::C || CallConv == CallingConv::Fast ||
+ CallConv == CallingConv::Cold ||
+ CallConv == CallingConv::PreserveMost ||
+ CallConv == CallingConv::PreserveAll ||
+ CallConv == CallingConv::CXX_FAST_TLS;
+}
+
+SDValue
+WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc DL = CLI.DL;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ CallingConv::ID CallConv = CLI.CallConv;
+ if (!CallingConvSupported(CallConv))
+ fail(DL, DAG,
+ "WebAssembly doesn't support language-specific or target-specific "
+ "calling conventions yet");
+ if (CLI.IsPatchPoint)
+ fail(DL, DAG, "WebAssembly doesn't support patch point yet");
+
+ // WebAssembly doesn't currently support explicit tail calls. If they are
+ // required, fail. Otherwise, just disable them.
+ if ((CallConv == CallingConv::Fast && CLI.IsTailCall &&
+ MF.getTarget().Options.GuaranteedTailCallOpt) ||
+ (CLI.CS && CLI.CS->isMustTailCall()))
+ fail(DL, DAG, "WebAssembly doesn't support tail call yet");
+ CLI.IsTailCall = false;
+
+ SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+
+ SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+ if (Ins.size() > 1)
+ fail(DL, DAG, "WebAssembly doesn't support more than 1 returned value yet");
+
+ SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+ for (const ISD::OutputArg &Out : Outs) {
+ if (Out.Flags.isByVal())
+ fail(DL, DAG, "WebAssembly hasn't implemented byval arguments");
+ if (Out.Flags.isNest())
+ fail(DL, DAG, "WebAssembly hasn't implemented nest arguments");
+ if (Out.Flags.isInAlloca())
+ fail(DL, DAG, "WebAssembly hasn't implemented inalloca arguments");
+ if (Out.Flags.isInConsecutiveRegs())
+ fail(DL, DAG, "WebAssembly hasn't implemented cons regs arguments");
+ if (Out.Flags.isInConsecutiveRegsLast())
+ fail(DL, DAG, "WebAssembly hasn't implemented cons regs last arguments");
+ }
+
+ bool IsVarArg = CLI.IsVarArg;
+ unsigned NumFixedArgs = CLI.NumFixedArgs;
+ auto PtrVT = getPointerTy(MF.getDataLayout());
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+
+ if (IsVarArg) {
+ // Outgoing non-fixed arguments are placed at the top of the stack. First
+ // compute their offsets and the total amount of argument stack space
+ // needed.
+ for (SDValue Arg :
+ make_range(OutVals.begin() + NumFixedArgs, OutVals.end())) {
+ EVT VT = Arg.getValueType();
+ assert(VT != MVT::iPTR && "Legalized args should be concrete");
+ Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+ unsigned Offset =
+ CCInfo.AllocateStack(MF.getDataLayout().getTypeAllocSize(Ty),
+ MF.getDataLayout().getABITypeAlignment(Ty));
+ CCInfo.addLoc(CCValAssign::getMem(ArgLocs.size(), VT.getSimpleVT(),
+ Offset, VT.getSimpleVT(),
+ CCValAssign::Full));
+ }
+ }
+
+ unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
+
+ SDValue NB;
+ if (NumBytes) {
+ NB = DAG.getConstant(NumBytes, DL, PtrVT, true);
+ Chain = DAG.getCALLSEQ_START(Chain, NB, DL);
+ }
+
+ if (IsVarArg) {
+ // For non-fixed arguments, next emit stores to store the argument values
+ // to the stack at the offsets computed above.
+ SDValue SP = DAG.getCopyFromReg(
+ Chain, DL, getStackPointerRegisterToSaveRestore(), PtrVT);
+ unsigned ValNo = 0;
+ SmallVector<SDValue, 8> Chains;
+ for (SDValue Arg :
+ make_range(OutVals.begin() + NumFixedArgs, OutVals.end())) {
+ assert(ArgLocs[ValNo].getValNo() == ValNo &&
+ "ArgLocs should remain in order and only hold varargs args");
+ unsigned Offset = ArgLocs[ValNo++].getLocMemOffset();
+ SDValue Add = DAG.getNode(ISD::ADD, DL, PtrVT, SP,
+ DAG.getConstant(Offset, DL, PtrVT));
+ Chains.push_back(DAG.getStore(Chain, DL, Arg, Add,
+ MachinePointerInfo::getStack(MF, Offset),
+ false, false, 0));
+ }
+ if (!Chains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+ }
+
+ // Compute the operands for the CALLn node.
+ SmallVector<SDValue, 16> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add all fixed arguments. Note that for non-varargs calls, NumFixedArgs
+ // isn't reliable.
+ Ops.append(OutVals.begin(),
+ IsVarArg ? OutVals.begin() + NumFixedArgs : OutVals.end());
+
+ SmallVector<EVT, 8> Tys;
+ for (const auto &In : Ins) {
+ assert(!In.Flags.isByVal() && "byval is not valid for return values");
+ assert(!In.Flags.isNest() && "nest is not valid for return values");
+ if (In.Flags.isInAlloca())
+ fail(DL, DAG, "WebAssembly hasn't implemented inalloca return values");
+ if (In.Flags.isInConsecutiveRegs())
+ fail(DL, DAG, "WebAssembly hasn't implemented cons regs return values");
+ if (In.Flags.isInConsecutiveRegsLast())
+ fail(DL, DAG,
+ "WebAssembly hasn't implemented cons regs last return values");
+ // Ignore In.getOrigAlign() because all our arguments are passed in
+ // registers.
+ Tys.push_back(In.VT);
+ }
+ Tys.push_back(MVT::Other);
+ SDVTList TyList = DAG.getVTList(Tys);
+ SDValue Res =
+ DAG.getNode(Ins.empty() ? WebAssemblyISD::CALL0 : WebAssemblyISD::CALL1,
+ DL, TyList, Ops);
+ if (Ins.empty()) {
+ Chain = Res;
+ } else {
+ InVals.push_back(Res);
+ Chain = Res.getValue(1);
+ }
+
+ if (NumBytes) {
+ SDValue Unused = DAG.getTargetConstant(0, DL, PtrVT);
+ Chain = DAG.getCALLSEQ_END(Chain, NB, Unused, SDValue(), DL);
+ }
+
+ return Chain;
+}
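A standalone sketch, not part of the imported sources, of the varargs layout computed in LowerCall above: each non-fixed argument gets a stack slot by rounding the running frame size up to the argument's alignment, roughly the bookkeeping CCState::AllocateStack performs. The alignTo and layoutVarArgs names and the (size, alignment) inputs are hypothetical.

#include <cstdint>
#include <utility>
#include <vector>

// Round Offset up to the next multiple of Align (Align is a power of two).
static uint64_t alignTo(uint64_t Offset, uint64_t Align) {
  return (Offset + Align - 1) & ~(Align - 1);
}

// Given (size, alignment) pairs for the non-fixed arguments, compute each
// argument's stack offset and the total outgoing-argument area size.
static std::vector<uint64_t>
layoutVarArgs(const std::vector<std::pair<uint64_t, uint64_t>> &Args,
              uint64_t &FrameSize) {
  std::vector<uint64_t> Offsets;
  FrameSize = 0;
  for (const auto &A : Args) {
    FrameSize = alignTo(FrameSize, A.second);
    Offsets.push_back(FrameSize);
    FrameSize += A.first;
  }
  return Offsets;
}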
+
+bool WebAssemblyTargetLowering::CanLowerReturn(
+ CallingConv::ID /*CallConv*/, MachineFunction & /*MF*/, bool /*IsVarArg*/,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext & /*Context*/) const {
+ // WebAssembly can't currently handle returning tuples.
+ return Outs.size() <= 1;
+}
+
+SDValue WebAssemblyTargetLowering::LowerReturn(
+ SDValue Chain, CallingConv::ID CallConv, bool /*IsVarArg*/,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals, SDLoc DL,
+ SelectionDAG &DAG) const {
+ assert(Outs.size() <= 1 && "WebAssembly can only return up to one value");
+ if (!CallingConvSupported(CallConv))
+ fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions");
+
+ SmallVector<SDValue, 4> RetOps(1, Chain);
+ RetOps.append(OutVals.begin(), OutVals.end());
+ Chain = DAG.getNode(WebAssemblyISD::RETURN, DL, MVT::Other, RetOps);
+
+ // Record the number and types of the return values.
+ for (const ISD::OutputArg &Out : Outs) {
+ assert(!Out.Flags.isByVal() && "byval is not valid for return values");
+ assert(!Out.Flags.isNest() && "nest is not valid for return values");
+ assert(Out.IsFixed && "non-fixed return value is not valid");
+ if (Out.Flags.isInAlloca())
+ fail(DL, DAG, "WebAssembly hasn't implemented inalloca results");
+ if (Out.Flags.isInConsecutiveRegs())
+ fail(DL, DAG, "WebAssembly hasn't implemented cons regs results");
+ if (Out.Flags.isInConsecutiveRegsLast())
+ fail(DL, DAG, "WebAssembly hasn't implemented cons regs last results");
+ }
+
+ return Chain;
+}
+
+SDValue WebAssemblyTargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool /*IsVarArg*/,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ if (!CallingConvSupported(CallConv))
+ fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions");
+
+ // Set up the incoming ARGUMENTS value, which serves to represent the liveness
+ // of the incoming values before they're represented by virtual registers.
+ MF.getRegInfo().addLiveIn(WebAssembly::ARGUMENTS);
+
+ for (const ISD::InputArg &In : Ins) {
+ if (In.Flags.isByVal())
+ fail(DL, DAG, "WebAssembly hasn't implemented byval arguments");
+ if (In.Flags.isInAlloca())
+ fail(DL, DAG, "WebAssembly hasn't implemented inalloca arguments");
+ if (In.Flags.isNest())
+ fail(DL, DAG, "WebAssembly hasn't implemented nest arguments");
+ if (In.Flags.isInConsecutiveRegs())
+ fail(DL, DAG, "WebAssembly hasn't implemented cons regs arguments");
+ if (In.Flags.isInConsecutiveRegsLast())
+ fail(DL, DAG, "WebAssembly hasn't implemented cons regs last arguments");
+ // Ignore In.getOrigAlign() because all our arguments are passed in
+ // registers.
+ InVals.push_back(
+ In.Used
+ ? DAG.getNode(WebAssemblyISD::ARGUMENT, DL, In.VT,
+ DAG.getTargetConstant(InVals.size(), DL, MVT::i32))
+ : DAG.getUNDEF(In.VT));
+
+ // Record the number and types of arguments.
+ MF.getInfo<WebAssemblyFunctionInfo>()->addParam(In.VT);
+ }
+
+ // Incoming varargs arguments are on the stack and will be accessed through
+ // va_arg, so we don't need to do anything for them here.
+
+ return Chain;
+}
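+
+// For illustration (an assumed example, not from this patch): LowerFormalArguments
+// above turns each used incoming argument into a WebAssemblyISD::ARGUMENT node
+// carrying its index, so define i32 @add(i32 %a, i32 %b) yields ARGUMENT(0) and
+// ARGUMENT(1), which are later matched by the ARGUMENT_I32 instructions defined
+// in WebAssemblyInstrInfo.td.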
+
//===----------------------------------------------------------------------===//
-// Other Lowering Code
+// Custom lowering hooks.
//===----------------------------------------------------------------------===//
+SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op,
+ SelectionDAG &DAG) const {
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("unimplemented operation lowering");
+ return SDValue();
+ case ISD::FrameIndex:
+ return LowerFrameIndex(Op, DAG);
+ case ISD::GlobalAddress:
+ return LowerGlobalAddress(Op, DAG);
+ case ISD::ExternalSymbol:
+ return LowerExternalSymbol(Op, DAG);
+ case ISD::JumpTable:
+ return LowerJumpTable(Op, DAG);
+ case ISD::BR_JT:
+ return LowerBR_JT(Op, DAG);
+ case ISD::VASTART:
+ return LowerVASTART(Op, DAG);
+ }
+}
+
+SDValue WebAssemblyTargetLowering::LowerFrameIndex(SDValue Op,
+ SelectionDAG &DAG) const {
+ int FI = cast<FrameIndexSDNode>(Op)->getIndex();
+ return DAG.getTargetFrameIndex(FI, Op.getValueType());
+}
+
+SDValue WebAssemblyTargetLowering::LowerGlobalAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ const auto *GA = cast<GlobalAddressSDNode>(Op);
+ EVT VT = Op.getValueType();
+ assert(GA->getTargetFlags() == 0 && "WebAssembly doesn't set target flags");
+ if (GA->getAddressSpace() != 0)
+ fail(DL, DAG, "WebAssembly only expects the 0 address space");
+ return DAG.getNode(
+ WebAssemblyISD::Wrapper, DL, VT,
+ DAG.getTargetGlobalAddress(GA->getGlobal(), DL, VT, GA->getOffset()));
+}
+
+SDValue
+WebAssemblyTargetLowering::LowerExternalSymbol(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ const auto *ES = cast<ExternalSymbolSDNode>(Op);
+ EVT VT = Op.getValueType();
+ assert(ES->getTargetFlags() == 0 && "WebAssembly doesn't set target flags");
+ return DAG.getNode(WebAssemblyISD::Wrapper, DL, VT,
+ DAG.getTargetExternalSymbol(ES->getSymbol(), VT));
+}
+
+SDValue WebAssemblyTargetLowering::LowerJumpTable(SDValue Op,
+ SelectionDAG &DAG) const {
+ // There's no need for a Wrapper node because we always incorporate a jump
+ // table operand into a TABLESWITCH instruction, rather than ever
+ // materializing it in a register.
+ const JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+ return DAG.getTargetJumpTable(JT->getIndex(), Op.getValueType(),
+ JT->getTargetFlags());
+}
+
+SDValue WebAssemblyTargetLowering::LowerBR_JT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue Chain = Op.getOperand(0);
+ const auto *JT = cast<JumpTableSDNode>(Op.getOperand(1));
+ SDValue Index = Op.getOperand(2);
+ assert(JT->getTargetFlags() == 0 && "WebAssembly doesn't set target flags");
+
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Index);
+
+ MachineJumpTableInfo *MJTI = DAG.getMachineFunction().getJumpTableInfo();
+ const auto &MBBs = MJTI->getJumpTables()[JT->getIndex()].MBBs;
+
+ // TODO: For now, we just pick something arbitrary for the default case. We
+ // really want to sniff out the guard and put in the real default case (and
+ // delete the guard).
+ Ops.push_back(DAG.getBasicBlock(MBBs[0]));
+
+ // Add an operand for each case.
+ for (auto MBB : MBBs)
+ Ops.push_back(DAG.getBasicBlock(MBB));
+
+ return DAG.getNode(WebAssemblyISD::TABLESWITCH, DL, MVT::Other, Ops);
+}
+
+SDValue WebAssemblyTargetLowering::LowerVASTART(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT PtrVT = getPointerTy(DAG.getMachineFunction().getDataLayout());
+
+ // The incoming non-fixed arguments are placed on the top of the stack, with
+ // natural alignment, at the point of the call, so the base pointer is just
+ // the current frame pointer.
+ DAG.getMachineFunction().getFrameInfo()->setFrameAddressIsTaken(true);
+ unsigned FP =
+ Subtarget->getRegisterInfo()->getFrameRegister(DAG.getMachineFunction());
+ SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, FP, PtrVT);
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ return DAG.getStore(Op.getOperand(0), DL, FrameAddr, Op.getOperand(1),
+ MachinePointerInfo(SV), false, false, 0);
+}
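+
+// Rough sketch of the effect (illustrative, assuming the single-pointer
+// va_list layout implied above): va_start lowers to one store of the frame
+// pointer into the va_list slot, because the caller placed the variadic
+// arguments at the top of the stack at the point of the call, so va_arg can
+// simply walk forward from that address.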
+
//===----------------------------------------------------------------------===//
// WebAssembly Optimization Hooks
//===----------------------------------------------------------------------===//
-
-MCSection *WebAssemblyTargetObjectFile::SelectSectionForGlobal(
- const GlobalValue *GV, SectionKind Kind, Mangler &Mang,
- const TargetMachine &TM) const {
- return getDataSection();
-}
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index efd60a7bacd6..e7232a042e12 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -22,10 +22,11 @@ namespace llvm {
namespace WebAssemblyISD {
-enum {
+enum NodeType : unsigned {
FIRST_NUMBER = ISD::BUILTIN_OP_END,
-
- // add memory opcodes starting at ISD::FIRST_TARGET_MEMORY_OPCODE here...
+#define HANDLE_NODETYPE(NODE) NODE,
+#include "WebAssemblyISD.def"
+#undef HANDLE_NODETYPE
};
} // end namespace WebAssemblyISD
@@ -42,8 +43,51 @@ private:
/// Keep a pointer to the WebAssemblySubtarget around so that we can make the
/// right decision when generating code for different targets.
const WebAssemblySubtarget *Subtarget;
+
+ FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
+ const TargetLibraryInfo *LibInfo) const override;
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+ MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override;
+ const char *getTargetNodeName(unsigned Opcode) const override;
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+ bool isCheapToSpeculateCttz() const override;
+ bool isCheapToSpeculateCtlz() const override;
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+ unsigned AS) const override;
+
+ SDValue LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+ bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals, SDLoc dl,
+ SelectionDAG &DAG) const override;
+ SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SDLoc DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ // Custom lowering hooks.
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
};
+namespace WebAssembly {
+FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo);
+} // end namespace WebAssembly
+
} // end namespace llvm
#endif
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
index 6b5b6cd54173..cfa1519e6d99 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrCall.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
@@ -12,10 +12,63 @@
///
//===----------------------------------------------------------------------===//
-/*
- * TODO(jfb): Add the following.
- *
- * call_direct: call function directly
- * call_indirect: call function indirectly
- * addressof: obtain a function pointer value for a given function
- */
+// TODO: addr64: These currently assume the callee address is 32-bit.
+
+let Defs = [ARGUMENTS] in {
+
+// Call sequence markers. These have an immediate which represents the amount of
+// stack space to allocate or free, which is used for varargs lowering.
+let Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1 in {
+def ADJCALLSTACKDOWN : I<(outs), (ins i32imm:$amt),
+ [(WebAssemblycallseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : I<(outs), (ins i32imm:$amt, i32imm:$amt2),
+ [(WebAssemblycallseq_end timm:$amt, timm:$amt2)]>;
+} // isCodeGenOnly = 1
+
+multiclass CALL<WebAssemblyRegClass vt, string prefix> {
+ def CALL_#vt : I<(outs vt:$dst), (ins i32imm:$callee, variable_ops),
+ [(set vt:$dst, (WebAssemblycall1 (i32 imm:$callee)))],
+ !strconcat(prefix, "call\t$dst, $callee")>;
+ def CALL_INDIRECT_#vt : I<(outs vt:$dst), (ins I32:$callee, variable_ops),
+ [(set vt:$dst, (WebAssemblycall1 I32:$callee))],
+ !strconcat(prefix, "call_indirect\t$dst, $callee")>;
+}
+let Uses = [SP32, SP64], isCall = 1 in {
+ defm : CALL<I32, "i32.">;
+ defm : CALL<I64, "i64.">;
+ defm : CALL<F32, "f32.">;
+ defm : CALL<F64, "f64.">;
+
+ def CALL_VOID : I<(outs), (ins i32imm:$callee, variable_ops),
+ [(WebAssemblycall0 (i32 imm:$callee))],
+ "call \t$callee">;
+ def CALL_INDIRECT_VOID : I<(outs), (ins I32:$callee, variable_ops),
+ [(WebAssemblycall0 I32:$callee)],
+ "call_indirect\t$callee">;
+} // Uses = [SP32,SP64], isCall = 1
+
+} // Defs = [ARGUMENTS]
+
+// Patterns for matching a direct call to a global address.
+def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+ (CALL_I32 tglobaladdr:$callee)>;
+def : Pat<(i64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+ (CALL_I64 tglobaladdr:$callee)>;
+def : Pat<(f32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+ (CALL_F32 tglobaladdr:$callee)>;
+def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+ (CALL_F64 tglobaladdr:$callee)>;
+def : Pat<(WebAssemblycall0 (WebAssemblywrapper tglobaladdr:$callee)),
+ (CALL_VOID tglobaladdr:$callee)>;
+
+// Patterns for matching a direct call to an external symbol.
+def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+ (CALL_I32 texternalsym:$callee)>;
+def : Pat<(i64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+ (CALL_I64 texternalsym:$callee)>;
+def : Pat<(f32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+ (CALL_F32 texternalsym:$callee)>;
+def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+ (CALL_F64 texternalsym:$callee)>;
+def : Pat<(WebAssemblycall0 (WebAssemblywrapper texternalsym:$callee)),
+ (CALL_VOID texternalsym:$callee)>;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
new file mode 100644
index 000000000000..05efe8903413
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -0,0 +1,82 @@
+//=- WebAssemblyInstrControl.td - WebAssembly control-flow ---*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly control-flow code-gen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+let Defs = [ARGUMENTS] in {
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in {
+// The condition operand is a boolean value which WebAssembly represents as i32.
+def BR_IF : I<(outs), (ins I32:$cond, bb_op:$dst),
+ [(brcond I32:$cond, bb:$dst)],
+ "br_if \t$cond, $dst">;
+let isCodeGenOnly = 1 in
+def BR_UNLESS : I<(outs), (ins I32:$cond, bb_op:$dst), [],
+ "br_unless\t$cond, $dst">;
+let isBarrier = 1 in {
+def BR : I<(outs), (ins bb_op:$dst),
+ [(br bb:$dst)],
+ "br \t$dst">;
+} // isBarrier = 1
+} // isBranch = 1, isTerminator = 1, hasCtrlDep = 1
+
+} // Defs = [ARGUMENTS]
+
+def : Pat<(brcond (i32 (setne I32:$cond, 0)), bb:$dst),
+ (BR_IF I32:$cond, bb_op:$dst)>;
+def : Pat<(brcond (i32 (seteq I32:$cond, 0)), bb:$dst),
+ (BR_UNLESS I32:$cond, bb_op:$dst)>;
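+
+// Illustrative example (an assumption about typical input, not from this
+// patch): IR like
+//   %c = icmp eq i32 %x, 0
+//   br i1 %c, label %a, label %b
+// produces (brcond (seteq I32:$x, 0), bb:$a), which the second pattern above
+// selects as a single BR_UNLESS on $x, with no separate i32.eq.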
+
+let Defs = [ARGUMENTS] in {
+
+// TODO: SelectionDAG's lowering insists on using a pointer as the index for
+// jump tables, so in practice we don't ever use TABLESWITCH_I64 in wasm32 mode
+// currently.
+let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
+def TABLESWITCH_I32 : I<(outs), (ins I32:$index, bb_op:$default, variable_ops),
+ [(WebAssemblytableswitch I32:$index, bb:$default)],
+ "tableswitch\t$index, $default">;
+def TABLESWITCH_I64 : I<(outs), (ins I64:$index, bb_op:$default, variable_ops),
+ [(WebAssemblytableswitch I64:$index, bb:$default)],
+ "tableswitch\t$index, $default">;
+} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
+
+// Placemarkers to indicate the start of a block or loop scope. These
+// use/clobber EXPR_STACK to prevent them from being moved into the middle of
+// an expression tree.
+let Uses = [EXPR_STACK], Defs = [EXPR_STACK] in {
+def BLOCK : I<(outs), (ins bb_op:$dst), [], "block \t$dst">;
+def LOOP : I<(outs), (ins bb_op:$dst), [], "loop \t$dst">;
+} // Uses = [EXPR_STACK], Defs = [EXPR_STACK]
+
+// No-op to indicate to the AsmPrinter that a loop ends here, so a basic block
+// label is needed even if it wouldn't otherwise appear to be needed.
+let isTerminator = 1, hasCtrlDep = 1 in
+def LOOP_END : I<(outs), (ins), []>;
+
+multiclass RETURN<WebAssemblyRegClass vt> {
+ def RETURN_#vt : I<(outs), (ins vt:$val), [(WebAssemblyreturn vt:$val)],
+ "return \t$val">;
+}
+
+let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
+let isReturn = 1 in {
+ defm : RETURN<I32>;
+ defm : RETURN<I64>;
+ defm : RETURN<F32>;
+ defm : RETURN<F64>;
+ def RETURN_VOID : I<(outs), (ins), [(WebAssemblyreturn)], "return">;
+} // isReturn = 1
+ def UNREACHABLE : I<(outs), (ins), [(trap)], "unreachable">;
+} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
+
+} // Defs = [ARGUMENTS]
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/lib/Target/WebAssembly/WebAssemblyInstrConv.td
index 3fa29061b1de..931f4a913d0f 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrConv.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrConv.td
@@ -13,32 +13,99 @@
///
//===----------------------------------------------------------------------===//
-/*
- * TODO(jfb): Add the following.
- *
- * int32.wrap[int64]: wrap a 64-bit integer to a 32-bit integer
- * int32.trunc_signed[float32]: truncate a 32-bit float to a signed 32-bit integer
- * int32.trunc_signed[float64]: truncate a 64-bit float to a signed 32-bit integer
- * int32.trunc_unsigned[float32]: truncate a 32-bit float to an unsigned 32-bit integer
- * int32.trunc_unsigned[float64]: truncate a 64-bit float to an unsigned 32-bit integer
- * int32.reinterpret[float32]: reinterpret the bits of a 32-bit float as a 32-bit integer
- * int64.extend_signed[int32]: extend a signed 32-bit integer to a 64-bit integer
- * int64.extend_unsigned[int32]: extend an unsigned 32-bit integer to a 64-bit integer
- * int64.trunc_signed[float32]: truncate a 32-bit float to a signed 64-bit integer
- * int64.trunc_signed[float64]: truncate a 64-bit float to a signed 64-bit integer
- * int64.trunc_unsigned[float32]: truncate a 32-bit float to an unsigned 64-bit integer
- * int64.trunc_unsigned[float64]: truncate a 64-bit float to an unsigned 64-bit integer
- * int64.reinterpret[float64]: reinterpret the bits of a 64-bit float as a 64-bit integer
- * float32.demote[float64]: demote a 64-bit float to a 32-bit float
- * float32.cvt_signed[int32]: convert a signed 32-bit integer to a 32-bit float
- * float32.cvt_signed[int64]: convert a signed 64-bit integer to a 32-bit float
- * float32.cvt_unsigned[int32]: convert an unsigned 32-bit integer to a 32-bit float
- * float32.cvt_unsigned[int64]: convert an unsigned 64-bit integer to a 32-bit float
- * float32.reinterpret[int32]: reinterpret the bits of a 32-bit integer as a 32-bit float
- * float64.promote[float32]: promote a 32-bit float to a 64-bit float
- * float64.cvt_signed[int32]: convert a signed 32-bit integer to a 64-bit float
- * float64.cvt_signed[int64]: convert a signed 64-bit integer to a 64-bit float
- * float64.cvt_unsigned[int32]: convert an unsigned 32-bit integer to a 64-bit float
- * float64.cvt_unsigned[int64]: convert an unsigned 64-bit integer to a 64-bit float
- * float64.reinterpret[int64]: reinterpret the bits of a 64-bit integer as a 64-bit float
- */
+let Defs = [ARGUMENTS] in {
+
+def I32_WRAP_I64 : I<(outs I32:$dst), (ins I64:$src),
+ [(set I32:$dst, (trunc I64:$src))],
+ "i32.wrap/i64\t$dst, $src">;
+
+def I64_EXTEND_S_I32 : I<(outs I64:$dst), (ins I32:$src),
+ [(set I64:$dst, (sext I32:$src))],
+ "i64.extend_s/i32\t$dst, $src">;
+def I64_EXTEND_U_I32 : I<(outs I64:$dst), (ins I32:$src),
+ [(set I64:$dst, (zext I32:$src))],
+ "i64.extend_u/i32\t$dst, $src">;
+
+} // defs = [ARGUMENTS]
+
+// Expand a "don't care" extend into zero-extend (chosen over sign-extend
+// somewhat arbitrarily, although it favors popular hardware architectures
+// and is conceptually a simpler operation).
+def : Pat<(i64 (anyext I32:$src)), (I64_EXTEND_U_I32 I32:$src)>;
+
+let Defs = [ARGUMENTS] in {
+
+// Conversion from floating point to integer traps on overflow and invalid.
+let hasSideEffects = 1 in {
+def I32_TRUNC_S_F32 : I<(outs I32:$dst), (ins F32:$src),
+ [(set I32:$dst, (fp_to_sint F32:$src))],
+ "i32.trunc_s/f32\t$dst, $src">;
+def I32_TRUNC_U_F32 : I<(outs I32:$dst), (ins F32:$src),
+ [(set I32:$dst, (fp_to_uint F32:$src))],
+ "i32.trunc_u/f32\t$dst, $src">;
+def I64_TRUNC_S_F32 : I<(outs I64:$dst), (ins F32:$src),
+ [(set I64:$dst, (fp_to_sint F32:$src))],
+ "i64.trunc_s/f32\t$dst, $src">;
+def I64_TRUNC_U_F32 : I<(outs I64:$dst), (ins F32:$src),
+ [(set I64:$dst, (fp_to_uint F32:$src))],
+ "i64.trunc_u/f32\t$dst, $src">;
+def I32_TRUNC_S_F64 : I<(outs I32:$dst), (ins F64:$src),
+ [(set I32:$dst, (fp_to_sint F64:$src))],
+ "i32.trunc_s/f64\t$dst, $src">;
+def I32_TRUNC_U_F64 : I<(outs I32:$dst), (ins F64:$src),
+ [(set I32:$dst, (fp_to_uint F64:$src))],
+ "i32.trunc_u/f64\t$dst, $src">;
+def I64_TRUNC_S_F64 : I<(outs I64:$dst), (ins F64:$src),
+ [(set I64:$dst, (fp_to_sint F64:$src))],
+ "i64.trunc_s/f64\t$dst, $src">;
+def I64_TRUNC_U_F64 : I<(outs I64:$dst), (ins F64:$src),
+ [(set I64:$dst, (fp_to_uint F64:$src))],
+ "i64.trunc_u/f64\t$dst, $src">;
+} // hasSideEffects = 1
+
+def F32_CONVERT_S_I32 : I<(outs F32:$dst), (ins I32:$src),
+ [(set F32:$dst, (sint_to_fp I32:$src))],
+ "f32.convert_s/i32\t$dst, $src">;
+def F32_CONVERT_U_I32 : I<(outs F32:$dst), (ins I32:$src),
+ [(set F32:$dst, (uint_to_fp I32:$src))],
+ "f32.convert_u/i32\t$dst, $src">;
+def F64_CONVERT_S_I32 : I<(outs F64:$dst), (ins I32:$src),
+ [(set F64:$dst, (sint_to_fp I32:$src))],
+ "f64.convert_s/i32\t$dst, $src">;
+def F64_CONVERT_U_I32 : I<(outs F64:$dst), (ins I32:$src),
+ [(set F64:$dst, (uint_to_fp I32:$src))],
+ "f64.convert_u/i32\t$dst, $src">;
+def F32_CONVERT_S_I64 : I<(outs F32:$dst), (ins I64:$src),
+ [(set F32:$dst, (sint_to_fp I64:$src))],
+ "f32.convert_s/i64\t$dst, $src">;
+def F32_CONVERT_U_I64 : I<(outs F32:$dst), (ins I64:$src),
+ [(set F32:$dst, (uint_to_fp I64:$src))],
+ "f32.convert_u/i64\t$dst, $src">;
+def F64_CONVERT_S_I64 : I<(outs F64:$dst), (ins I64:$src),
+ [(set F64:$dst, (sint_to_fp I64:$src))],
+ "f64.convert_s/i64\t$dst, $src">;
+def F64_CONVERT_U_I64 : I<(outs F64:$dst), (ins I64:$src),
+ [(set F64:$dst, (uint_to_fp I64:$src))],
+ "f64.convert_u/i64\t$dst, $src">;
+
+def F64_PROMOTE_F32 : I<(outs F64:$dst), (ins F32:$src),
+ [(set F64:$dst, (fextend F32:$src))],
+ "f64.promote/f32\t$dst, $src">;
+def F32_DEMOTE_F64 : I<(outs F32:$dst), (ins F64:$src),
+ [(set F32:$dst, (fround F64:$src))],
+ "f32.demote/f64\t$dst, $src">;
+
+def I32_REINTERPRET_F32 : I<(outs I32:$dst), (ins F32:$src),
+ [(set I32:$dst, (bitconvert F32:$src))],
+ "i32.reinterpret/f32\t$dst, $src">;
+def F32_REINTERPRET_I32 : I<(outs F32:$dst), (ins I32:$src),
+ [(set F32:$dst, (bitconvert I32:$src))],
+ "f32.reinterpret/i32\t$dst, $src">;
+def I64_REINTERPRET_F64 : I<(outs I64:$dst), (ins F64:$src),
+ [(set I64:$dst, (bitconvert F64:$src))],
+ "i64.reinterpret/f64\t$dst, $src">;
+def F64_REINTERPRET_I64 : I<(outs F64:$dst), (ins I64:$src),
+ [(set F64:$dst, (bitconvert I64:$src))],
+ "f64.reinterpret/i64\t$dst, $src">;
+
+} // Defs = [ARGUMENTS]
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
index 30ef6339d65a..5520c6de6732 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
@@ -12,33 +12,90 @@
///
//===----------------------------------------------------------------------===//
-defm FADD : BinaryFP<fadd>;
-defm FSUB : BinaryFP<fsub>;
-defm FMUL : BinaryFP<fmul>;
-defm FDIV : BinaryFP<fdiv>;
-defm FABS : UnaryFP<fabs>;
-defm FNEG : UnaryFP<fneg>;
-defm COPYSIGN : BinaryFP<fcopysign>;
-defm CEIL : UnaryFP<fceil>;
-defm FLOOR : UnaryFP<ffloor>;
-defm TRUNC : UnaryFP<ftrunc>;
-defm NEARESTINT : UnaryFP<fnearbyint>;
-
-/*
- * TODO(jfb): Add the following for 32-bit and 64-bit.
- *
- * float32.eq: compare equal
- * float32.lt: less than
- * float32.le: less than or equal
- * float32.gt: greater than
- * float32.ge: greater than or equal
- */
-
-defm SQRT : UnaryFP<fsqrt>;
-
-/*
- * TODO(jfb): Add the following for 32-bit and 64-bit.
- *
- * float32.min: minimum (binary operator); if either operand is NaN, returns NaN
- * float32.max: maximum (binary operator); if either operand is NaN, returns NaN
- */
+let Defs = [ARGUMENTS] in {
+
+let isCommutable = 1 in
+defm ADD : BinaryFP<fadd, "add ">;
+defm SUB : BinaryFP<fsub, "sub ">;
+let isCommutable = 1 in
+defm MUL : BinaryFP<fmul, "mul ">;
+defm DIV : BinaryFP<fdiv, "div ">;
+defm SQRT : UnaryFP<fsqrt, "sqrt">;
+
+defm ABS : UnaryFP<fabs, "abs ">;
+defm NEG : UnaryFP<fneg, "neg ">;
+defm COPYSIGN : BinaryFP<fcopysign, "copysign">;
+
+let isCommutable = 1 in {
+defm MIN : BinaryFP<fminnan, "min ">;
+defm MAX : BinaryFP<fmaxnan, "max ">;
+} // isCommutable = 1
+
+defm CEIL : UnaryFP<fceil, "ceil">;
+defm FLOOR : UnaryFP<ffloor, "floor">;
+defm TRUNC : UnaryFP<ftrunc, "trunc">;
+defm NEAREST : UnaryFP<fnearbyint, "nearest">;
+
+} // Defs = [ARGUMENTS]
+
+// DAGCombine oddly folds casts into the rhs of copysign. Unfold them.
+def : Pat<(fcopysign F64:$lhs, F32:$rhs),
+ (COPYSIGN_F64 F64:$lhs, (F64_PROMOTE_F32 F32:$rhs))>;
+def : Pat<(fcopysign F32:$lhs, F64:$rhs),
+ (COPYSIGN_F32 F32:$lhs, (F32_DEMOTE_F64 F64:$rhs))>;
+
+// WebAssembly doesn't expose inexact exceptions, so map frint to fnearbyint.
+def : Pat<(frint f32:$src), (NEAREST_F32 f32:$src)>;
+def : Pat<(frint f64:$src), (NEAREST_F64 f64:$src)>;
+
+let Defs = [ARGUMENTS] in {
+
+let isCommutable = 1 in {
+defm EQ : ComparisonFP<SETOEQ, "eq ">;
+defm NE : ComparisonFP<SETUNE, "ne ">;
+} // isCommutable = 1
+defm LT : ComparisonFP<SETOLT, "lt ">;
+defm LE : ComparisonFP<SETOLE, "le ">;
+defm GT : ComparisonFP<SETOGT, "gt ">;
+defm GE : ComparisonFP<SETOGE, "ge ">;
+
+} // Defs = [ARGUMENTS]
+
+// "Don't care" floating-point comparisons are supported via other comparisons.
+def : Pat<(seteq f32:$lhs, f32:$rhs), (EQ_F32 f32:$lhs, f32:$rhs)>;
+def : Pat<(setne f32:$lhs, f32:$rhs), (NE_F32 f32:$lhs, f32:$rhs)>;
+def : Pat<(setlt f32:$lhs, f32:$rhs), (LT_F32 f32:$lhs, f32:$rhs)>;
+def : Pat<(setle f32:$lhs, f32:$rhs), (LE_F32 f32:$lhs, f32:$rhs)>;
+def : Pat<(setgt f32:$lhs, f32:$rhs), (GT_F32 f32:$lhs, f32:$rhs)>;
+def : Pat<(setge f32:$lhs, f32:$rhs), (GE_F32 f32:$lhs, f32:$rhs)>;
+def : Pat<(seteq f64:$lhs, f64:$rhs), (EQ_F64 f64:$lhs, f64:$rhs)>;
+def : Pat<(setne f64:$lhs, f64:$rhs), (NE_F64 f64:$lhs, f64:$rhs)>;
+def : Pat<(setlt f64:$lhs, f64:$rhs), (LT_F64 f64:$lhs, f64:$rhs)>;
+def : Pat<(setle f64:$lhs, f64:$rhs), (LE_F64 f64:$lhs, f64:$rhs)>;
+def : Pat<(setgt f64:$lhs, f64:$rhs), (GT_F64 f64:$lhs, f64:$rhs)>;
+def : Pat<(setge f64:$lhs, f64:$rhs), (GE_F64 f64:$lhs, f64:$rhs)>;
+
+let Defs = [ARGUMENTS] in {
+
+def SELECT_F32 : I<(outs F32:$dst), (ins I32:$cond, F32:$lhs, F32:$rhs),
+ [(set F32:$dst, (select I32:$cond, F32:$lhs, F32:$rhs))],
+ "f32.select\t$dst, $cond, $lhs, $rhs">;
+def SELECT_F64 : I<(outs F64:$dst), (ins I32:$cond, F64:$lhs, F64:$rhs),
+ [(set F64:$dst, (select I32:$cond, F64:$lhs, F64:$rhs))],
+ "f64.select\t$dst, $cond, $lhs, $rhs">;
+
+} // Defs = [ARGUMENTS]
+
+// ISD::SELECT requires its operand to conform to getBooleanContents, but
+// WebAssembly's select interprets any non-zero value as true, so we can fold
+// a setne with 0 into a select.
+def : Pat<(select (i32 (setne I32:$cond, 0)), F32:$lhs, F32:$rhs),
+ (SELECT_F32 I32:$cond, F32:$lhs, F32:$rhs)>;
+def : Pat<(select (i32 (setne I32:$cond, 0)), F64:$lhs, F64:$rhs),
+ (SELECT_F64 I32:$cond, F64:$lhs, F64:$rhs)>;
+
+// And again, this time with seteq instead of setne and the arms reversed.
+def : Pat<(select (i32 (seteq I32:$cond, 0)), F32:$lhs, F32:$rhs),
+ (SELECT_F32 I32:$cond, F32:$rhs, F32:$lhs)>;
+def : Pat<(select (i32 (seteq I32:$cond, 0)), F64:$lhs, F64:$rhs),
+ (SELECT_F64 I32:$cond, F64:$rhs, F64:$lhs)>;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
index 513c36fa2ec2..8008dd32353a 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
@@ -1,4 +1,4 @@
-// WebAssemblyInstrFormats.td - WebAssembly Instruction Formats -*- tblgen -*-//
+//=- WebAssemblyInstrFormats.td - WebAssembly Instr. Formats -*- tablegen -*-=//
//
// The LLVM Compiler Infrastructure
//
@@ -12,44 +12,68 @@
///
//===----------------------------------------------------------------------===//
-// WebAssembly Instruction Format
-class WebAssemblyInst<string cstr> : Instruction {
+// WebAssembly Instruction Format.
+class WebAssemblyInst<string asmstr> : Instruction {
field bits<0> Inst; // Instruction encoding.
let Namespace = "WebAssembly";
let Pattern = [];
- let Constraints = cstr;
+ let AsmString = asmstr;
}
-// Normal instructions
-class I<dag oops, dag iops, list<dag> pattern, string cstr = "">
- : WebAssemblyInst<cstr> {
+// Normal instructions.
+class I<dag oops, dag iops, list<dag> pattern, string asmstr = "">
+ : WebAssemblyInst<asmstr> {
dag OutOperandList = oops;
dag InOperandList = iops;
let Pattern = pattern;
}
// Unary and binary instructions, for the local types that WebAssembly supports.
-multiclass UnaryInt<SDNode node> {
- def _I32 : I<(outs Int32:$dst), (ins Int32:$src),
- [(set Int32:$dst, (node Int32:$src))]>;
- def _I64 : I<(outs Int64:$dst), (ins Int64:$src),
- [(set Int64:$dst, (node Int64:$src))]>;
-}
-multiclass BinaryInt<SDNode node> {
- def _I32 : I<(outs Int32:$dst), (ins Int32:$lhs, Int32:$rhs),
- [(set Int32:$dst, (node Int32:$lhs, Int32:$rhs))]>;
- def _I64 : I<(outs Int64:$dst), (ins Int64:$lhs, Int64:$rhs),
- [(set Int64:$dst, (node Int64:$lhs, Int64:$rhs))]>;
-}
-multiclass UnaryFP<SDNode node> {
- def _F32 : I<(outs Float32:$dst), (ins Float32:$src),
- [(set Float32:$dst, (node Float32:$src))]>;
- def _F64 : I<(outs Float64:$dst), (ins Float64:$src),
- [(set Float64:$dst, (node Float64:$src))]>;
-}
-multiclass BinaryFP<SDNode node> {
- def _F32 : I<(outs Float32:$dst), (ins Float32:$lhs, Float32:$rhs),
- [(set Float32:$dst, (node Float32:$lhs, Float32:$rhs))]>;
- def _F64 : I<(outs Float64:$dst), (ins Float64:$lhs, Float64:$rhs),
- [(set Float64:$dst, (node Float64:$lhs, Float64:$rhs))]>;
+multiclass UnaryInt<SDNode node, string name> {
+ def _I32 : I<(outs I32:$dst), (ins I32:$src),
+ [(set I32:$dst, (node I32:$src))],
+ !strconcat("i32.", !strconcat(name, "\t$dst, $src"))>;
+ def _I64 : I<(outs I64:$dst), (ins I64:$src),
+ [(set I64:$dst, (node I64:$src))],
+ !strconcat("i64.", !strconcat(name, "\t$dst, $src"))>;
+}
+multiclass BinaryInt<SDNode node, string name> {
+ def _I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs),
+ [(set I32:$dst, (node I32:$lhs, I32:$rhs))],
+ !strconcat("i32.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
+ def _I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs),
+ [(set I64:$dst, (node I64:$lhs, I64:$rhs))],
+ !strconcat("i64.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
+}
+multiclass UnaryFP<SDNode node, string name> {
+ def _F32 : I<(outs F32:$dst), (ins F32:$src),
+ [(set F32:$dst, (node F32:$src))],
+ !strconcat("f32.", !strconcat(name, "\t$dst, $src"))>;
+ def _F64 : I<(outs F64:$dst), (ins F64:$src),
+ [(set F64:$dst, (node F64:$src))],
+ !strconcat("f64.", !strconcat(name, "\t$dst, $src"))>;
+}
+multiclass BinaryFP<SDNode node, string name> {
+ def _F32 : I<(outs F32:$dst), (ins F32:$lhs, F32:$rhs),
+ [(set F32:$dst, (node F32:$lhs, F32:$rhs))],
+ !strconcat("f32.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
+ def _F64 : I<(outs F64:$dst), (ins F64:$lhs, F64:$rhs),
+ [(set F64:$dst, (node F64:$lhs, F64:$rhs))],
+ !strconcat("f64.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
+}
+multiclass ComparisonInt<CondCode cond, string name> {
+ def _I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs),
+ [(set I32:$dst, (setcc I32:$lhs, I32:$rhs, cond))],
+ !strconcat("i32.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
+ def _I64 : I<(outs I32:$dst), (ins I64:$lhs, I64:$rhs),
+ [(set I32:$dst, (setcc I64:$lhs, I64:$rhs, cond))],
+ !strconcat("i64.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
+}
+multiclass ComparisonFP<CondCode cond, string name> {
+ def _F32 : I<(outs I32:$dst), (ins F32:$lhs, F32:$rhs),
+ [(set I32:$dst, (setcc F32:$lhs, F32:$rhs, cond))],
+ !strconcat("f32.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
+ def _F64 : I<(outs I32:$dst), (ins F64:$lhs, F64:$rhs),
+ [(set I32:$dst, (setcc F64:$lhs, F64:$rhs, cond))],
+ !strconcat("f64.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
}
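+
+// For illustration (an assumption about the TableGen expansion, consistent
+// with the _I32/_I64 suffixes above): a use such as
+//   defm EQ : ComparisonInt<SETEQ, "eq ">;
+// in WebAssemblyInstrInteger.td expands to EQ_I32 and EQ_I64, each setting an
+// I32 result from a setcc on the corresponding integer operand type.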
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
index ea8937c8f9f2..5e7663cdb506 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -24,5 +24,136 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-instr-info"
+#define GET_INSTRINFO_CTOR_DTOR
+#include "WebAssemblyGenInstrInfo.inc"
+
WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI)
- : RI(STI.getTargetTriple()) {}
+ : WebAssemblyGenInstrInfo(WebAssembly::ADJCALLSTACKDOWN,
+ WebAssembly::ADJCALLSTACKUP),
+ RI(STI.getTargetTriple()) {}
+
+void WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ DebugLoc DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) const {
+ // This method is called by post-RA expansion, which expects only physical
+ // registers to exist. However, we need to handle both physical and virtual
+ // registers here.
+ auto &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterClass *RC = TargetRegisterInfo::isVirtualRegister(DestReg) ?
+ MRI.getRegClass(DestReg) :
+ MRI.getTargetRegisterInfo()->getMinimalPhysRegClass(SrcReg);
+
+ unsigned CopyLocalOpcode;
+ if (RC == &WebAssembly::I32RegClass)
+ CopyLocalOpcode = WebAssembly::COPY_LOCAL_I32;
+ else if (RC == &WebAssembly::I64RegClass)
+ CopyLocalOpcode = WebAssembly::COPY_LOCAL_I64;
+ else if (RC == &WebAssembly::F32RegClass)
+ CopyLocalOpcode = WebAssembly::COPY_LOCAL_F32;
+ else if (RC == &WebAssembly::F64RegClass)
+ CopyLocalOpcode = WebAssembly::COPY_LOCAL_F64;
+ else
+ llvm_unreachable("Unexpected register class");
+
+ BuildMI(MBB, I, DL, get(CopyLocalOpcode), DestReg)
+ .addReg(SrcReg, KillSrc ? RegState::Kill : 0);
+}
+
+// Branch analysis.
+bool WebAssemblyInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool /*AllowModify*/) const {
+ bool HaveCond = false;
+ for (MachineInstr &MI : MBB.terminators()) {
+ switch (MI.getOpcode()) {
+ default:
+ // Unhandled instruction; bail out.
+ return true;
+ case WebAssembly::BR_IF:
+ if (HaveCond)
+ return true;
+ Cond.push_back(MachineOperand::CreateImm(true));
+ Cond.push_back(MI.getOperand(0));
+ TBB = MI.getOperand(1).getMBB();
+ HaveCond = true;
+ break;
+ case WebAssembly::BR_UNLESS:
+ if (HaveCond)
+ return true;
+ Cond.push_back(MachineOperand::CreateImm(false));
+ Cond.push_back(MI.getOperand(0));
+ TBB = MI.getOperand(1).getMBB();
+ HaveCond = true;
+ break;
+ case WebAssembly::BR:
+ if (!HaveCond)
+ TBB = MI.getOperand(0).getMBB();
+ else
+ FBB = MI.getOperand(0).getMBB();
+ break;
+ }
+ if (MI.isBarrier())
+ break;
+ }
+
+ return false;
+}
+
+unsigned WebAssemblyInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+ MachineBasicBlock::instr_iterator I = MBB.instr_end();
+ unsigned Count = 0;
+
+ while (I != MBB.instr_begin()) {
+ --I;
+ if (I->isDebugValue())
+ continue;
+ if (!I->isTerminator())
+ break;
+ // Remove the branch.
+ I->eraseFromParent();
+ I = MBB.instr_end();
+ ++Count;
+ }
+
+ return Count;
+}
+
+unsigned WebAssemblyInstrInfo::InsertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ DebugLoc DL) const {
+ if (Cond.empty()) {
+ if (!TBB)
+ return 0;
+
+ BuildMI(&MBB, DL, get(WebAssembly::BR)).addMBB(TBB);
+ return 1;
+ }
+
+ assert(Cond.size() == 2 && "Expected a flag and a successor block");
+
+ if (Cond[0].getImm()) {
+ BuildMI(&MBB, DL, get(WebAssembly::BR_IF))
+ .addOperand(Cond[1])
+ .addMBB(TBB);
+ } else {
+ BuildMI(&MBB, DL, get(WebAssembly::BR_UNLESS))
+ .addOperand(Cond[1])
+ .addMBB(TBB);
+ }
+ if (!FBB)
+ return 1;
+
+ BuildMI(&MBB, DL, get(WebAssembly::BR)).addMBB(FBB);
+ return 2;
+}
+
+bool WebAssemblyInstrInfo::ReverseBranchCondition(
+ SmallVectorImpl<MachineOperand> &Cond) const {
+ assert(Cond.size() == 2 && "Expected a flag and a successor block");
+ Cond.front() = MachineOperand::CreateImm(!Cond.front().getImm());
+ return false;
+}
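+
+// Usage sketch (descriptive only): AnalyzeBranch above encodes a conditional
+// branch as Cond = {Imm(1), <condition register>} for BR_IF and
+// Cond = {Imm(0), <condition register>} for BR_UNLESS, which is why
+// ReverseBranchCondition only has to flip the leading immediate.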
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
index 1c4ae22f16d6..5ddd9b36f243 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
@@ -19,17 +19,35 @@
#include "WebAssemblyRegisterInfo.h"
#include "llvm/Target/TargetInstrInfo.h"
+#define GET_INSTRINFO_HEADER
+#include "WebAssemblyGenInstrInfo.inc"
+
namespace llvm {
class WebAssemblySubtarget;
-class WebAssemblyInstrInfo final {
+class WebAssemblyInstrInfo final : public WebAssemblyGenInstrInfo {
const WebAssemblyRegisterInfo RI;
public:
explicit WebAssemblyInstrInfo(const WebAssemblySubtarget &STI);
const WebAssemblyRegisterInfo &getRegisterInfo() const { return RI; }
+
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
+
+ bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify = false) const override;
+ unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+ unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ DebugLoc DL) const override;
+ bool
+ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
};
} // end namespace llvm
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index fe3ca76dc08a..f0b4ce7caf51 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -25,20 +25,48 @@ def HasSIMD128 : Predicate<"Subtarget->hasSIMD128()">,
// WebAssembly-specific DAG Node Types.
//===----------------------------------------------------------------------===//
+def SDT_WebAssemblyCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>]>;
+def SDT_WebAssemblyCallSeqEnd :
+ SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
+def SDT_WebAssemblyCall0 : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
+def SDT_WebAssemblyCall1 : SDTypeProfile<1, -1, [SDTCisPtrTy<1>]>;
+def SDT_WebAssemblyTableswitch : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
+def SDT_WebAssemblyArgument : SDTypeProfile<1, 1, [SDTCisVT<1, i32>]>;
+def SDT_WebAssemblyReturn : SDTypeProfile<0, -1, []>;
+def SDT_WebAssemblyWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
+ SDTCisPtrTy<0>]>;
+
//===----------------------------------------------------------------------===//
// WebAssembly-specific DAG Nodes.
//===----------------------------------------------------------------------===//
+def WebAssemblycallseq_start :
+ SDNode<"ISD::CALLSEQ_START", SDT_WebAssemblyCallSeqStart,
+ [SDNPHasChain, SDNPOutGlue]>;
+def WebAssemblycallseq_end :
+ SDNode<"ISD::CALLSEQ_END", SDT_WebAssemblyCallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def WebAssemblycall0 : SDNode<"WebAssemblyISD::CALL0",
+ SDT_WebAssemblyCall0,
+ [SDNPHasChain, SDNPVariadic]>;
+def WebAssemblycall1 : SDNode<"WebAssemblyISD::CALL1",
+ SDT_WebAssemblyCall1,
+ [SDNPHasChain, SDNPVariadic]>;
+def WebAssemblytableswitch : SDNode<"WebAssemblyISD::TABLESWITCH",
+ SDT_WebAssemblyTableswitch,
+ [SDNPHasChain, SDNPVariadic]>;
+def WebAssemblyargument : SDNode<"WebAssemblyISD::ARGUMENT",
+ SDT_WebAssemblyArgument>;
+def WebAssemblyreturn : SDNode<"WebAssemblyISD::RETURN",
+ SDT_WebAssemblyReturn, [SDNPHasChain]>;
+def WebAssemblywrapper : SDNode<"WebAssemblyISD::Wrapper",
+ SDT_WebAssemblyWrapper>;
+
//===----------------------------------------------------------------------===//
// WebAssembly-specific Operands.
//===----------------------------------------------------------------------===//
-/*
- * TODO(jfb): Add the following.
- *
- * get_local: read the current value of a local variable
- * set_local: set the current value of a local variable
-*/
+def bb_op : Operand<OtherVT>;
//===----------------------------------------------------------------------===//
// WebAssembly Instruction Format Definitions.
@@ -47,13 +75,86 @@ def HasSIMD128 : Predicate<"Subtarget->hasSIMD128()">,
include "WebAssemblyInstrFormats.td"
//===----------------------------------------------------------------------===//
+// Additional instructions.
+//===----------------------------------------------------------------------===//
+
+multiclass ARGUMENT<WebAssemblyRegClass vt> {
+ let hasSideEffects = 1, Uses = [ARGUMENTS], isCodeGenOnly = 1 in
+ def ARGUMENT_#vt : I<(outs vt:$res), (ins i32imm:$argno),
+ [(set vt:$res, (WebAssemblyargument timm:$argno))]>;
+}
+defm : ARGUMENT<I32>;
+defm : ARGUMENT<I64>;
+defm : ARGUMENT<F32>;
+defm : ARGUMENT<F64>;
+
+let Defs = [ARGUMENTS] in {
+
+// get_local and set_local are not generated by instruction selection; they
+// are implied by virtual register uses and defs in most contexts. However,
+// they are explicitly emitted for special purposes.
+multiclass LOCAL<WebAssemblyRegClass vt> {
+ def GET_LOCAL_#vt : I<(outs vt:$res), (ins i32imm:$regno), [],
+ "get_local\t$res, $regno">;
+ // TODO: set_local returns its operand value
+ def SET_LOCAL_#vt : I<(outs), (ins i32imm:$regno, vt:$src), [],
+ "set_local\t$regno, $src">;
+
+ // COPY_LOCAL is not an actual instruction in wasm, but since we allow
+ // get_local and set_local to be implicit, we can have a COPY_LOCAL which
+ // is actually a no-op because all the work is done in the implied
+ // get_local and set_local.
+ let isAsCheapAsAMove = 1 in
+ def COPY_LOCAL_#vt : I<(outs vt:$res), (ins vt:$src), [],
+ "copy_local\t$res, $src">;
+}
+defm : LOCAL<I32>;
+defm : LOCAL<I64>;
+defm : LOCAL<F32>;
+defm : LOCAL<F64>;
+
+let isMoveImm = 1 in {
+def CONST_I32 : I<(outs I32:$res), (ins i32imm:$imm),
+ [(set I32:$res, imm:$imm)],
+ "i32.const\t$res, $imm">;
+def CONST_I64 : I<(outs I64:$res), (ins i64imm:$imm),
+ [(set I64:$res, imm:$imm)],
+ "i64.const\t$res, $imm">;
+def CONST_F32 : I<(outs F32:$res), (ins f32imm:$imm),
+ [(set F32:$res, fpimm:$imm)],
+ "f32.const\t$res, $imm">;
+def CONST_F64 : I<(outs F64:$res), (ins f64imm:$imm),
+ [(set F64:$res, fpimm:$imm)],
+ "f64.const\t$res, $imm">;
+} // isMoveImm = 1
+
+} // Defs = [ARGUMENTS]
+
+def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$dst)),
+ (CONST_I32 tglobaladdr:$dst)>;
+def : Pat<(i32 (WebAssemblywrapper texternalsym:$dst)),
+ (CONST_I32 texternalsym:$dst)>;
+def : Pat<(i32 (WebAssemblywrapper tjumptable:$dst)),
+ (CONST_I32 tjumptable:$dst)>;
+
+let Defs = [ARGUMENTS] in {
+
+// Function signature and local variable declaration "instructions".
+def PARAM : I<(outs), (ins variable_ops), [], ".param \t">;
+def RESULT : I<(outs), (ins variable_ops), [], ".result \t">;
+def LOCAL : I<(outs), (ins variable_ops), [], ".local \t">;
+
+} // Defs = [ARGUMENTS]
+
+//===----------------------------------------------------------------------===//
// Additional sets of instructions.
//===----------------------------------------------------------------------===//
include "WebAssemblyInstrMemory.td"
include "WebAssemblyInstrCall.td"
+include "WebAssemblyInstrControl.td"
include "WebAssemblyInstrInteger.td"
-include "WebAssemblyInstrFloat.td"
include "WebAssemblyInstrConv.td"
+include "WebAssemblyInstrFloat.td"
include "WebAssemblyInstrAtomics.td"
include "WebAssemblyInstrSIMD.td"
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
index 5f60fe81b1a2..09e5eafb85e9 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
@@ -12,34 +12,77 @@
///
//===----------------------------------------------------------------------===//
-defm ADD : BinaryInt<add>;
-defm SUB : BinaryInt<sub>;
-defm MUL : BinaryInt<mul>;
-defm SDIV : BinaryInt<sdiv>;
-defm UDIV : BinaryInt<udiv>;
-defm SREM : BinaryInt<srem>;
-defm UREM : BinaryInt<urem>;
-defm AND : BinaryInt<and>;
-defm IOR : BinaryInt<or>;
-defm XOR : BinaryInt<xor>;
-defm SHL : BinaryInt<shl>;
-defm SHR : BinaryInt<srl>;
-defm SAR : BinaryInt<sra>;
-
-/*
- * TODO(jfb): Add the following for 32-bit and 64-bit.
- *
- * int32.eq: signed-less compare equal
- * int32.slt: signed less than
- * int32.sle: signed less than or equal
- * int32.ult: unsigned less than
- * int32.ule: unsigned less than or equal
- * int32.sgt: signed greater than
- * int32.sge: signed greater than or equal
- * int32.ugt: unsigned greater than
- * int32.uge: unsigned greater than or equal
- */
-
-defm CLZ : UnaryInt<ctlz>;
-defm CTZ : UnaryInt<cttz>;
-defm POPCNT : UnaryInt<ctpop>;
+let Defs = [ARGUMENTS] in {
+
+// The spaces after the names are for aesthetic purposes only, to make
+// operands line up vertically after tab expansion.
+let isCommutable = 1 in
+defm ADD : BinaryInt<add, "add ">;
+defm SUB : BinaryInt<sub, "sub ">;
+let isCommutable = 1 in
+defm MUL : BinaryInt<mul, "mul ">;
+// Divide and remainder trap on a zero denominator.
+let hasSideEffects = 1 in {
+defm DIV_S : BinaryInt<sdiv, "div_s">;
+defm DIV_U : BinaryInt<udiv, "div_u">;
+defm REM_S : BinaryInt<srem, "rem_s">;
+defm REM_U : BinaryInt<urem, "rem_u">;
+} // hasSideEffects = 1
+let isCommutable = 1 in {
+defm AND : BinaryInt<and, "and ">;
+defm OR : BinaryInt<or, "or ">;
+defm XOR : BinaryInt<xor, "xor ">;
+} // isCommutable = 1
+defm SHL : BinaryInt<shl, "shl ">;
+defm SHR_U : BinaryInt<srl, "shr_u">;
+defm SHR_S : BinaryInt<sra, "shr_s">;
+
+let isCommutable = 1 in {
+defm EQ : ComparisonInt<SETEQ, "eq ">;
+defm NE : ComparisonInt<SETNE, "ne ">;
+} // isCommutable = 1
+defm LT_S : ComparisonInt<SETLT, "lt_s">;
+defm LE_S : ComparisonInt<SETLE, "le_s">;
+defm LT_U : ComparisonInt<SETULT, "lt_u">;
+defm LE_U : ComparisonInt<SETULE, "le_u">;
+defm GT_S : ComparisonInt<SETGT, "gt_s">;
+defm GE_S : ComparisonInt<SETGE, "ge_s">;
+defm GT_U : ComparisonInt<SETUGT, "gt_u">;
+defm GE_U : ComparisonInt<SETUGE, "ge_u">;
+
+defm CLZ : UnaryInt<ctlz, "clz ">;
+defm CTZ : UnaryInt<cttz, "ctz ">;
+defm POPCNT : UnaryInt<ctpop, "popcnt">;
+
+} // Defs = [ARGUMENTS]
+
+// Expand the "don't care" operations to supported operations.
+def : Pat<(ctlz_zero_undef I32:$src), (CLZ_I32 I32:$src)>;
+def : Pat<(ctlz_zero_undef I64:$src), (CLZ_I64 I64:$src)>;
+def : Pat<(cttz_zero_undef I32:$src), (CTZ_I32 I32:$src)>;
+def : Pat<(cttz_zero_undef I64:$src), (CTZ_I64 I64:$src)>;
+
+let Defs = [ARGUMENTS] in {
+
+def SELECT_I32 : I<(outs I32:$dst), (ins I32:$cond, I32:$lhs, I32:$rhs),
+ [(set I32:$dst, (select I32:$cond, I32:$lhs, I32:$rhs))],
+ "i32.select\t$dst, $cond, $lhs, $rhs">;
+def SELECT_I64 : I<(outs I64:$dst), (ins I32:$cond, I64:$lhs, I64:$rhs),
+ [(set I64:$dst, (select I32:$cond, I64:$lhs, I64:$rhs))],
+ "i64.select\t$dst, $cond, $lhs, $rhs">;
+
+} // Defs = [ARGUMENTS]
+
+// ISD::SELECT requires its operand to conform to getBooleanContents, but
+// WebAssembly's select interprets any non-zero value as true, so we can fold
+// a setne with 0 into a select.
+def : Pat<(select (i32 (setne I32:$cond, 0)), I32:$lhs, I32:$rhs),
+ (SELECT_I32 I32:$cond, I32:$lhs, I32:$rhs)>;
+def : Pat<(select (i32 (setne I32:$cond, 0)), I64:$lhs, I64:$rhs),
+ (SELECT_I64 I32:$cond, I64:$lhs, I64:$rhs)>;
+
+// And again, this time with seteq instead of setne and the arms reversed.
+def : Pat<(select (i32 (seteq I32:$cond, 0)), I32:$lhs, I32:$rhs),
+ (SELECT_I32 I32:$cond, I32:$rhs, I32:$lhs)>;
+def : Pat<(select (i32 (seteq I32:$cond, 0)), I64:$lhs, I64:$rhs),
+ (SELECT_I64 I32:$cond, I64:$rhs, I64:$lhs)>;
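+
+// Illustrative example (an assumption, not from this patch): C code such as
+//   return x ? a : b;
+// typically reaches instruction selection as (select (setne I32:$x, 0), $a,
+// $b), which the first pattern above collapses to a single i32.select on $x
+// with no explicit comparison.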
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
index 5ab40e826caa..74ec45d58644 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
@@ -12,35 +12,500 @@
///
//===----------------------------------------------------------------------===//
-/*
- * TODO(jfb): Add the following.
- * Each has optional alignment and immediate byte offset.
- *
- * int32.load_sx[int8]: sign-extend to int32
- * int32.load_sx[int16]: sign-extend to int32
- * int32.load_zx[int8]: zero-extend to int32
- * int32.load_zx[int16]: zero-extend to int32
- * int32.load[int32]: (no conversion)
- * int64.load_sx[int8]: sign-extend to int64
- * int64.load_sx[int16]: sign-extend to int64
- * int64.load_sx[int32]: sign-extend to int64
- * int64.load_zx[int8]: zero-extend to int64
- * int64.load_zx[int16]: zero-extend to int64
- * int64.load_zx[int32]: zero-extend to int64
- * int64.load[int64]: (no conversion)
- * float32.load[float32]: (no conversion)
- * float64.load[float64]: (no conversion)
- *
- * int32.store[int8]: wrap int32 to int8
- * int32.store[int16]: wrap int32 to int16
- * int32.store[int32]: (no conversion)
- * int64.store[int8]: wrap int64 to int8
- * int64.store[int16]: wrap int64 to int16
- * int64.store[int32]: wrap int64 to int32
- * int64.store[int64]: (no conversion)
- * float32.store[float32]: (no conversion)
- * float64.store[float64]: (no conversion)
- *
- * load_global: load the value of a given global variable
- * store_global: store a given value to a given global variable
- */
+// TODO:
+// - HasAddr64
+// - WebAssemblyTargetLowering having to do with atomics
+// - Each has optional alignment.
+
+// WebAssembly has i8/i16/i32/i64/f32/f64 memory types, but doesn't have i8/i16
+// local types. These memory-only types instead zero- or sign-extend into local
+// types when loading, and truncate when storing.
+
+// WebAssembly constant offsets are performed as unsigned with infinite
+// precision, so we need to check for NoUnsignedWrap so that we don't fold an
+// offset for an add that needs wrapping.
+def regPlusImm : PatFrag<(ops node:$off, node:$addr),
+ (add node:$addr, node:$off),
+ [{ return N->getFlags()->hasNoUnsignedWrap(); }]>;
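+
+// Example of why the wrap check matters (illustrative): given (add I32:$addr,
+// -8) without the no-unsigned-wrap flag, folding -8 into the load offset would
+// yield the unsigned offset 0xFFFFFFF8; since wasm adds the offset to the base
+// with effectively infinite precision, that address would not wrap back
+// around, so the add must be kept instead of folded.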
+
+let Defs = [ARGUMENTS] in {
+
+// Basic load.
+def LOAD_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [],
+ "i32.load\t$dst, ${off}(${addr})">;
+def LOAD_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [],
+ "i64.load\t$dst, ${off}(${addr})">;
+def LOAD_F32 : I<(outs F32:$dst), (ins i32imm:$off, I32:$addr), [],
+ "f32.load\t$dst, ${off}(${addr})">;
+def LOAD_F64 : I<(outs F64:$dst), (ins i32imm:$off, I32:$addr), [],
+ "f64.load\t$dst, ${off}(${addr})">;
+
+} // Defs = [ARGUMENTS]
+
+// Select loads with no constant offset.
+def : Pat<(i32 (load I32:$addr)), (LOAD_I32 0, $addr)>;
+def : Pat<(i64 (load I32:$addr)), (LOAD_I64 0, $addr)>;
+def : Pat<(f32 (load I32:$addr)), (LOAD_F32 0, $addr)>;
+def : Pat<(f64 (load I32:$addr)), (LOAD_F64 0, $addr)>;
+
+// Select loads with a constant offset.
+def : Pat<(i32 (load (regPlusImm imm:$off, I32:$addr))),
+ (LOAD_I32 imm:$off, $addr)>;
+def : Pat<(i64 (load (regPlusImm imm:$off, I32:$addr))),
+ (LOAD_I64 imm:$off, $addr)>;
+def : Pat<(f32 (load (regPlusImm imm:$off, I32:$addr))),
+ (LOAD_F32 imm:$off, $addr)>;
+def : Pat<(f64 (load (regPlusImm imm:$off, I32:$addr))),
+ (LOAD_F64 imm:$off, $addr)>;
+def : Pat<(i32 (load (regPlusImm tglobaladdr:$off, I32:$addr))),
+ (LOAD_I32 tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (load (regPlusImm tglobaladdr:$off, I32:$addr))),
+ (LOAD_I64 tglobaladdr:$off, $addr)>;
+def : Pat<(f32 (load (regPlusImm tglobaladdr:$off, I32:$addr))),
+ (LOAD_F32 tglobaladdr:$off, $addr)>;
+def : Pat<(f64 (load (regPlusImm tglobaladdr:$off, I32:$addr))),
+ (LOAD_F64 tglobaladdr:$off, $addr)>;
+def : Pat<(i32 (load (regPlusImm texternalsym:$off, I32:$addr))),
+ (LOAD_I32 texternalsym:$off, $addr)>;
+def : Pat<(i64 (load (regPlusImm texternalsym:$off, I32:$addr))),
+ (LOAD_I64 texternalsym:$off, $addr)>;
+def : Pat<(f32 (load (regPlusImm texternalsym:$off, I32:$addr))),
+ (LOAD_F32 texternalsym:$off, $addr)>;
+def : Pat<(f64 (load (regPlusImm texternalsym:$off, I32:$addr))),
+ (LOAD_F64 texternalsym:$off, $addr)>;
+
+// Select loads with just a constant offset.
+def : Pat<(i32 (load imm:$off)), (LOAD_I32 imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (load imm:$off)), (LOAD_I64 imm:$off, (CONST_I32 0))>;
+def : Pat<(f32 (load imm:$off)), (LOAD_F32 imm:$off, (CONST_I32 0))>;
+def : Pat<(f64 (load imm:$off)), (LOAD_F64 imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (load (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD_I32 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (load (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD_I64 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(f32 (load (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD_F32 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(f64 (load (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD_F64 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i32 (load (WebAssemblywrapper texternalsym:$off))),
+ (LOAD_I32 texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (load (WebAssemblywrapper texternalsym:$off))),
+ (LOAD_I64 texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(f32 (load (WebAssemblywrapper texternalsym:$off))),
+ (LOAD_F32 texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(f64 (load (WebAssemblywrapper texternalsym:$off))),
+ (LOAD_F64 texternalsym:$off, (CONST_I32 0))>;
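+
+// For example (descriptive note, not an added pattern): a load directly from a
+// global @g selects LOAD_I32 with tglobaladdr:$off as the constant offset and
+// a CONST_I32 0 materialized as the base address operand, matching the
+// "${off}(${addr})" syntax of the load instructions above.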
+
+let Defs = [ARGUMENTS] in {
+
+// Extending load.
+def LOAD8_S_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [],
+ "i32.load8_s\t$dst, ${off}(${addr})">;
+def LOAD8_U_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [],
+ "i32.load8_u\t$dst, ${off}(${addr})">;
+def LOAD16_S_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [],
+ "i32.load16_s\t$dst, ${off}(${addr})">;
+def LOAD16_U_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [],
+ "i32.load16_u\t$dst, ${off}(${addr})">;
+def LOAD8_S_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [],
+ "i64.load8_s\t$dst, ${off}(${addr})">;
+def LOAD8_U_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [],
+ "i64.load8_u\t$dst, ${off}(${addr})">;
+def LOAD16_S_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [],
+ "i64.load16_s\t$dst, ${off}(${addr})">;
+def LOAD16_U_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [],
+ "i64.load16_u\t$dst, ${off}(${addr})">;
+def LOAD32_S_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [],
+ "i64.load32_s\t$dst, ${off}(${addr})">;
+def LOAD32_U_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [],
+ "i64.load32_u\t$dst, ${off}(${addr})">;
+
+} // Defs = [ARGUMENTS]
+
+// Select extending loads with no constant offset.
+def : Pat<(i32 (sextloadi8 I32:$addr)), (LOAD8_S_I32 0, $addr)>;
+def : Pat<(i32 (zextloadi8 I32:$addr)), (LOAD8_U_I32 0, $addr)>;
+def : Pat<(i32 (sextloadi16 I32:$addr)), (LOAD16_S_I32 0, $addr)>;
+def : Pat<(i32 (zextloadi16 I32:$addr)), (LOAD16_U_I32 0, $addr)>;
+def : Pat<(i64 (sextloadi8 I32:$addr)), (LOAD8_S_I64 0, $addr)>;
+def : Pat<(i64 (zextloadi8 I32:$addr)), (LOAD8_U_I64 0, $addr)>;
+def : Pat<(i64 (sextloadi16 I32:$addr)), (LOAD16_S_I64 0, $addr)>;
+def : Pat<(i64 (zextloadi16 I32:$addr)), (LOAD16_U_I64 0, $addr)>;
+def : Pat<(i64 (sextloadi32 I32:$addr)), (LOAD32_S_I64 0, $addr)>;
+def : Pat<(i64 (zextloadi32 I32:$addr)), (LOAD32_U_I64 0, $addr)>;
+
+// Select extending loads with a constant offset.
+def : Pat<(i32 (sextloadi8 (regPlusImm imm:$off, I32:$addr))),
+ (LOAD8_S_I32 imm:$off, $addr)>;
+def : Pat<(i32 (zextloadi8 (regPlusImm imm:$off, I32:$addr))),
+ (LOAD8_U_I32 imm:$off, $addr)>;
+def : Pat<(i32 (sextloadi16 (regPlusImm imm:$off, I32:$addr))),
+ (LOAD16_S_I32 imm:$off, $addr)>;
+def : Pat<(i32 (zextloadi16 (regPlusImm imm:$off, I32:$addr))),
+ (LOAD16_U_I32 imm:$off, $addr)>;
+def : Pat<(i64 (sextloadi8 (regPlusImm imm:$off, I32:$addr))),
+ (LOAD8_S_I64 imm:$off, $addr)>;
+def : Pat<(i64 (zextloadi8 (regPlusImm imm:$off, I32:$addr))),
+ (LOAD8_U_I64 imm:$off, $addr)>;
+def : Pat<(i64 (sextloadi16 (regPlusImm imm:$off, I32:$addr))),
+ (LOAD16_S_I64 imm:$off, $addr)>;
+def : Pat<(i64 (zextloadi16 (regPlusImm imm:$off, I32:$addr))),
+ (LOAD16_U_I64 imm:$off, $addr)>;
+def : Pat<(i64 (sextloadi32 (regPlusImm imm:$off, I32:$addr))),
+ (LOAD32_S_I64 imm:$off, $addr)>;
+def : Pat<(i64 (zextloadi32 (regPlusImm imm:$off, I32:$addr))),
+ (LOAD32_U_I64 imm:$off, $addr)>;
+def : Pat<(i32 (sextloadi8 (regPlusImm tglobaladdr:$off, I32:$addr))),
+ (LOAD8_S_I32 tglobaladdr:$off, $addr)>;
+def : Pat<(i32 (zextloadi8 (regPlusImm tglobaladdr:$off, I32:$addr))),
+ (LOAD8_U_I32 tglobaladdr:$off, $addr)>;
+def : Pat<(i32 (sextloadi16 (regPlusImm tglobaladdr:$off, I32:$addr))),
+ (LOAD16_S_I32 tglobaladdr:$off, $addr)>;
+def : Pat<(i32 (zextloadi16 (regPlusImm tglobaladdr:$off, I32:$addr))),
+ (LOAD16_U_I32 tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (sextloadi8 (regPlusImm tglobaladdr:$off, I32:$addr))),
+ (LOAD8_S_I64 tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (zextloadi8 (regPlusImm tglobaladdr:$off, I32:$addr))),
+ (LOAD8_U_I64 tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (sextloadi16 (regPlusImm tglobaladdr:$off, I32:$addr))),
+ (LOAD16_S_I64 tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (zextloadi16 (regPlusImm tglobaladdr:$off, I32:$addr))),
+ (LOAD16_U_I64 tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (sextloadi32 (regPlusImm tglobaladdr:$off, I32:$addr))),
+ (LOAD32_S_I64 tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (zextloadi32 (regPlusImm tglobaladdr:$off, I32:$addr))),
+ (LOAD32_U_I64 tglobaladdr:$off, $addr)>;
+def : Pat<(i32 (sextloadi8 (regPlusImm texternalsym:$off, I32:$addr))),
+ (LOAD8_S_I32 texternalsym:$off, $addr)>;
+def : Pat<(i32 (zextloadi8 (regPlusImm texternalsym:$off, I32:$addr))),
+ (LOAD8_U_I32 texternalsym:$off, $addr)>;
+def : Pat<(i32 (sextloadi16 (regPlusImm texternalsym:$off, I32:$addr))),
+ (LOAD16_S_I32 texternalsym:$off, $addr)>;
+def : Pat<(i32 (zextloadi16 (regPlusImm texternalsym:$off, I32:$addr))),
+ (LOAD16_U_I32 texternalsym:$off, $addr)>;
+def : Pat<(i64 (sextloadi8 (regPlusImm texternalsym:$off, I32:$addr))),
+ (LOAD8_S_I64 texternalsym:$off, $addr)>;
+def : Pat<(i64 (zextloadi8 (regPlusImm texternalsym:$off, I32:$addr))),
+ (LOAD8_U_I64 texternalsym:$off, $addr)>;
+def : Pat<(i64 (sextloadi16 (regPlusImm texternalsym:$off, I32:$addr))),
+ (LOAD16_S_I64 texternalsym:$off, $addr)>;
+def : Pat<(i64 (zextloadi16 (regPlusImm texternalsym:$off, I32:$addr))),
+ (LOAD16_U_I64 texternalsym:$off, $addr)>;
+def : Pat<(i64 (sextloadi32 (regPlusImm texternalsym:$off, I32:$addr))),
+ (LOAD32_S_I64 texternalsym:$off, $addr)>;
+def : Pat<(i64 (zextloadi32 (regPlusImm texternalsym:$off, I32:$addr))),
+ (LOAD32_U_I64 texternalsym:$off, $addr)>;
+
+// Select extending loads with just a constant offset.
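+// Here the address is a bare constant, so it is folded into the offset field
+// and the base address is materialized as (CONST_I32 0).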
+def : Pat<(i32 (sextloadi8 imm:$off)), (LOAD8_S_I32 imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (zextloadi8 imm:$off)), (LOAD8_U_I32 imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (sextloadi16 imm:$off)), (LOAD16_S_I32 imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (zextloadi16 imm:$off)), (LOAD16_U_I32 imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi8 imm:$off)), (LOAD8_S_I64 imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi8 imm:$off)), (LOAD8_U_I64 imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi16 imm:$off)), (LOAD16_S_I64 imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi16 imm:$off)), (LOAD16_U_I64 imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi32 imm:$off)), (LOAD32_S_I64 imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi32 imm:$off)), (LOAD32_U_I64 imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (sextloadi8 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD8_S_I32 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i32 (zextloadi8 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD8_U_I32 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i32 (sextloadi16 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD16_S_I32 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i32 (zextloadi16 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD16_U_I32 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi8 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD8_S_I64 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi8 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD8_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi16 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD16_S_I64 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi16 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD16_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi32 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD32_S_I64 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi32 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD32_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i32 (sextloadi8 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD8_S_I32 texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i32 (zextloadi8 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD8_U_I32 texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i32 (sextloadi16 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD16_S_I32 texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i32 (zextloadi16 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD16_U_I32 texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi8 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD8_S_I64 texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi8 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD8_U_I64 texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi16 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD16_S_I64 texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi16 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD16_U_I64 texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi32 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD32_S_I64 texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi32 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD32_U_I64 texternalsym:$off, (CONST_I32 0))>;
+
+// Resolve "don't care" extending loads to zero-extending loads. This is
+// somewhat arbitrary, but zero-extending is conceptually simpler.
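+// For example, an anyext byte load such as (i32 (extloadi8 I32:$addr)) is
+// selected below as i32.load8_u; a sign-extending load would be an equally
+// valid choice, since the high bits of the result are unspecified.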
+
+// Select "don't care" extending loads with no constant offset.
+def : Pat<(i32 (extloadi8 I32:$addr)), (LOAD8_U_I32 0, $addr)>;
+def : Pat<(i32 (extloadi16 I32:$addr)), (LOAD16_U_I32 0, $addr)>;
+def : Pat<(i64 (extloadi8 I32:$addr)), (LOAD8_U_I64 0, $addr)>;
+def : Pat<(i64 (extloadi16 I32:$addr)), (LOAD16_U_I64 0, $addr)>;
+def : Pat<(i64 (extloadi32 I32:$addr)), (LOAD32_U_I64 0, $addr)>;
+
+// Select "don't care" extending loads with a constant offset.
+def : Pat<(i32 (extloadi8 (regPlusImm imm:$off, I32:$addr))),
+ (LOAD8_U_I32 imm:$off, $addr)>;
+def : Pat<(i32 (extloadi16 (regPlusImm imm:$off, I32:$addr))),
+ (LOAD16_U_I32 imm:$off, $addr)>;
+def : Pat<(i64 (extloadi8 (regPlusImm imm:$off, I32:$addr))),
+ (LOAD8_U_I64 imm:$off, $addr)>;
+def : Pat<(i64 (extloadi16 (regPlusImm imm:$off, I32:$addr))),
+ (LOAD16_U_I64 imm:$off, $addr)>;
+def : Pat<(i64 (extloadi32 (regPlusImm imm:$off, I32:$addr))),
+ (LOAD32_U_I64 imm:$off, $addr)>;
+def : Pat<(i32 (extloadi8 (regPlusImm tglobaladdr:$off, I32:$addr))),
+ (LOAD8_U_I32 tglobaladdr:$off, $addr)>;
+def : Pat<(i32 (extloadi16 (regPlusImm tglobaladdr:$off, I32:$addr))),
+ (LOAD16_U_I32 tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (extloadi8 (regPlusImm tglobaladdr:$off, I32:$addr))),
+ (LOAD8_U_I64 tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (extloadi16 (regPlusImm tglobaladdr:$off, I32:$addr))),
+ (LOAD16_U_I64 tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (extloadi32 (regPlusImm tglobaladdr:$off, I32:$addr))),
+ (LOAD32_U_I64 tglobaladdr:$off, $addr)>;
+def : Pat<(i32 (extloadi8 (regPlusImm texternalsym:$off, I32:$addr))),
+ (LOAD8_U_I32 texternalsym:$off, $addr)>;
+def : Pat<(i32 (extloadi16 (regPlusImm texternalsym:$off, I32:$addr))),
+ (LOAD16_U_I32 texternalsym:$off, $addr)>;
+def : Pat<(i64 (extloadi8 (regPlusImm texternalsym:$off, I32:$addr))),
+ (LOAD8_U_I64 texternalsym:$off, $addr)>;
+def : Pat<(i64 (extloadi16 (regPlusImm texternalsym:$off, I32:$addr))),
+ (LOAD16_U_I64 texternalsym:$off, $addr)>;
+def : Pat<(i64 (extloadi32 (regPlusImm texternalsym:$off, I32:$addr))),
+ (LOAD32_U_I64 texternalsym:$off, $addr)>;
+
+// Select "don't care" extending loads with just a constant offset.
+def : Pat<(i32 (extloadi8 imm:$off)), (LOAD8_U_I32 imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi16 imm:$off)), (LOAD16_U_I32 imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi8 imm:$off)), (LOAD8_U_I64 imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi16 imm:$off)), (LOAD16_U_I64 imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi32 imm:$off)), (LOAD32_U_I64 imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi8 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD8_U_I32 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi16 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD16_U_I32 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi8 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD8_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi16 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD16_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi32 (WebAssemblywrapper tglobaladdr:$off))),
+ (LOAD32_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi8 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD8_U_I32 texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi16 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD16_U_I32 texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi8 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD8_U_I64 texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi16 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD16_U_I64 texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi32 (WebAssemblywrapper texternalsym:$off))),
+ (LOAD32_U_I64 texternalsym:$off, (CONST_I32 0))>;
+
+let Defs = [ARGUMENTS] in {
+
+// Basic store.
+// Note that we split the patterns out of the instruction definitions because
+// WebAssembly's stores return their operand value, and tablegen doesn't like
+// instruction definition patterns that don't reference all of the output
+// operands.
+// Note: WebAssembly inverts SelectionDAG's usual operand order.
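+// For example, the DAG node (store I32:$val, I32:$addr) is matched by a
+// separate Pat<> below and rewritten as (STORE_I32 0, $addr, $val), with
+// $dst carrying the value returned by the store.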
+def STORE_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr, I32:$val), [],
+ "i32.store\t$dst, ${off}(${addr}), $val">;
+def STORE_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr, I64:$val), [],
+ "i64.store\t$dst, ${off}(${addr}), $val">;
+def STORE_F32 : I<(outs F32:$dst), (ins i32imm:$off, I32:$addr, F32:$val), [],
+ "f32.store\t$dst, ${off}(${addr}), $val">;
+def STORE_F64 : I<(outs F64:$dst), (ins i32imm:$off, I32:$addr, F64:$val), [],
+ "f64.store\t$dst, ${off}(${addr}), $val">;
+
+} // Defs = [ARGUMENTS]
+
+// Select stores with no constant offset.
+def : Pat<(store I32:$val, I32:$addr), (STORE_I32 0, I32:$addr, I32:$val)>;
+def : Pat<(store I64:$val, I32:$addr), (STORE_I64 0, I32:$addr, I64:$val)>;
+def : Pat<(store F32:$val, I32:$addr), (STORE_F32 0, I32:$addr, F32:$val)>;
+def : Pat<(store F64:$val, I32:$addr), (STORE_F64 0, I32:$addr, F64:$val)>;
+
+// Select stores with a constant offset.
+def : Pat<(store I32:$val, (regPlusImm imm:$off, I32:$addr)),
+ (STORE_I32 imm:$off, I32:$addr, I32:$val)>;
+def : Pat<(store I64:$val, (regPlusImm imm:$off, I32:$addr)),
+ (STORE_I64 imm:$off, I32:$addr, I64:$val)>;
+def : Pat<(store F32:$val, (regPlusImm imm:$off, I32:$addr)),
+ (STORE_F32 imm:$off, I32:$addr, F32:$val)>;
+def : Pat<(store F64:$val, (regPlusImm imm:$off, I32:$addr)),
+ (STORE_F64 imm:$off, I32:$addr, F64:$val)>;
+def : Pat<(store I32:$val, (regPlusImm tglobaladdr:$off, I32:$addr)),
+ (STORE_I32 tglobaladdr:$off, I32:$addr, I32:$val)>;
+def : Pat<(store I64:$val, (regPlusImm tglobaladdr:$off, I32:$addr)),
+ (STORE_I64 tglobaladdr:$off, I32:$addr, I64:$val)>;
+def : Pat<(store F32:$val, (regPlusImm tglobaladdr:$off, I32:$addr)),
+ (STORE_F32 tglobaladdr:$off, I32:$addr, F32:$val)>;
+def : Pat<(store F64:$val, (regPlusImm tglobaladdr:$off, I32:$addr)),
+ (STORE_F64 tglobaladdr:$off, I32:$addr, F64:$val)>;
+def : Pat<(store I32:$val, (regPlusImm texternalsym:$off, I32:$addr)),
+ (STORE_I32 texternalsym:$off, I32:$addr, I32:$val)>;
+def : Pat<(store I64:$val, (regPlusImm texternalsym:$off, I32:$addr)),
+ (STORE_I64 texternalsym:$off, I32:$addr, I64:$val)>;
+def : Pat<(store F32:$val, (regPlusImm texternalsym:$off, I32:$addr)),
+ (STORE_F32 texternalsym:$off, I32:$addr, F32:$val)>;
+def : Pat<(store F64:$val, (regPlusImm texternalsym:$off, I32:$addr)),
+ (STORE_F64 texternalsym:$off, I32:$addr, F64:$val)>;
+
+// Select stores with just a constant offset.
+def : Pat<(store I32:$val, imm:$off),
+ (STORE_I32 imm:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(store I64:$val, imm:$off),
+ (STORE_I64 imm:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(store F32:$val, imm:$off),
+ (STORE_F32 imm:$off, (CONST_I32 0), F32:$val)>;
+def : Pat<(store F64:$val, imm:$off),
+ (STORE_F64 imm:$off, (CONST_I32 0), F64:$val)>;
+def : Pat<(store I32:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ (STORE_I32 tglobaladdr:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(store I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ (STORE_I64 tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(store F32:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ (STORE_F32 tglobaladdr:$off, (CONST_I32 0), F32:$val)>;
+def : Pat<(store F64:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ (STORE_F64 tglobaladdr:$off, (CONST_I32 0), F64:$val)>;
+def : Pat<(store I32:$val, (WebAssemblywrapper texternalsym:$off)),
+ (STORE_I32 texternalsym:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(store I64:$val, (WebAssemblywrapper texternalsym:$off)),
+ (STORE_I64 texternalsym:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(store F32:$val, (WebAssemblywrapper texternalsym:$off)),
+ (STORE_F32 texternalsym:$off, (CONST_I32 0), F32:$val)>;
+def : Pat<(store F64:$val, (WebAssemblywrapper texternalsym:$off)),
+ (STORE_F64 texternalsym:$off, (CONST_I32 0), F64:$val)>;
+
+let Defs = [ARGUMENTS] in {
+
+// Truncating store.
+def STORE8_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr, I32:$val), [],
+ "i32.store8\t$dst, ${off}(${addr}), $val">;
+def STORE16_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr, I32:$val), [],
+ "i32.store16\t$dst, ${off}(${addr}), $val">;
+def STORE8_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr, I64:$val), [],
+ "i64.store8\t$dst, ${off}(${addr}), $val">;
+def STORE16_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr, I64:$val), [],
+ "i64.store16\t$dst, ${off}(${addr}), $val">;
+def STORE32_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr, I64:$val), [],
+ "i64.store32\t$dst, ${off}(${addr}), $val">;
+
+} // Defs = [ARGUMENTS]
+
+// Select truncating stores with no constant offset.
+def : Pat<(truncstorei8 I32:$val, I32:$addr),
+ (STORE8_I32 0, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei16 I32:$val, I32:$addr),
+ (STORE16_I32 0, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei8 I64:$val, I32:$addr),
+ (STORE8_I64 0, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei16 I64:$val, I32:$addr),
+ (STORE16_I64 0, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei32 I64:$val, I32:$addr),
+ (STORE32_I64 0, I32:$addr, I64:$val)>;
+
+// Select truncating stores with a constant offset.
+def : Pat<(truncstorei8 I32:$val, (regPlusImm imm:$off, I32:$addr)),
+ (STORE8_I32 imm:$off, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei16 I32:$val, (regPlusImm imm:$off, I32:$addr)),
+ (STORE16_I32 imm:$off, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei8 I64:$val, (regPlusImm imm:$off, I32:$addr)),
+ (STORE8_I64 imm:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei16 I64:$val, (regPlusImm imm:$off, I32:$addr)),
+ (STORE16_I64 imm:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei32 I64:$val, (regPlusImm imm:$off, I32:$addr)),
+ (STORE32_I64 imm:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei8 I32:$val, (regPlusImm tglobaladdr:$off, I32:$addr)),
+ (STORE8_I32 tglobaladdr:$off, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei16 I32:$val, (regPlusImm tglobaladdr:$off, I32:$addr)),
+ (STORE16_I32 tglobaladdr:$off, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei8 I64:$val, (regPlusImm tglobaladdr:$off, I32:$addr)),
+ (STORE8_I64 tglobaladdr:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei16 I64:$val, (regPlusImm tglobaladdr:$off, I32:$addr)),
+ (STORE16_I64 tglobaladdr:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei32 I64:$val, (regPlusImm tglobaladdr:$off, I32:$addr)),
+ (STORE32_I64 tglobaladdr:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei8 I32:$val, (regPlusImm texternalsym:$off, I32:$addr)),
+ (STORE8_I32 texternalsym:$off, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei16 I32:$val, (regPlusImm texternalsym:$off, I32:$addr)),
+ (STORE16_I32 texternalsym:$off, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei8 I64:$val, (regPlusImm texternalsym:$off, I32:$addr)),
+ (STORE8_I64 texternalsym:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei16 I64:$val, (regPlusImm texternalsym:$off, I32:$addr)),
+ (STORE16_I64 texternalsym:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei32 I64:$val, (regPlusImm texternalsym:$off, I32:$addr)),
+ (STORE32_I64 texternalsym:$off, I32:$addr, I64:$val)>;
+
+// Select truncating stores with just a constant offset.
+def : Pat<(truncstorei8 I32:$val, imm:$off),
+ (STORE8_I32 imm:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(truncstorei16 I32:$val, imm:$off),
+ (STORE16_I32 imm:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(truncstorei8 I64:$val, imm:$off),
+ (STORE8_I64 imm:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(truncstorei16 I64:$val, imm:$off),
+ (STORE16_I64 imm:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(truncstorei32 I64:$val, imm:$off),
+ (STORE32_I64 imm:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(truncstorei8 I32:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ (STORE8_I32 tglobaladdr:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(truncstorei16 I32:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ (STORE16_I32 tglobaladdr:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(truncstorei8 I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ (STORE8_I64 tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(truncstorei16 I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ (STORE16_I64 tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(truncstorei32 I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ (STORE32_I64 tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(truncstorei8 I32:$val, (WebAssemblywrapper texternalsym:$off)),
+ (STORE8_I32 texternalsym:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(truncstorei16 I32:$val, (WebAssemblywrapper texternalsym:$off)),
+ (STORE16_I32 texternalsym:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(truncstorei8 I64:$val, (WebAssemblywrapper texternalsym:$off)),
+ (STORE8_I64 texternalsym:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(truncstorei16 I64:$val, (WebAssemblywrapper texternalsym:$off)),
+ (STORE16_I64 texternalsym:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(truncstorei32 I64:$val, (WebAssemblywrapper texternalsym:$off)),
+ (STORE32_I64 texternalsym:$off, (CONST_I32 0), I64:$val)>;
+
+let Defs = [ARGUMENTS] in {
+
+// Memory size.
+def MEMORY_SIZE_I32 : I<(outs I32:$dst), (ins),
+ [(set I32:$dst, (int_wasm_memory_size))],
+ "memory_size\t$dst">,
+ Requires<[HasAddr32]>;
+def MEMORY_SIZE_I64 : I<(outs I64:$dst), (ins),
+ [(set I64:$dst, (int_wasm_memory_size))],
+ "memory_size\t$dst">,
+ Requires<[HasAddr64]>;
+
+// Grow memory.
+def GROW_MEMORY_I32 : I<(outs), (ins I32:$delta),
+ [(int_wasm_grow_memory I32:$delta)],
+ "grow_memory\t$delta">,
+ Requires<[HasAddr32]>;
+def GROW_MEMORY_I64 : I<(outs), (ins I64:$delta),
+ [(int_wasm_grow_memory I64:$delta)],
+ "grow_memory\t$delta">,
+ Requires<[HasAddr64]>;
+
+} // Defs = [ARGUMENTS]
diff --git a/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp b/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
new file mode 100644
index 000000000000..b009a4e054cc
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
@@ -0,0 +1,133 @@
+//===-- WebAssemblyLowerBrUnless.cpp - Lower br_unless --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file lowers br_unless into br_if with an inverted condition.
+///
+/// br_unless is not currently in the spec, but it's very convenient for LLVM
+/// to use. This pass allows LLVM to use it, for now.
+///
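+/// In outline, the rewrite is:
+///   br_unless $cond, %bb  ==>  br_if $inverted_cond, %bb
+/// where the condition is inverted in place when possible, and otherwise
+/// negated with an explicit comparison against zero.
+///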
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-lower-br_unless"
+
+namespace {
+class WebAssemblyLowerBrUnless final : public MachineFunctionPass {
+ const char *getPassName() const override {
+ return "WebAssembly Lower br_unless";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyLowerBrUnless() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyLowerBrUnless::ID = 0;
+FunctionPass *llvm::createWebAssemblyLowerBrUnless() {
+ return new WebAssemblyLowerBrUnless();
+}
+
+bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "********** Lowering br_unless **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
+
+ auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ auto &MRI = MF.getRegInfo();
+
+ for (auto &MBB : MF) {
+ for (auto MII = MBB.begin(); MII != MBB.end(); ) {
+ MachineInstr *MI = &*MII++;
+ if (MI->getOpcode() != WebAssembly::BR_UNLESS)
+ continue;
+
+ unsigned Cond = MI->getOperand(0).getReg();
+ bool Inverted = false;
+
+ // Attempt to invert the condition in place.
+ if (MFI.isVRegStackified(Cond)) {
+ assert(MRI.hasOneDef(Cond));
+ MachineInstr *Def = MRI.getVRegDef(Cond);
+ switch (Def->getOpcode()) {
+ using namespace WebAssembly;
+ case EQ_I32: Def->setDesc(TII.get(NE_I32)); Inverted = true; break;
+ case NE_I32: Def->setDesc(TII.get(EQ_I32)); Inverted = true; break;
+ case GT_S_I32: Def->setDesc(TII.get(LE_S_I32)); Inverted = true; break;
+ case GE_S_I32: Def->setDesc(TII.get(LT_S_I32)); Inverted = true; break;
+ case LT_S_I32: Def->setDesc(TII.get(GE_S_I32)); Inverted = true; break;
+ case LE_S_I32: Def->setDesc(TII.get(GT_S_I32)); Inverted = true; break;
+ case GT_U_I32: Def->setDesc(TII.get(LE_U_I32)); Inverted = true; break;
+ case GE_U_I32: Def->setDesc(TII.get(LT_U_I32)); Inverted = true; break;
+ case LT_U_I32: Def->setDesc(TII.get(GE_U_I32)); Inverted = true; break;
+ case LE_U_I32: Def->setDesc(TII.get(GT_U_I32)); Inverted = true; break;
+ case EQ_I64: Def->setDesc(TII.get(NE_I64)); Inverted = true; break;
+ case NE_I64: Def->setDesc(TII.get(EQ_I64)); Inverted = true; break;
+ case GT_S_I64: Def->setDesc(TII.get(LE_S_I64)); Inverted = true; break;
+ case GE_S_I64: Def->setDesc(TII.get(LT_S_I64)); Inverted = true; break;
+ case LT_S_I64: Def->setDesc(TII.get(GE_S_I64)); Inverted = true; break;
+ case LE_S_I64: Def->setDesc(TII.get(GT_S_I64)); Inverted = true; break;
+ case GT_U_I64: Def->setDesc(TII.get(LE_U_I64)); Inverted = true; break;
+ case GE_U_I64: Def->setDesc(TII.get(LT_U_I64)); Inverted = true; break;
+ case LT_U_I64: Def->setDesc(TII.get(GE_U_I64)); Inverted = true; break;
+ case LE_U_I64: Def->setDesc(TII.get(GT_U_I64)); Inverted = true; break;
+ case EQ_F32: Def->setDesc(TII.get(NE_F32)); Inverted = true; break;
+ case NE_F32: Def->setDesc(TII.get(EQ_F32)); Inverted = true; break;
+ case EQ_F64: Def->setDesc(TII.get(NE_F64)); Inverted = true; break;
+ case NE_F64: Def->setDesc(TII.get(EQ_F64)); Inverted = true; break;
+ default: break;
+ }
+ }
+
+ // If we weren't able to invert the condition in place, insert an
+ // expression to invert it.
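+ // The inverted condition is computed as (Cond == 0): materialize an
+ // i32.const 0 and compare with i32.eq, then branch on the result.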
+ if (!Inverted) {
+ unsigned ZeroReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
+ MFI.stackifyVReg(ZeroReg);
+ BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::CONST_I32), ZeroReg)
+ .addImm(0);
+ unsigned Tmp = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
+ MFI.stackifyVReg(Tmp);
+ BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::EQ_I32), Tmp)
+ .addReg(Cond)
+ .addReg(ZeroReg);
+ Cond = Tmp;
+ Inverted = true;
+ }
+
+ // The br_unless condition has now been inverted. Insert a br_if and
+ // delete the br_unless.
+ assert(Inverted);
+ BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::BR_IF))
+ .addReg(Cond)
+ .addOperand(MI->getOperand(1));
+ MBB.erase(MI);
+ }
+ }
+
+ return true;
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
new file mode 100644
index 000000000000..a953f8247006
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -0,0 +1,106 @@
+// WebAssemblyMCInstLower.cpp - Convert WebAssembly MachineInstr to an MCInst //
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains code to lower WebAssembly MachineInstrs to their
+/// corresponding MCInst records.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyMCInstLower.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+MCSymbol *
+WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
+ return Printer.getSymbol(MO.getGlobal());
+}
+
+MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol(
+ const MachineOperand &MO) const {
+ return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+}
+
+MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+ MCSymbol *Sym) const {
+ assert(MO.getTargetFlags() == 0 && "WebAssembly does not use target flags");
+
+ const MCExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx);
+
+ int64_t Offset = MO.getOffset();
+ if (Offset != 0) {
+ assert(!MO.isJTI() && "Unexpected offset with jump table index");
+ Expr =
+ MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(Offset, Ctx), Ctx);
+ }
+
+ return MCOperand::createExpr(Expr);
+}
+
+void WebAssemblyMCInstLower::Lower(const MachineInstr *MI,
+ MCInst &OutMI) const {
+ OutMI.setOpcode(MI->getOpcode());
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+
+ MCOperand MCOp;
+ switch (MO.getType()) {
+ default:
+ MI->dump();
+ llvm_unreachable("unknown operand type");
+ case MachineOperand::MO_Register: {
+ // Ignore all implicit register operands.
+ if (MO.isImplicit())
+ continue;
+ const WebAssemblyFunctionInfo &MFI =
+ *MI->getParent()->getParent()->getInfo<WebAssemblyFunctionInfo>();
+ unsigned WAReg = MFI.getWAReg(MO.getReg());
+ MCOp = MCOperand::createReg(WAReg);
+ break;
+ }
+ case MachineOperand::MO_Immediate:
+ MCOp = MCOperand::createImm(MO.getImm());
+ break;
+ case MachineOperand::MO_FPImmediate: {
+ // TODO: MC converts all floating point immediate operands to double.
+ // This is fine for numeric values, but may cause NaNs to change bits.
+ const ConstantFP *Imm = MO.getFPImm();
+ if (Imm->getType()->isFloatTy())
+ MCOp = MCOperand::createFPImm(Imm->getValueAPF().convertToFloat());
+ else if (Imm->getType()->isDoubleTy())
+ MCOp = MCOperand::createFPImm(Imm->getValueAPF().convertToDouble());
+ else
+ llvm_unreachable("unknown floating point immediate type");
+ break;
+ }
+ case MachineOperand::MO_MachineBasicBlock:
+ MCOp = MCOperand::createExpr(
+ MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx));
+ break;
+ case MachineOperand::MO_GlobalAddress:
+ MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
+ break;
+ case MachineOperand::MO_ExternalSymbol:
+ MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
+ break;
+ }
+
+ OutMI.addOperand(MCOp);
+ }
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.h b/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
new file mode 100644
index 000000000000..6d704704f576
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
@@ -0,0 +1,45 @@
+//===-- WebAssemblyMCInstLower.h - Lower MachineInstr to MCInst -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares the class to lower WebAssembly MachineInstrs to
+/// their corresponding MCInst records.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMCINSTLOWER_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMCINSTLOWER_H
+
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class AsmPrinter;
+class MCContext;
+class MCSymbol;
+class MachineInstr;
+class MachineOperand;
+
+/// This class is used to lower a MachineInstr into an MCInst.
+class LLVM_LIBRARY_VISIBILITY WebAssemblyMCInstLower {
+ MCContext &Ctx;
+ AsmPrinter &Printer;
+
+ MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+ MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+ MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
+
+public:
+ WebAssemblyMCInstLower(MCContext &ctx, AsmPrinter &printer)
+ : Ctx(ctx), Printer(printer) {}
+ void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+};
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
index 542d984b9006..225c5d32cb5d 100644
--- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
@@ -17,3 +17,9 @@
using namespace llvm;
WebAssemblyFunctionInfo::~WebAssemblyFunctionInfo() {}
+
+void WebAssemblyFunctionInfo::initWARegs() {
+ assert(WARegs.empty());
+ unsigned Reg = UnusedReg;
+ WARegs.resize(MF.getRegInfo().getNumVirtRegs(), Reg);
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index fc5e910b09ef..6a60280900a9 100644
--- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -1,4 +1,4 @@
-// WebAssemblyMachineFuctionInfo.h-WebAssembly machine function info -*- C++ -*-
+// WebAssemblyMachineFunctionInfo.h-WebAssembly machine function info-*- C++ -*-
//
// The LLVM Compiler Infrastructure
//
@@ -16,8 +16,7 @@
#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMACHINEFUNCTIONINFO_H
#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMACHINEFUNCTIONINFO_H
-#include "WebAssemblyRegisterInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
namespace llvm {
@@ -27,9 +26,70 @@ namespace llvm {
class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
MachineFunction &MF;
+ std::vector<MVT> Params;
+
+ /// A mapping from CodeGen vreg index to WebAssembly register number.
+ std::vector<unsigned> WARegs;
+
+ /// A mapping from CodeGen vreg index to a boolean value indicating whether
+ /// the given register is considered to be "stackified", meaning it has been
+ /// determined or made to meet the stack requirements:
+ /// - single use (per path)
+ /// - single def (per path)
+ /// - defined and used in LIFO order with other stack registers
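+ /// Stackified registers can be emitted as values on WebAssembly's implicit
+ /// operand stack rather than as named locals.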
+ BitVector VRegStackified;
+
+ // One entry for each possible target reg. We expect it to be small.
+ std::vector<unsigned> PhysRegs;
+
public:
- explicit WebAssemblyFunctionInfo(MachineFunction &MF) : MF(MF) {}
+ explicit WebAssemblyFunctionInfo(MachineFunction &MF) : MF(MF) {
+ PhysRegs.resize(WebAssembly::NUM_TARGET_REGS, -1U);
+ }
~WebAssemblyFunctionInfo() override;
+
+ void addParam(MVT VT) { Params.push_back(VT); }
+ const std::vector<MVT> &getParams() const { return Params; }
+
+ static const unsigned UnusedReg = -1u;
+
+ void stackifyVReg(unsigned VReg) {
+ if (TargetRegisterInfo::virtReg2Index(VReg) >= VRegStackified.size())
+ VRegStackified.resize(TargetRegisterInfo::virtReg2Index(VReg) + 1);
+ VRegStackified.set(TargetRegisterInfo::virtReg2Index(VReg));
+ }
+ bool isVRegStackified(unsigned VReg) const {
+ if (TargetRegisterInfo::virtReg2Index(VReg) >= VRegStackified.size())
+ return false;
+ return VRegStackified.test(TargetRegisterInfo::virtReg2Index(VReg));
+ }
+
+ void initWARegs();
+ void setWAReg(unsigned VReg, unsigned WAReg) {
+ assert(WAReg != UnusedReg);
+ assert(TargetRegisterInfo::virtReg2Index(VReg) < WARegs.size());
+ WARegs[TargetRegisterInfo::virtReg2Index(VReg)] = WAReg;
+ }
+ unsigned getWAReg(unsigned Reg) const {
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ assert(TargetRegisterInfo::virtReg2Index(Reg) < WARegs.size());
+ return WARegs[TargetRegisterInfo::virtReg2Index(Reg)];
+ }
+ return PhysRegs[Reg];
+ }
+ // If new virtual registers are created after initWARegs has been called,
+ // this function can be used to add WebAssembly register mappings for them.
+ void addWAReg(unsigned VReg, unsigned WAReg) {
+ assert(TargetRegisterInfo::virtReg2Index(VReg) == WARegs.size());
+ WARegs.push_back(WAReg);
+ }
+
+ void addPReg(unsigned PReg, unsigned WAReg) {
+ assert(PReg < WebAssembly::NUM_TARGET_REGS);
+ assert(WAReg < -1U);
+ PhysRegs[PReg] = WAReg;
+ }
+ const std::vector<unsigned> &getPhysRegs() const { return PhysRegs; }
};
} // end namespace llvm
diff --git a/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp b/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
new file mode 100644
index 000000000000..4dc401a2c7cc
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
@@ -0,0 +1,76 @@
+//===-- WebAssemblyOptimizeReturned.cpp - Optimize "returned" attributes --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Optimize calls with "returned" attributes for WebAssembly.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-optimize-returned"
+
+namespace {
+class OptimizeReturned final : public FunctionPass,
+ public InstVisitor<OptimizeReturned> {
+ const char *getPassName() const override {
+ return "WebAssembly Optimize Returned";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ DominatorTree *DT;
+
+public:
+ static char ID;
+ OptimizeReturned() : FunctionPass(ID), DT(nullptr) {}
+
+ void visitCallSite(CallSite CS);
+};
+} // End anonymous namespace
+
+char OptimizeReturned::ID = 0;
+FunctionPass *llvm::createWebAssemblyOptimizeReturned() {
+ return new OptimizeReturned();
+}
+
+void OptimizeReturned::visitCallSite(CallSite CS) {
+ for (unsigned i = 0, e = CS.getNumArgOperands(); i < e; ++i)
+ if (CS.paramHasAttr(1 + i, Attribute::Returned)) {
+ Instruction *Inst = CS.getInstruction();
+ Value *Arg = CS.getArgOperand(i);
+ // Ignore constants, globals, undef, etc.
+ if (isa<Constant>(Arg))
+ continue;
+ // Like replaceDominatedUsesWith but using Instruction/Use dominance.
+ for (auto UI = Arg->use_begin(), UE = Arg->use_end(); UI != UE;) {
+ Use &U = *UI++;
+ if (DT->dominates(Inst, U))
+ U.set(Inst);
+ }
+ }
+}
+
+bool OptimizeReturned::runOnFunction(Function &F) {
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ visit(F);
+ return true;
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyPEI.cpp b/lib/Target/WebAssembly/WebAssemblyPEI.cpp
new file mode 100644
index 000000000000..d570d4266110
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyPEI.cpp
@@ -0,0 +1,1066 @@
+//===-- WebAssemblyPEI.cpp - Insert Prolog/Epilog code in function --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is responsible for finalizing the function's frame layout, saving
+// callee saved registers, and for emitting prolog & epilog code for the
+// function.
+//
+// This pass must be run after register allocation. After this pass is
+// executed, it is illegal to construct MO_FrameIndex operands.
+//
+// This is a copy of lib/CodeGen/PrologEpilogInserter.cpp except that it does
+// not assert that all virtual registers are gone (because WebAssembly currently
+// uses virtual rather than physical registers), and only runs
+// MRI.clearVirtRegs() if scavenging happened (which it never does). It also
+// uses a different class name so it can be registered via INITIALIZE_PASS.
+// It is otherwise unmodified, so any changes to the target-independent PEI
+// can be easily applied.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <climits>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "pei"
+namespace llvm {
+void initializeWasmPEIPass(PassRegistry&);
+}
+namespace {
+class WasmPEI : public MachineFunctionPass {
+public:
+ static char ID;
+ WasmPEI() : MachineFunctionPass(ID) {
+ initializeWasmPEIPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ /// runOnMachineFunction - Insert prolog/epilog code and replace abstract
+ /// frame indexes with appropriate references.
+ ///
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+private:
+ RegScavenger *RS;
+
+ // MinCSFrameIndex, MaxCSFrameIndex - Keeps the range of callee saved
+ // stack frame indexes.
+ unsigned MinCSFrameIndex, MaxCSFrameIndex;
+
+ // Save and Restore blocks of the current function. Typically there is a
+ // single save block, unless Windows EH funclets are involved.
+ SmallVector<MachineBasicBlock *, 1> SaveBlocks;
+ SmallVector<MachineBasicBlock *, 4> RestoreBlocks;
+
+ // Flag to control whether to use the register scavenger to resolve
+ // frame index materialization registers. Set according to
+ // TRI->requiresFrameIndexScavenging() for the current function.
+ bool FrameIndexVirtualScavenging;
+
+ void calculateSets(MachineFunction &Fn);
+ void calculateCallsInformation(MachineFunction &Fn);
+ void assignCalleeSavedSpillSlots(MachineFunction &Fn,
+ const BitVector &SavedRegs);
+ void insertCSRSpillsAndRestores(MachineFunction &Fn);
+ void calculateFrameObjectOffsets(MachineFunction &Fn);
+ void replaceFrameIndices(MachineFunction &Fn);
+ void replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
+ int &SPAdj);
+ void scavengeFrameVirtualRegs(MachineFunction &Fn);
+ void insertPrologEpilogCode(MachineFunction &Fn);
+};
+} // namespace
+
+char WasmPEI::ID = 0;
+
+namespace llvm {
+FunctionPass *createWebAssemblyPEI() {
+ return new WasmPEI();
+}
+}
+
+static cl::opt<unsigned>
+WarnStackSize("wasm-warn-stack-size", cl::Hidden, cl::init((unsigned)-1),
+ cl::desc("Warn for stack size bigger than the given"
+ " number"));
+
+INITIALIZE_PASS_BEGIN(WasmPEI, "wasmprologepilog",
+ "Wasm Prologue/Epilogue Insertion", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(StackProtector)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(WasmPEI, "wasmprologepilog",
+ "Wasm Prologue/Epilogue Insertion & Frame Finalization",
+ false, false)
+
+STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged");
+STATISTIC(NumBytesStackSpace,
+ "Number of bytes used for stack in all functions");
+
+void WasmPEI::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<StackProtector>();
+ AU.addRequired<TargetPassConfig>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+/// Compute the set of return blocks
+void WasmPEI::calculateSets(MachineFunction &Fn) {
+ const MachineFrameInfo *MFI = Fn.getFrameInfo();
+
+ // Even when we do not change any CSR, we still want to insert the
+ // prologue and epilogue of the function.
+ // So set the save points for those.
+
+ // Use the points found by shrink-wrapping, if any.
+ if (MFI->getSavePoint()) {
+ SaveBlocks.push_back(MFI->getSavePoint());
+ assert(MFI->getRestorePoint() && "Both restore and save must be set");
+ MachineBasicBlock *RestoreBlock = MFI->getRestorePoint();
+ // If RestoreBlock does not have any successor and is not a return block
+ // then the end point is unreachable and we do not need to insert any
+ // epilogue.
+ if (!RestoreBlock->succ_empty() || RestoreBlock->isReturnBlock())
+ RestoreBlocks.push_back(RestoreBlock);
+ return;
+ }
+
+ // Save refs to entry and return blocks.
+ SaveBlocks.push_back(&Fn.front());
+ for (MachineBasicBlock &MBB : Fn) {
+ if (MBB.isEHFuncletEntry())
+ SaveBlocks.push_back(&MBB);
+ if (MBB.isReturnBlock())
+ RestoreBlocks.push_back(&MBB);
+ }
+}
+
+/// StackObjSet - A set of stack object indexes
+typedef SmallSetVector<int, 8> StackObjSet;
+
+/// runOnMachineFunction - Insert prolog/epilog code and replace abstract
+/// frame indexes with appropriate references.
+///
+bool WasmPEI::runOnMachineFunction(MachineFunction &Fn) {
+ const Function* F = Fn.getFunction();
+ const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
+ const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
+
+ // LOCALMOD: assert removed from target-independent PEI
+ //assert(!Fn.getRegInfo().getNumVirtRegs() && "Regalloc must assign all vregs");
+
+ RS = TRI->requiresRegisterScavenging(Fn) ? new RegScavenger() : nullptr;
+ FrameIndexVirtualScavenging = TRI->requiresFrameIndexScavenging(Fn);
+
+ // Calculate the MaxCallFrameSize and AdjustsStack variables for the
+ // function's frame information. Also eliminates call frame pseudo
+ // instructions.
+ calculateCallsInformation(Fn);
+
+ // Determine which of the registers in the callee save list should be saved.
+ BitVector SavedRegs;
+ TFI->determineCalleeSaves(Fn, SavedRegs, RS);
+
+ // Insert spill code for any callee saved registers that are modified.
+ assignCalleeSavedSpillSlots(Fn, SavedRegs);
+
+ // Determine placement of CSR spill/restore code:
+ // place all spills in the entry block, all restores in return blocks.
+ calculateSets(Fn);
+
+ // Add the code to save and restore the callee saved registers.
+ if (!F->hasFnAttribute(Attribute::Naked))
+ insertCSRSpillsAndRestores(Fn);
+
+ // Allow the target machine to make final modifications to the function
+ // before the frame layout is finalized.
+ TFI->processFunctionBeforeFrameFinalized(Fn, RS);
+
+ // Calculate actual frame offsets for all abstract stack objects...
+ calculateFrameObjectOffsets(Fn);
+
+ // Add prolog and epilog code to the function. This function is required
+ // to align the stack frame as necessary for any stack variables or
+ // called functions. Because of this, calculateCalleeSavedRegisters()
+ // must be called before this function in order to set the AdjustsStack
+ // and MaxCallFrameSize variables.
+ if (!F->hasFnAttribute(Attribute::Naked))
+ insertPrologEpilogCode(Fn);
+
+ // Replace all MO_FrameIndex operands with physical register references
+ // and actual offsets.
+ //
+ replaceFrameIndices(Fn);
+
+ // If register scavenging is needed, as we've enabled doing it as a
+ // post-pass, scavenge the virtual registers that frame index elimination
+ // inserted.
+ if (TRI->requiresRegisterScavenging(Fn) && FrameIndexVirtualScavenging) {
+ scavengeFrameVirtualRegs(Fn);
+ // Clear any vregs created by virtual scavenging.
+ // LOCALMOD: this call was made conditional on scavengeFrameVirtualRegs()
+ // having run.
+ Fn.getRegInfo().clearVirtRegs();
+ }
+
+ // Warn on stack size when it exceeds the given limit.
+ MachineFrameInfo *MFI = Fn.getFrameInfo();
+ uint64_t StackSize = MFI->getStackSize();
+ if (WarnStackSize.getNumOccurrences() > 0 && WarnStackSize < StackSize) {
+ DiagnosticInfoStackSize DiagStackSize(*F, StackSize);
+ F->getContext().diagnose(DiagStackSize);
+ }
+
+ delete RS;
+ SaveBlocks.clear();
+ RestoreBlocks.clear();
+ return true;
+}
+
+/// calculateCallsInformation - Calculate the MaxCallFrameSize and AdjustsStack
+/// variables for the function's frame information and eliminate call frame
+/// pseudo instructions.
+void WasmPEI::calculateCallsInformation(MachineFunction &Fn) {
+ const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
+ const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
+ MachineFrameInfo *MFI = Fn.getFrameInfo();
+
+ unsigned MaxCallFrameSize = 0;
+ bool AdjustsStack = MFI->adjustsStack();
+
+ // Get the function call frame set-up and tear-down instruction opcode
+ unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode();
+ unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
+
+ // Early exit for targets which have no call frame setup/destroy pseudo
+ // instructions.
+ if (FrameSetupOpcode == ~0u && FrameDestroyOpcode == ~0u)
+ return;
+
+ std::vector<MachineBasicBlock::iterator> FrameSDOps;
+ for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB)
+ for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
+ if (I->getOpcode() == FrameSetupOpcode ||
+ I->getOpcode() == FrameDestroyOpcode) {
+ assert(I->getNumOperands() >= 1 && "Call Frame Setup/Destroy Pseudo"
+ " instructions should have a single immediate argument!");
+ unsigned Size = I->getOperand(0).getImm();
+ if (Size > MaxCallFrameSize) MaxCallFrameSize = Size;
+ AdjustsStack = true;
+ FrameSDOps.push_back(I);
+ } else if (I->isInlineAsm()) {
+ // Some inline asm's need a stack frame, as indicated by operand 1.
+ unsigned ExtraInfo = I->getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+ if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
+ AdjustsStack = true;
+ }
+
+ MFI->setAdjustsStack(AdjustsStack);
+ MFI->setMaxCallFrameSize(MaxCallFrameSize);
+
+ for (std::vector<MachineBasicBlock::iterator>::iterator
+ i = FrameSDOps.begin(), e = FrameSDOps.end(); i != e; ++i) {
+ MachineBasicBlock::iterator I = *i;
+
+ // If call frames are not being included as part of the stack frame, and
+ // the target doesn't indicate otherwise, remove the call frame pseudos
+ // here. The sub/add sp instruction pairs are still inserted, but we don't
+ // need to track the SP adjustment for frame index elimination.
+ if (TFI->canSimplifyCallFramePseudos(Fn))
+ TFI->eliminateCallFramePseudoInstr(Fn, *I->getParent(), I);
+ }
+}
+
+void WasmPEI::assignCalleeSavedSpillSlots(MachineFunction &F,
+ const BitVector &SavedRegs) {
+ // These are used to keep track of the callee-save area. Initialize them.
+ MinCSFrameIndex = INT_MAX;
+ MaxCSFrameIndex = 0;
+
+ if (SavedRegs.empty())
+ return;
+
+ const TargetRegisterInfo *RegInfo = F.getSubtarget().getRegisterInfo();
+ const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&F);
+
+ std::vector<CalleeSavedInfo> CSI;
+ for (unsigned i = 0; CSRegs[i]; ++i) {
+ unsigned Reg = CSRegs[i];
+ if (SavedRegs.test(Reg))
+ CSI.push_back(CalleeSavedInfo(Reg));
+ }
+
+ const TargetFrameLowering *TFI = F.getSubtarget().getFrameLowering();
+ MachineFrameInfo *MFI = F.getFrameInfo();
+ if (!TFI->assignCalleeSavedSpillSlots(F, RegInfo, CSI)) {
+ // If target doesn't implement this, use generic code.
+
+ if (CSI.empty())
+ return; // Early exit if no callee saved registers are modified!
+
+ unsigned NumFixedSpillSlots;
+ const TargetFrameLowering::SpillSlot *FixedSpillSlots =
+ TFI->getCalleeSavedSpillSlots(NumFixedSpillSlots);
+
+ // Now that we know which registers need to be saved and restored, allocate
+ // stack slots for them.
+ for (std::vector<CalleeSavedInfo>::iterator I = CSI.begin(), E = CSI.end();
+ I != E; ++I) {
+ unsigned Reg = I->getReg();
+ const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
+
+ int FrameIdx;
+ if (RegInfo->hasReservedSpillSlot(F, Reg, FrameIdx)) {
+ I->setFrameIdx(FrameIdx);
+ continue;
+ }
+
+ // Check to see if this physreg must be spilled to a particular stack slot
+ // on this target.
+ const TargetFrameLowering::SpillSlot *FixedSlot = FixedSpillSlots;
+ while (FixedSlot != FixedSpillSlots + NumFixedSpillSlots &&
+ FixedSlot->Reg != Reg)
+ ++FixedSlot;
+
+ if (FixedSlot == FixedSpillSlots + NumFixedSpillSlots) {
+ // Nope, just spill it anywhere convenient.
+ unsigned Align = RC->getAlignment();
+ unsigned StackAlign = TFI->getStackAlignment();
+
+ // We may not be able to satisfy the desired alignment specification of
+ // the TargetRegisterClass if the stack alignment is smaller. Use the
+ // min.
+ Align = std::min(Align, StackAlign);
+ FrameIdx = MFI->CreateStackObject(RC->getSize(), Align, true);
+ if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
+ if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
+ } else {
+ // Spill it to the stack where we must.
+ FrameIdx =
+ MFI->CreateFixedSpillStackObject(RC->getSize(), FixedSlot->Offset);
+ }
+
+ I->setFrameIdx(FrameIdx);
+ }
+ }
+
+ MFI->setCalleeSavedInfo(CSI);
+}
+
+/// Helper function to update the liveness information for the callee-saved
+/// registers.
+static void updateLiveness(MachineFunction &MF) {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ // Visited will contain all the basic blocks that are in the region
+ // where the callee saved registers are alive:
+ // - Anything that is not Save or Restore -> LiveThrough.
+ // - Save -> LiveIn.
+ // - Restore -> LiveOut.
+ // The live-out is not attached to the block, so no need to keep
+ // Restore in this set.
+ SmallPtrSet<MachineBasicBlock *, 8> Visited;
+ SmallVector<MachineBasicBlock *, 8> WorkList;
+ MachineBasicBlock *Entry = &MF.front();
+ MachineBasicBlock *Save = MFI->getSavePoint();
+
+ if (!Save)
+ Save = Entry;
+
+ if (Entry != Save) {
+ WorkList.push_back(Entry);
+ Visited.insert(Entry);
+ }
+ Visited.insert(Save);
+
+ MachineBasicBlock *Restore = MFI->getRestorePoint();
+ if (Restore)
+ // By construction Restore cannot be visited, otherwise it
+ // means there exists a path to Restore that does not go
+ // through Save.
+ WorkList.push_back(Restore);
+
+ while (!WorkList.empty()) {
+ const MachineBasicBlock *CurBB = WorkList.pop_back_val();
+ // By construction, the region that is after the save point is
+ // dominated by the Save and post-dominated by the Restore.
+ if (CurBB == Save && Save != Restore)
+ continue;
+ // Enqueue all the successors not already visited.
+ // Those are by construction either before Save or after Restore.
+ for (MachineBasicBlock *SuccBB : CurBB->successors())
+ if (Visited.insert(SuccBB).second)
+ WorkList.push_back(SuccBB);
+ }
+
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ for (MachineBasicBlock *MBB : Visited) {
+ MCPhysReg Reg = CSI[i].getReg();
+ // Add the callee-saved register as live-in.
+ // It's killed at the spill.
+ if (!MBB->isLiveIn(Reg))
+ MBB->addLiveIn(Reg);
+ }
+ }
+}
+
+/// insertCSRSpillsAndRestores - Insert spill and restore code for
+/// callee saved registers used in the function.
+///
+void WasmPEI::insertCSRSpillsAndRestores(MachineFunction &Fn) {
+ // Get callee saved register information.
+ MachineFrameInfo *MFI = Fn.getFrameInfo();
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+
+ MFI->setCalleeSavedInfoValid(true);
+
+ // Early exit if no callee saved registers are modified!
+ if (CSI.empty())
+ return;
+
+ const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
+ const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
+ const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
+ MachineBasicBlock::iterator I;
+
+ // Spill using target interface.
+ for (MachineBasicBlock *SaveBlock : SaveBlocks) {
+ I = SaveBlock->begin();
+ if (!TFI->spillCalleeSavedRegisters(*SaveBlock, I, CSI, TRI)) {
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ // Insert the spill to the stack frame.
+ unsigned Reg = CSI[i].getReg();
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ TII.storeRegToStackSlot(*SaveBlock, I, Reg, true, CSI[i].getFrameIdx(),
+ RC, TRI);
+ }
+ }
+ // Update the live-in information of all the blocks up to the save point.
+ updateLiveness(Fn);
+ }
+
+ // Restore using target interface.
+ for (MachineBasicBlock *MBB : RestoreBlocks) {
+ I = MBB->end();
+
+ // Skip over all terminator instructions, which are part of the return
+ // sequence.
+ MachineBasicBlock::iterator I2 = I;
+ while (I2 != MBB->begin() && (--I2)->isTerminator())
+ I = I2;
+
+ bool AtStart = I == MBB->begin();
+ MachineBasicBlock::iterator BeforeI = I;
+ if (!AtStart)
+ --BeforeI;
+
+ // Restore all registers immediately before the return and any
+ // terminators that precede it.
+ if (!TFI->restoreCalleeSavedRegisters(*MBB, I, CSI, TRI)) {
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ TII.loadRegFromStackSlot(*MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI);
+ assert(I != MBB->begin() &&
+ "loadRegFromStackSlot didn't insert any code!");
+ // Insert in reverse order. loadRegFromStackSlot can insert
+ // multiple instructions.
+ if (AtStart)
+ I = MBB->begin();
+ else {
+ I = BeforeI;
+ ++I;
+ }
+ }
+ }
+ }
+}
+
+/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
+static inline void
+AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx,
+ bool StackGrowsDown, int64_t &Offset,
+ unsigned &MaxAlign, unsigned Skew) {
+ // If the stack grows down, add the object size to find the lowest address.
+ if (StackGrowsDown)
+ Offset += MFI->getObjectSize(FrameIdx);
+
+ unsigned Align = MFI->getObjectAlignment(FrameIdx);
+
+ // If the alignment of this object is greater than that of the stack, then
+ // increase the stack alignment to match.
+ MaxAlign = std::max(MaxAlign, Align);
+
+ // Adjust to alignment boundary.
+ Offset = RoundUpToAlignment(Offset, Align, Skew);
+
+ if (StackGrowsDown) {
+ DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n");
+ MFI->setObjectOffset(FrameIdx, -Offset); // Set the computed offset
+ } else {
+ DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n");
+ MFI->setObjectOffset(FrameIdx, Offset);
+ Offset += MFI->getObjectSize(FrameIdx);
+ }
+}
+
+/// AssignProtectedObjSet - Helper function to assign large stack objects (i.e.,
+/// those required to be close to the Stack Protector) to stack offsets.
+static void
+AssignProtectedObjSet(const StackObjSet &UnassignedObjs,
+ SmallSet<int, 16> &ProtectedObjs,
+ MachineFrameInfo *MFI, bool StackGrowsDown,
+ int64_t &Offset, unsigned &MaxAlign, unsigned Skew) {
+
+ for (StackObjSet::const_iterator I = UnassignedObjs.begin(),
+ E = UnassignedObjs.end(); I != E; ++I) {
+ int i = *I;
+ AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign, Skew);
+ ProtectedObjs.insert(i);
+ }
+}
+
+/// calculateFrameObjectOffsets - Calculate actual frame offsets for all of the
+/// abstract stack objects.
+///
+void WasmPEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
+ const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
+ StackProtector *SP = &getAnalysis<StackProtector>();
+
+ bool StackGrowsDown =
+ TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
+
+ // Loop over all of the stack objects, assigning sequential addresses...
+ MachineFrameInfo *MFI = Fn.getFrameInfo();
+
+ // Start at the beginning of the local area.
+ // The Offset is the distance from the stack top in the direction
+ // of stack growth -- so it's always nonnegative.
+ int LocalAreaOffset = TFI.getOffsetOfLocalArea();
+ if (StackGrowsDown)
+ LocalAreaOffset = -LocalAreaOffset;
+ assert(LocalAreaOffset >= 0
+ && "Local area offset should be in direction of stack growth");
+ int64_t Offset = LocalAreaOffset;
+
+ // Skew to be applied to alignment.
+ unsigned Skew = TFI.getStackAlignmentSkew(Fn);
+
+ // If there are fixed sized objects that are preallocated in the local area,
+ // non-fixed objects can't be allocated right at the start of local area.
+ // We currently don't support filling in holes in between fixed sized
+ // objects, so we adjust 'Offset' to point to the end of last fixed sized
+ // preallocated object.
+ for (int i = MFI->getObjectIndexBegin(); i != 0; ++i) {
+ int64_t FixedOff;
+ if (StackGrowsDown) {
+      // The maximum distance from the stack pointer is at the lower address
+      // of the object, which is given by its offset. For a downward-growing
+      // stack the offset is negative, so we negate it to get the distance.
+ FixedOff = -MFI->getObjectOffset(i);
+ } else {
+      // The maximum distance from the stack pointer is at the upper
+      // address of the object.
+ FixedOff = MFI->getObjectOffset(i) + MFI->getObjectSize(i);
+ }
+ if (FixedOff > Offset) Offset = FixedOff;
+ }
+
+ // First assign frame offsets to stack objects that are used to spill
+ // callee saved registers.
+ if (StackGrowsDown) {
+ for (unsigned i = MinCSFrameIndex; i <= MaxCSFrameIndex; ++i) {
+ // If the stack grows down, we need to add the size to find the lowest
+ // address of the object.
+ Offset += MFI->getObjectSize(i);
+
+ unsigned Align = MFI->getObjectAlignment(i);
+ // Adjust to alignment boundary
+ Offset = RoundUpToAlignment(Offset, Align, Skew);
+
+ MFI->setObjectOffset(i, -Offset); // Set the computed offset
+ }
+ } else {
+ int MaxCSFI = MaxCSFrameIndex, MinCSFI = MinCSFrameIndex;
+ for (int i = MaxCSFI; i >= MinCSFI ; --i) {
+ unsigned Align = MFI->getObjectAlignment(i);
+ // Adjust to alignment boundary
+ Offset = RoundUpToAlignment(Offset, Align, Skew);
+
+ MFI->setObjectOffset(i, Offset);
+ Offset += MFI->getObjectSize(i);
+ }
+ }
+
+ unsigned MaxAlign = MFI->getMaxAlignment();
+
+ // Make sure the special register scavenging spill slot is closest to the
+ // incoming stack pointer if a frame pointer is required and is closer
+ // to the incoming rather than the final stack pointer.
+ const TargetRegisterInfo *RegInfo = Fn.getSubtarget().getRegisterInfo();
+ bool EarlyScavengingSlots = (TFI.hasFP(Fn) &&
+ TFI.isFPCloseToIncomingSP() &&
+ RegInfo->useFPForScavengingIndex(Fn) &&
+ !RegInfo->needsStackRealignment(Fn));
+ if (RS && EarlyScavengingSlots) {
+ SmallVector<int, 2> SFIs;
+ RS->getScavengingFrameIndices(SFIs);
+ for (SmallVectorImpl<int>::iterator I = SFIs.begin(),
+ IE = SFIs.end(); I != IE; ++I)
+ AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign, Skew);
+ }
+
+  // FIXME: Once this is working, the enable flag will change to a target
+ // check for whether the frame is large enough to want to use virtual
+ // frame index registers. Functions which don't want/need this optimization
+ // will continue to use the existing code path.
+ if (MFI->getUseLocalStackAllocationBlock()) {
+ unsigned Align = MFI->getLocalFrameMaxAlign();
+
+ // Adjust to alignment boundary.
+ Offset = RoundUpToAlignment(Offset, Align, Skew);
+
+ DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
+
+ // Resolve offsets for objects in the local block.
+ for (unsigned i = 0, e = MFI->getLocalFrameObjectCount(); i != e; ++i) {
+ std::pair<int, int64_t> Entry = MFI->getLocalFrameObjectMap(i);
+ int64_t FIOffset = (StackGrowsDown ? -Offset : Offset) + Entry.second;
+ DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" <<
+ FIOffset << "]\n");
+ MFI->setObjectOffset(Entry.first, FIOffset);
+ }
+ // Allocate the local block
+ Offset += MFI->getLocalFrameSize();
+
+ MaxAlign = std::max(Align, MaxAlign);
+ }
+
+ // Make sure that the stack protector comes before the local variables on the
+ // stack.
+ SmallSet<int, 16> ProtectedObjs;
+ if (MFI->getStackProtectorIndex() >= 0) {
+ StackObjSet LargeArrayObjs;
+ StackObjSet SmallArrayObjs;
+ StackObjSet AddrOfObjs;
+
+ AdjustStackOffset(MFI, MFI->getStackProtectorIndex(), StackGrowsDown,
+ Offset, MaxAlign, Skew);
+
+ // Assign large stack objects first.
+ for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
+ if (MFI->isObjectPreAllocated(i) &&
+ MFI->getUseLocalStackAllocationBlock())
+ continue;
+ if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex)
+ continue;
+ if (RS && RS->isScavengingFrameIndex((int)i))
+ continue;
+ if (MFI->isDeadObjectIndex(i))
+ continue;
+ if (MFI->getStackProtectorIndex() == (int)i)
+ continue;
+
+ switch (SP->getSSPLayout(MFI->getObjectAllocation(i))) {
+ case StackProtector::SSPLK_None:
+ continue;
+ case StackProtector::SSPLK_SmallArray:
+ SmallArrayObjs.insert(i);
+ continue;
+ case StackProtector::SSPLK_AddrOf:
+ AddrOfObjs.insert(i);
+ continue;
+ case StackProtector::SSPLK_LargeArray:
+ LargeArrayObjs.insert(i);
+ continue;
+ }
+ llvm_unreachable("Unexpected SSPLayoutKind.");
+ }
+
+ AssignProtectedObjSet(LargeArrayObjs, ProtectedObjs, MFI, StackGrowsDown,
+ Offset, MaxAlign, Skew);
+ AssignProtectedObjSet(SmallArrayObjs, ProtectedObjs, MFI, StackGrowsDown,
+ Offset, MaxAlign, Skew);
+ AssignProtectedObjSet(AddrOfObjs, ProtectedObjs, MFI, StackGrowsDown,
+ Offset, MaxAlign, Skew);
+ }
+
+ // Then assign frame offsets to stack objects that are not used to spill
+ // callee saved registers.
+ for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
+ if (MFI->isObjectPreAllocated(i) &&
+ MFI->getUseLocalStackAllocationBlock())
+ continue;
+ if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex)
+ continue;
+ if (RS && RS->isScavengingFrameIndex((int)i))
+ continue;
+ if (MFI->isDeadObjectIndex(i))
+ continue;
+ if (MFI->getStackProtectorIndex() == (int)i)
+ continue;
+ if (ProtectedObjs.count(i))
+ continue;
+
+ AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign, Skew);
+ }
+
+ // Make sure the special register scavenging spill slot is closest to the
+ // stack pointer.
+ if (RS && !EarlyScavengingSlots) {
+ SmallVector<int, 2> SFIs;
+ RS->getScavengingFrameIndices(SFIs);
+ for (SmallVectorImpl<int>::iterator I = SFIs.begin(),
+ IE = SFIs.end(); I != IE; ++I)
+ AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign, Skew);
+ }
+
+ if (!TFI.targetHandlesStackFrameRounding()) {
+ // If we have reserved argument space for call sites in the function
+ // immediately on entry to the current function, count it as part of the
+ // overall stack size.
+ if (MFI->adjustsStack() && TFI.hasReservedCallFrame(Fn))
+ Offset += MFI->getMaxCallFrameSize();
+
+ // Round up the size to a multiple of the alignment. If the function has
+ // any calls or alloca's, align to the target's StackAlignment value to
+ // ensure that the callee's frame or the alloca data is suitably aligned;
+ // otherwise, for leaf functions, align to the TransientStackAlignment
+ // value.
+ unsigned StackAlign;
+ if (MFI->adjustsStack() || MFI->hasVarSizedObjects() ||
+ (RegInfo->needsStackRealignment(Fn) && MFI->getObjectIndexEnd() != 0))
+ StackAlign = TFI.getStackAlignment();
+ else
+ StackAlign = TFI.getTransientStackAlignment();
+
+ // If the frame pointer is eliminated, all frame offsets will be relative to
+ // SP not FP. Align to MaxAlign so this works.
+ StackAlign = std::max(StackAlign, MaxAlign);
+ Offset = RoundUpToAlignment(Offset, StackAlign, Skew);
+ }
+
+ // Update frame info to pretend that this is part of the stack...
+ int64_t StackSize = Offset - LocalAreaOffset;
+ MFI->setStackSize(StackSize);
+ NumBytesStackSpace += StackSize;
+}
+
+/// insertPrologEpilogCode - Scan the function for modified callee saved
+/// registers, insert spill code for these callee saved registers, then add
+/// prolog and epilog code to the function.
+///
+void WasmPEI::insertPrologEpilogCode(MachineFunction &Fn) {
+ const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
+
+ // Add prologue to the function...
+ for (MachineBasicBlock *SaveBlock : SaveBlocks)
+ TFI.emitPrologue(Fn, *SaveBlock);
+
+ // Add epilogue to restore the callee-save registers in each exiting block.
+ for (MachineBasicBlock *RestoreBlock : RestoreBlocks)
+ TFI.emitEpilogue(Fn, *RestoreBlock);
+
+ for (MachineBasicBlock *SaveBlock : SaveBlocks)
+ TFI.inlineStackProbe(Fn, *SaveBlock);
+
+ // Emit additional code that is required to support segmented stacks, if
+ // we've been asked for it. This, when linked with a runtime with support
+ // for segmented stacks (libgcc is one), will result in allocating stack
+ // space in small chunks instead of one large contiguous block.
+ if (Fn.shouldSplitStack()) {
+ for (MachineBasicBlock *SaveBlock : SaveBlocks)
+ TFI.adjustForSegmentedStacks(Fn, *SaveBlock);
+ }
+
+ // Emit additional code that is required to explicitly handle the stack in
+ // HiPE native code (if needed) when loaded in the Erlang/OTP runtime. The
+ // approach is rather similar to that of Segmented Stacks, but it uses a
+ // different conditional check and another BIF for allocating more stack
+ // space.
+ if (Fn.getFunction()->getCallingConv() == CallingConv::HiPE)
+ for (MachineBasicBlock *SaveBlock : SaveBlocks)
+ TFI.adjustForHiPEPrologue(Fn, *SaveBlock);
+}
+
+/// replaceFrameIndices - Replace all MO_FrameIndex operands with physical
+/// register references and actual offsets.
+///
+void WasmPEI::replaceFrameIndices(MachineFunction &Fn) {
+ const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
+ if (!TFI.needsFrameIndexResolution(Fn)) return;
+
+ // Store SPAdj at exit of a basic block.
+ SmallVector<int, 8> SPState;
+ SPState.resize(Fn.getNumBlockIDs());
+ SmallPtrSet<MachineBasicBlock*, 8> Reachable;
+
+ // Iterate over the reachable blocks in DFS order.
+ for (auto DFI = df_ext_begin(&Fn, Reachable), DFE = df_ext_end(&Fn, Reachable);
+ DFI != DFE; ++DFI) {
+ int SPAdj = 0;
+ // Check the exit state of the DFS stack predecessor.
+ if (DFI.getPathLength() >= 2) {
+ MachineBasicBlock *StackPred = DFI.getPath(DFI.getPathLength() - 2);
+ assert(Reachable.count(StackPred) &&
+ "DFS stack predecessor is already visited.\n");
+ SPAdj = SPState[StackPred->getNumber()];
+ }
+ MachineBasicBlock *BB = *DFI;
+ replaceFrameIndices(BB, Fn, SPAdj);
+ SPState[BB->getNumber()] = SPAdj;
+ }
+
+ // Handle the unreachable blocks.
+ for (auto &BB : Fn) {
+ if (Reachable.count(&BB))
+ // Already handled in DFS traversal.
+ continue;
+ int SPAdj = 0;
+ replaceFrameIndices(&BB, Fn, SPAdj);
+ }
+}
+
+void WasmPEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
+ int &SPAdj) {
+ assert(Fn.getSubtarget().getRegisterInfo() &&
+ "getRegisterInfo() must be implemented!");
+ const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
+ const TargetRegisterInfo &TRI = *Fn.getSubtarget().getRegisterInfo();
+ const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
+ unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode();
+ unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
+
+ if (RS && !FrameIndexVirtualScavenging) RS->enterBasicBlock(BB);
+
+ bool InsideCallSequence = false;
+
+ for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
+
+ if (I->getOpcode() == FrameSetupOpcode ||
+ I->getOpcode() == FrameDestroyOpcode) {
+ InsideCallSequence = (I->getOpcode() == FrameSetupOpcode);
+ SPAdj += TII.getSPAdjust(I);
+
+ MachineBasicBlock::iterator PrevI = BB->end();
+ if (I != BB->begin()) PrevI = std::prev(I);
+ TFI->eliminateCallFramePseudoInstr(Fn, *BB, I);
+
+ // Visit the instructions created by eliminateCallFramePseudoInstr().
+ if (PrevI == BB->end())
+ I = BB->begin(); // The replaced instr was the first in the block.
+ else
+ I = std::next(PrevI);
+ continue;
+ }
+
+ MachineInstr *MI = I;
+ bool DoIncr = true;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ if (!MI->getOperand(i).isFI())
+ continue;
+
+ // Frame indices in debug values are encoded in a target independent
+ // way with simply the frame index and offset rather than any
+ // target-specific addressing mode.
+ if (MI->isDebugValue()) {
+ assert(i == 0 && "Frame indices can only appear as the first "
+ "operand of a DBG_VALUE machine instruction");
+ unsigned Reg;
+ MachineOperand &Offset = MI->getOperand(1);
+ Offset.setImm(Offset.getImm() +
+ TFI->getFrameIndexReference(
+ Fn, MI->getOperand(0).getIndex(), Reg));
+ MI->getOperand(0).ChangeToRegister(Reg, false /*isDef*/);
+ continue;
+ }
+
+ // TODO: This code should be commoned with the code for
+ // PATCHPOINT. There's no good reason for the difference in
+ // implementation other than historical accident. The only
+ // remaining difference is the unconditional use of the stack
+ // pointer as the base register.
+ if (MI->getOpcode() == TargetOpcode::STATEPOINT) {
+ assert((!MI->isDebugValue() || i == 0) &&
+               "Frame indices can only appear as the first operand of a "
+ "DBG_VALUE machine instruction");
+ unsigned Reg;
+ MachineOperand &Offset = MI->getOperand(i + 1);
+ const unsigned refOffset =
+ TFI->getFrameIndexReferenceFromSP(Fn, MI->getOperand(i).getIndex(),
+ Reg);
+
+ Offset.setImm(Offset.getImm() + refOffset);
+ MI->getOperand(i).ChangeToRegister(Reg, false /*isDef*/);
+ continue;
+ }
+
+ // Some instructions (e.g. inline asm instructions) can have
+ // multiple frame indices and/or cause eliminateFrameIndex
+ // to insert more than one instruction. We need the register
+ // scavenger to go through all of these instructions so that
+ // it can update its register information. We keep the
+ // iterator at the point before insertion so that we can
+ // revisit them in full.
+ bool AtBeginning = (I == BB->begin());
+ if (!AtBeginning) --I;
+
+ // If this instruction has a FrameIndex operand, we need to
+ // use that target machine register info object to eliminate
+ // it.
+ TRI.eliminateFrameIndex(MI, SPAdj, i,
+ FrameIndexVirtualScavenging ? nullptr : RS);
+
+ // Reset the iterator if we were at the beginning of the BB.
+ if (AtBeginning) {
+ I = BB->begin();
+ DoIncr = false;
+ }
+
+ MI = nullptr;
+ break;
+ }
+
+ // If we are looking at a call sequence, we need to keep track of
+ // the SP adjustment made by each instruction in the sequence.
+ // This includes both the frame setup/destroy pseudos (handled above),
+ // as well as other instructions that have side effects w.r.t the SP.
+ // Note that this must come after eliminateFrameIndex, because
+ // if I itself referred to a frame index, we shouldn't count its own
+ // adjustment.
+ if (MI && InsideCallSequence)
+ SPAdj += TII.getSPAdjust(MI);
+
+ if (DoIncr && I != BB->end()) ++I;
+
+ // Update register states.
+ if (RS && !FrameIndexVirtualScavenging && MI) RS->forward(MI);
+ }
+}
+
+/// scavengeFrameVirtualRegs - Replace all frame index virtual registers
+/// with physical registers. Use the register scavenger to find an
+/// appropriate register to use.
+///
+/// FIXME: Iterating over the instruction stream is unnecessary. We can simply
+/// iterate over the vreg use list, which at this point only contains machine
+/// operands for which eliminateFrameIndex needs a new scratch reg.
+void
+WasmPEI::scavengeFrameVirtualRegs(MachineFunction &Fn) {
+ // Run through the instructions and find any virtual registers.
+ for (MachineFunction::iterator BB = Fn.begin(),
+ E = Fn.end(); BB != E; ++BB) {
+ RS->enterBasicBlock(&*BB);
+
+ int SPAdj = 0;
+
+ // The instruction stream may change in the loop, so check BB->end()
+ // directly.
+ for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
+      // We might end up here again with a NULL iterator if we scavenged a
+      // register for which we inserted spill code for a definition by what
+      // was originally the first instruction in BB.
+ if (I == MachineBasicBlock::iterator(nullptr))
+ I = BB->begin();
+
+ MachineInstr *MI = I;
+ MachineBasicBlock::iterator J = std::next(I);
+ MachineBasicBlock::iterator P =
+ I == BB->begin() ? MachineBasicBlock::iterator(nullptr)
+ : std::prev(I);
+
+ // RS should process this instruction before we might scavenge at this
+ // location. This is because we might be replacing a virtual register
+ // defined by this instruction, and if so, registers killed by this
+ // instruction are available, and defined registers are not.
+ RS->forward(I);
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ if (MI->getOperand(i).isReg()) {
+ MachineOperand &MO = MI->getOperand(i);
+ unsigned Reg = MO.getReg();
+ if (Reg == 0)
+ continue;
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+
+ // When we first encounter a new virtual register, it
+ // must be a definition.
+ assert(MI->getOperand(i).isDef() &&
+ "frame index virtual missing def!");
+ // Scavenge a new scratch register
+ const TargetRegisterClass *RC = Fn.getRegInfo().getRegClass(Reg);
+ unsigned ScratchReg = RS->scavengeRegister(RC, J, SPAdj);
+
+ ++NumScavengedRegs;
+
+ // Replace this reference to the virtual register with the
+ // scratch register.
+ assert (ScratchReg && "Missing scratch register!");
+ Fn.getRegInfo().replaceRegWith(Reg, ScratchReg);
+
+ // Because this instruction was processed by the RS before this
+ // register was allocated, make sure that the RS now records the
+ // register as being used.
+ RS->setRegUsed(ScratchReg);
+ }
+ }
+
+ // If the scavenger needed to use one of its spill slots, the
+ // spill code will have been inserted in between I and J. This is a
+ // problem because we need the spill code before I: Move I to just
+ // prior to J.
+ if (I != std::prev(J)) {
+ BB->splice(J, &*BB, I);
+
+ // Before we move I, we need to prepare the RS to visit I again.
+ // Specifically, RS will assert if it sees uses of registers that
+ // it believes are undefined. Because we have already processed
+ // register kills in I, when it visits I again, it will believe that
+ // those registers are undefined. To avoid this situation, unprocess
+ // the instruction I.
+ assert(RS->getCurrentPosition() == I &&
+ "The register scavenger has an unexpected position");
+ I = P;
+ RS->unprocess(P);
+ } else
+ ++I;
+ }
+ }
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
new file mode 100644
index 000000000000..4ad6eed7385b
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
@@ -0,0 +1,86 @@
+//===-- WebAssemblyPeephole.cpp - WebAssembly Peephole Optimizations ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Late peephole optimizations for WebAssembly.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-peephole"
+
+namespace {
+class WebAssemblyPeephole final : public MachineFunctionPass {
+ const char *getPassName() const override {
+ return "WebAssembly late peephole optimizer";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+ static char ID;
+ WebAssemblyPeephole() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyPeephole::ID = 0;
+FunctionPass *llvm::createWebAssemblyPeephole() {
+ return new WebAssemblyPeephole();
+}
+
+bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) {
+ bool Changed = false;
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+
+ for (auto &MBB : MF)
+ for (auto &MI : MBB)
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ case WebAssembly::STORE8_I32:
+ case WebAssembly::STORE16_I32:
+ case WebAssembly::STORE8_I64:
+ case WebAssembly::STORE16_I64:
+ case WebAssembly::STORE32_I64:
+ case WebAssembly::STORE_F32:
+ case WebAssembly::STORE_F64:
+ case WebAssembly::STORE_I32:
+ case WebAssembly::STORE_I64: {
+        // Store instructions return their value operand. If we ended up using
+        // the same register for both the stored value and the result, replace
+        // the result with a dead def so that it can use $discard instead.
+ MachineOperand &MO = MI.getOperand(0);
+ unsigned OldReg = MO.getReg();
+ // TODO: Handle SP/physregs
+        if (OldReg == MI.getOperand(3).getReg() &&
+            TargetRegisterInfo::isVirtualRegister(MI.getOperand(3).getReg())) {
+ Changed = true;
+ unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
+ MO.setReg(NewReg);
+ MO.setIsDead();
+ MFI.stackifyVReg(NewReg);
+ MFI.addWAReg(NewReg, WebAssemblyFunctionInfo::UnusedReg);
+ }
+ }
+ }
+
+ return Changed;
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
new file mode 100644
index 000000000000..9ec66595d8da
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
@@ -0,0 +1,175 @@
+//===-- WebAssemblyRegColoring.cpp - Register coloring --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a virtual register coloring pass.
+///
+/// WebAssembly doesn't have a fixed number of registers, but it is still
+/// desirable to minimize the total number of registers used in each function.
+///
+/// This code is modeled after lib/CodeGen/StackSlotColoring.cpp.
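+///
+/// For example (schematically), if two virtual registers have the same
+/// register class and their live ranges never overlap, the pass can assign
+/// them the same WebAssembly register, shrinking the function's register set.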
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-reg-coloring"
+
+namespace {
+class WebAssemblyRegColoring final : public MachineFunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyRegColoring() : MachineFunctionPass(ID) {}
+
+ const char *getPassName() const override {
+ return "WebAssembly Register Coloring";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<LiveIntervals>();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addPreserved<MachineBlockFrequencyInfo>();
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+};
+} // end anonymous namespace
+
+char WebAssemblyRegColoring::ID = 0;
+FunctionPass *llvm::createWebAssemblyRegColoring() {
+ return new WebAssemblyRegColoring();
+}
+
+// Compute the total spill weight for VReg.
+static float computeWeight(const MachineRegisterInfo *MRI,
+ const MachineBlockFrequencyInfo *MBFI,
+ unsigned VReg) {
+ float weight = 0.0f;
+ for (MachineOperand &MO : MRI->reg_nodbg_operands(VReg))
+ weight += LiveIntervals::getSpillWeight(MO.isDef(), MO.isUse(), MBFI,
+ MO.getParent());
+ return weight;
+}
+
+bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG({
+ dbgs() << "********** Register Coloring **********\n"
+ << "********** Function: " << MF.getName() << '\n';
+ });
+
+ // If there are calls to setjmp or sigsetjmp, don't perform coloring. Virtual
+ // registers could be modified before the longjmp is executed, resulting in
+ // the wrong value being used afterwards. (See <rdar://problem/8007500>.)
+ // TODO: Does WebAssembly need to care about setjmp for register coloring?
+ if (MF.exposesReturnsTwice())
+ return false;
+
+ MachineRegisterInfo *MRI = &MF.getRegInfo();
+ LiveIntervals *Liveness = &getAnalysis<LiveIntervals>();
+ const MachineBlockFrequencyInfo *MBFI =
+ &getAnalysis<MachineBlockFrequencyInfo>();
+ WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+
+ // Gather all register intervals into a list and sort them.
+ unsigned NumVRegs = MRI->getNumVirtRegs();
+ SmallVector<LiveInterval *, 0> SortedIntervals;
+ SortedIntervals.reserve(NumVRegs);
+
+ DEBUG(dbgs() << "Interesting register intervals:\n");
+ for (unsigned i = 0; i < NumVRegs; ++i) {
+ unsigned VReg = TargetRegisterInfo::index2VirtReg(i);
+ if (MFI.isVRegStackified(VReg))
+ continue;
+ // Skip unused registers, which can use $discard.
+ if (MRI->use_empty(VReg))
+ continue;
+
+ LiveInterval *LI = &Liveness->getInterval(VReg);
+ assert(LI->weight == 0.0f);
+ LI->weight = computeWeight(MRI, MBFI, VReg);
+ DEBUG(LI->dump());
+ SortedIntervals.push_back(LI);
+ }
+ DEBUG(dbgs() << '\n');
+
+ // Sort them to put arguments first (since we don't want to rename live-in
+ // registers), by weight next, and then by position.
+ // TODO: Investigate more intelligent sorting heuristics. For starters, we
+ // should try to coalesce adjacent live intervals before non-adjacent ones.
+ std::sort(SortedIntervals.begin(), SortedIntervals.end(),
+ [MRI](LiveInterval *LHS, LiveInterval *RHS) {
+ if (MRI->isLiveIn(LHS->reg) != MRI->isLiveIn(RHS->reg))
+ return MRI->isLiveIn(LHS->reg);
+ if (LHS->weight != RHS->weight)
+ return LHS->weight > RHS->weight;
+ if (LHS->empty() || RHS->empty())
+ return !LHS->empty() && RHS->empty();
+ return *LHS < *RHS;
+ });
+
+ DEBUG(dbgs() << "Coloring register intervals:\n");
+ SmallVector<unsigned, 16> SlotMapping(SortedIntervals.size(), -1u);
+ SmallVector<SmallVector<LiveInterval *, 4>, 16> Assignments(
+ SortedIntervals.size());
+ BitVector UsedColors(SortedIntervals.size());
+ bool Changed = false;
+ for (size_t i = 0, e = SortedIntervals.size(); i < e; ++i) {
+ LiveInterval *LI = SortedIntervals[i];
+ unsigned Old = LI->reg;
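+    // Start with a fresh color (this interval's own index); the loop below
+    // may replace it with an already-used color when one can be shared.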
+ size_t Color = i;
+ const TargetRegisterClass *RC = MRI->getRegClass(Old);
+
+ // Check if it's possible to reuse any of the used colors.
+ if (!MRI->isLiveIn(Old))
+ for (int C(UsedColors.find_first()); C != -1;
+ C = UsedColors.find_next(C)) {
+ if (MRI->getRegClass(SortedIntervals[C]->reg) != RC)
+ continue;
+ for (LiveInterval *OtherLI : Assignments[C])
+ if (!OtherLI->empty() && OtherLI->overlaps(*LI))
+ goto continue_outer;
+ Color = C;
+ break;
+ continue_outer:;
+ }
+
+ unsigned New = SortedIntervals[Color]->reg;
+ SlotMapping[i] = New;
+ Changed |= Old != New;
+ UsedColors.set(Color);
+ Assignments[Color].push_back(LI);
+ DEBUG(dbgs() << "Assigning vreg"
+ << TargetRegisterInfo::virtReg2Index(LI->reg) << " to vreg"
+ << TargetRegisterInfo::virtReg2Index(New) << "\n");
+ }
+ if (!Changed)
+ return false;
+
+ // Rewrite register operands.
+ for (size_t i = 0, e = SortedIntervals.size(); i < e; ++i) {
+ unsigned Old = SortedIntervals[i]->reg;
+ unsigned New = SlotMapping[i];
+ if (Old != New)
+ MRI->replaceRegWith(Old, New);
+ }
+ return true;
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp b/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
new file mode 100644
index 000000000000..f621db070b5b
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
@@ -0,0 +1,109 @@
+//===-- WebAssemblyRegNumbering.cpp - Register Numbering ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a pass which assigns WebAssembly register
+/// numbers for CodeGen virtual registers.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-reg-numbering"
+
+namespace {
+class WebAssemblyRegNumbering final : public MachineFunctionPass {
+ const char *getPassName() const override {
+ return "WebAssembly Register Numbering";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyRegNumbering() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyRegNumbering::ID = 0;
+FunctionPass *llvm::createWebAssemblyRegNumbering() {
+ return new WebAssemblyRegNumbering();
+}
+
+bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "********** Register Numbering **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
+
+ WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const MachineFrameInfo &FrameInfo = *MF.getFrameInfo();
+
+ MFI.initWARegs();
+
+ // WebAssembly argument registers are in the same index space as local
+ // variables. Assign the numbers for them first.
+ MachineBasicBlock &EntryMBB = MF.front();
+ for (MachineInstr &MI : EntryMBB) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::ARGUMENT_I32:
+ case WebAssembly::ARGUMENT_I64:
+ case WebAssembly::ARGUMENT_F32:
+ case WebAssembly::ARGUMENT_F64:
+ MFI.setWAReg(MI.getOperand(0).getReg(), MI.getOperand(1).getImm());
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Then assign regular WebAssembly registers for all remaining used
+ // virtual registers. TODO: Consider sorting the registers by frequency of
+ // use, to maximize usage of small immediate fields.
+ unsigned NumArgRegs = MFI.getParams().size();
+ unsigned NumVRegs = MF.getRegInfo().getNumVirtRegs();
+ unsigned NumStackRegs = 0;
+ unsigned CurReg = 0;
+ for (unsigned VRegIdx = 0; VRegIdx < NumVRegs; ++VRegIdx) {
+ unsigned VReg = TargetRegisterInfo::index2VirtReg(VRegIdx);
+ // Handle stackified registers.
+ if (MFI.isVRegStackified(VReg)) {
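+      // Stackified values are tagged with the sign bit (INT32_MIN) so their
+      // numbering stays disjoint from the ordinary local register numbers.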
+ MFI.setWAReg(VReg, INT32_MIN | NumStackRegs++);
+ continue;
+ }
+ // Skip unused registers.
+ if (MRI.use_empty(VReg))
+ continue;
+ if (MFI.getWAReg(VReg) == WebAssemblyFunctionInfo::UnusedReg)
+ MFI.setWAReg(VReg, NumArgRegs + CurReg++);
+ }
+ // Allocate locals for used physical registers
+ if (FrameInfo.getStackSize() > 0)
+ MFI.addPReg(WebAssembly::SP32, CurReg++);
+
+ return true;
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
new file mode 100644
index 000000000000..89ef5cdb2bef
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -0,0 +1,265 @@
+//===-- WebAssemblyRegStackify.cpp - Register Stackification --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a register stacking pass.
+///
+/// This pass reorders instructions to put register uses and defs in an order
+/// such that they form single-use expression trees. Registers fitting this form
+/// are then marked as "stackified", meaning references to them are replaced by
+/// "push" and "pop" from the stack.
+///
+/// This is primarily a code size optimization, since temporary values on the
+/// expression stack don't need to be named.
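+///
+/// For example, a single-use operand such as a constant ends up nested
+/// directly inside its consumer (conceptually "i32.add (i32.const 40), x")
+/// rather than being assigned to a named register first.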
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" // for WebAssembly::ARGUMENT_*
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-reg-stackify"
+
+namespace {
+class WebAssemblyRegStackify final : public MachineFunctionPass {
+ const char *getPassName() const override {
+ return "WebAssembly Register Stackify";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<MachineBlockFrequencyInfo>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreservedID(MachineDominatorsID);
+ AU.addPreservedID(LiveVariablesID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyRegStackify() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyRegStackify::ID = 0;
+FunctionPass *llvm::createWebAssemblyRegStackify() {
+ return new WebAssemblyRegStackify();
+}
+
+// Decorate the given instruction with implicit operands that enforce the
+// expression stack ordering constraints for an instruction which is on
+// the expression stack.
+static void ImposeStackOrdering(MachineInstr *MI) {
+ // Write the opaque EXPR_STACK register.
+ if (!MI->definesRegister(WebAssembly::EXPR_STACK))
+ MI->addOperand(MachineOperand::CreateReg(WebAssembly::EXPR_STACK,
+ /*isDef=*/true,
+ /*isImp=*/true));
+
+ // Also read the opaque EXPR_STACK register.
+ if (!MI->readsRegister(WebAssembly::EXPR_STACK))
+ MI->addOperand(MachineOperand::CreateReg(WebAssembly::EXPR_STACK,
+ /*isDef=*/false,
+ /*isImp=*/true));
+}
+
+// Test whether it's safe to move Def to just before Insert.
+// TODO: Compute memory dependencies in a way that doesn't require always
+// walking the block.
+// TODO: Compute memory dependencies in a way that uses AliasAnalysis to be
+// more precise.
+static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
+ AliasAnalysis &AA, LiveIntervals &LIS,
+ MachineRegisterInfo &MRI) {
+ assert(Def->getParent() == Insert->getParent());
+ bool SawStore = false, SawSideEffects = false;
+ MachineBasicBlock::const_iterator D(Def), I(Insert);
+
+ // Check for register dependencies.
+ for (const MachineOperand &MO : Def->operands()) {
+ if (!MO.isReg() || MO.isUndef())
+ continue;
+ unsigned Reg = MO.getReg();
+
+ // If the register is dead here and at Insert, ignore it.
+ if (MO.isDead() && Insert->definesRegister(Reg) &&
+ !Insert->readsRegister(Reg))
+ continue;
+
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ // If the physical register is never modified, ignore it.
+ if (!MRI.isPhysRegModified(Reg))
+ continue;
+ // Otherwise, it's a physical register with unknown liveness.
+ return false;
+ }
+
+    // Ask LiveIntervals whether moving this virtual register use or def to
+    // Insert will change which value numbers are seen.
+ const LiveInterval &LI = LIS.getInterval(Reg);
+ VNInfo *DefVNI = MO.isDef() ?
+ LI.getVNInfoAt(LIS.getInstructionIndex(Def).getRegSlot()) :
+ LI.getVNInfoBefore(LIS.getInstructionIndex(Def));
+ assert(DefVNI && "Instruction input missing value number");
+ VNInfo *InsVNI = LI.getVNInfoBefore(LIS.getInstructionIndex(Insert));
+ if (InsVNI && DefVNI != InsVNI)
+ return false;
+ }
+
+ // Check for memory dependencies and side effects.
+ for (--I; I != D; --I)
+ SawSideEffects |= I->isSafeToMove(&AA, SawStore);
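+  // Moving Def down past the scanned instructions is unsafe if an intervening
+  // store could conflict with a load in Def, or if intervening side effects
+  // could interfere with Def's own side effects.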
+ return !(SawStore && Def->mayLoad() && !Def->isInvariantLoad(&AA)) &&
+ !(SawSideEffects && !Def->isSafeToMove(&AA, SawStore));
+}
+
+bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "********** Register Stackifying **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
+
+ bool Changed = false;
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+ AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+
+ // Walk the instructions from the bottom up. Currently we don't look past
+ // block boundaries, and the blocks aren't ordered so the block visitation
+ // order isn't significant, but we may want to change this in the future.
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : reverse(MBB)) {
+ MachineInstr *Insert = &MI;
+ // Don't nest anything inside a phi.
+ if (Insert->getOpcode() == TargetOpcode::PHI)
+ break;
+
+ // Don't nest anything inside an inline asm, because we don't have
+ // constraints for $push inputs.
+ if (Insert->getOpcode() == TargetOpcode::INLINEASM)
+ break;
+
+ // Iterate through the inputs in reverse order, since we'll be pulling
+ // operands off the stack in LIFO order.
+ bool AnyStackified = false;
+ for (MachineOperand &Op : reverse(Insert->uses())) {
+ // We're only interested in explicit virtual register operands.
+ if (!Op.isReg() || Op.isImplicit() || !Op.isUse())
+ continue;
+
+ unsigned Reg = Op.getReg();
+
+ // Only consider registers with a single definition.
+ // TODO: Eventually we may relax this, to stackify phi transfers.
+ MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
+ if (!Def)
+ continue;
+
+ // There's no use in nesting implicit defs inside anything.
+ if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF)
+ continue;
+
+ // Don't nest an INLINE_ASM def into anything, because we don't have
+ // constraints for $pop outputs.
+ if (Def->getOpcode() == TargetOpcode::INLINEASM)
+ continue;
+
+ // Don't nest PHIs inside of anything.
+ if (Def->getOpcode() == TargetOpcode::PHI)
+ continue;
+
+ // Argument instructions represent live-in registers and not real
+ // instructions.
+ if (Def->getOpcode() == WebAssembly::ARGUMENT_I32 ||
+ Def->getOpcode() == WebAssembly::ARGUMENT_I64 ||
+ Def->getOpcode() == WebAssembly::ARGUMENT_F32 ||
+ Def->getOpcode() == WebAssembly::ARGUMENT_F64)
+ continue;
+
+ // Single-use expression trees require defs that have one use.
+ // TODO: Eventually we'll relax this, to take advantage of set_local
+ // returning its result.
+ if (!MRI.hasOneUse(Reg))
+ continue;
+
+ // For now, be conservative and don't look across block boundaries.
+ // TODO: Be more aggressive?
+ if (Def->getParent() != &MBB)
+ continue;
+
+ // Don't move instructions that have side effects or memory dependencies
+ // or other complications.
+ if (!IsSafeToMove(Def, Insert, AA, LIS, MRI))
+ continue;
+
+ Changed = true;
+ AnyStackified = true;
+ // Move the def down and nest it in the current instruction.
+ MBB.splice(Insert, &MBB, Def);
+ LIS.handleMove(Def);
+ MFI.stackifyVReg(Reg);
+ ImposeStackOrdering(Def);
+ Insert = Def;
+ }
+ if (AnyStackified)
+ ImposeStackOrdering(&MI);
+ }
+ }
+
+ // If we used EXPR_STACK anywhere, add it to the live-in sets everywhere
+ // so that it never looks like a use-before-def.
+ if (Changed) {
+ MF.getRegInfo().addLiveIn(WebAssembly::EXPR_STACK);
+ for (MachineBasicBlock &MBB : MF)
+ MBB.addLiveIn(WebAssembly::EXPR_STACK);
+ }
+
+#ifndef NDEBUG
+  // Verify that pushes and pops are performed in LIFO order.
+ SmallVector<unsigned, 0> Stack;
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ for (MachineOperand &MO : reverse(MI.explicit_operands())) {
+ if (!MO.isReg())
+ continue;
+ unsigned VReg = MO.getReg();
+
+ // Don't stackify physregs like SP or FP.
+ if (!TargetRegisterInfo::isVirtualRegister(VReg))
+ continue;
+
+ if (MFI.isVRegStackified(VReg)) {
+ if (MO.isDef())
+ Stack.push_back(VReg);
+ else
+ assert(Stack.pop_back_val() == VReg);
+ }
+ }
+ }
+ // TODO: Generalize this code to support keeping values on the stack across
+ // basic block boundaries.
+ assert(Stack.empty());
+ }
+#endif
+
+ return Changed;
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
index 385c40bf6693..dcada45f96d1 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
@@ -43,7 +43,7 @@ WebAssemblyRegisterInfo::getCalleeSavedRegs(const MachineFunction *) const {
}
BitVector
-WebAssemblyRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+WebAssemblyRegisterInfo::getReservedRegs(const MachineFunction & /*MF*/) const {
BitVector Reserved(getNumRegs());
for (auto Reg : {WebAssembly::SP32, WebAssembly::SP64, WebAssembly::FP32,
WebAssembly::FP64})
@@ -52,9 +52,37 @@ WebAssemblyRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
}
void WebAssemblyRegisterInfo::eliminateFrameIndex(
- MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum,
- RegScavenger *RS) const {
- llvm_unreachable("WebAssemblyRegisterInfo::eliminateFrameIndex"); // FIXME
+ MachineBasicBlock::iterator II, int SPAdj,
+ unsigned FIOperandNum, RegScavenger * /*RS*/) const {
+ assert(SPAdj == 0);
+ MachineInstr &MI = *II;
+
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  const MachineFrameInfo &MFI = *MF.getFrameInfo();
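+  // Object offsets are relative to the incoming SP and are typically negative
+  // here; adding the allocated stack size converts them into an offset from
+  // the SP after the prologue has allocated the frame.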
+ int FrameOffset = MFI.getStackSize() + MFI.getObjectOffset(FrameIndex);
+
+ if (MI.mayLoadOrStore()) {
+ // If this is a load or store, make it relative to SP and fold the frame
+ // offset directly in
+ assert(MI.getOperand(1).getImm() == 0 &&
+ "Can't eliminate FI yet if offset is already set");
+ MI.getOperand(1).setImm(FrameOffset);
+ MI.getOperand(2).ChangeToRegister(WebAssembly::SP32, /*IsDef=*/false);
+ } else {
+ // Otherwise create an i32.add SP, offset and make it the operand
+ auto &MRI = MF.getRegInfo();
+ const auto *TII = MF.getSubtarget().getInstrInfo();
+
+ unsigned OffsetReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(WebAssembly::CONST_I32), OffsetReg)
+ .addImm(FrameOffset);
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(WebAssembly::ADD_I32), OffsetReg)
+ .addReg(WebAssembly::SP32)
+ .addReg(OffsetReg);
+ MI.getOperand(FIOperandNum).ChangeToRegister(OffsetReg, /*IsDef=*/false);
+ }
}
unsigned
@@ -67,21 +95,11 @@ WebAssemblyRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
return Regs[TFI->hasFP(MF)][TT.isArch64Bit()];
}
-bool WebAssemblyRegisterInfo::canRealignStack(const MachineFunction &MF) const {
- return !MF.getFunction()->hasFnAttribute("no-realign-stack");
-}
-
-// FIXME: share this with other backends with identical implementation?
-bool WebAssemblyRegisterInfo::needsStackRealignment(
- const MachineFunction &MF) const {
- const MachineFrameInfo *MFI = MF.getFrameInfo();
- const WebAssemblyFrameLowering *TFI = getFrameLowering(MF);
- const Function *F = MF.getFunction();
- unsigned StackAlign = TFI->getStackAlignment();
- bool requiresRealignment =
- ((MFI->getMaxAlignment() > StackAlign) ||
- F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
- Attribute::StackAlignment));
-
- return requiresRealignment && canRealignStack(MF);
+const TargetRegisterClass *
+WebAssemblyRegisterInfo::getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind) const {
+ assert(Kind == 0 && "Only one kind of pointer on WebAssembly");
+ if (MF.getSubtarget<WebAssemblySubtarget>().hasAddr64())
+ return &WebAssembly::I64RegClass;
+ return &WebAssembly::I32RegClass;
}
diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h
index dbdb9d0457af..ad1d71eebf22 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h
@@ -42,9 +42,9 @@ public:
// Debug information queries.
unsigned getFrameRegister(const MachineFunction &MF) const override;
- // Base pointer (stack realignment) support.
- bool canRealignStack(const MachineFunction &MF) const;
- bool needsStackRealignment(const MachineFunction &MF) const override;
+ const TargetRegisterClass *
+ getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind = 0) const override;
};
} // end namespace llvm
diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
index 2ba42eb94a40..80a83fa76b57 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
@@ -33,22 +33,26 @@ def FP64 : WebAssemblyReg<"%FP64">;
def SP32 : WebAssemblyReg<"%SP32">;
def SP64 : WebAssemblyReg<"%SP64">;
-// TODO(jfb) The following comes from NVPTX. Is it really needed, or can we do
-// away with it? Try deleting once the backend works.
-// WebAssembly uses virtual registers, but the backend defines a few physical
-// registers here to keep SDAG and the MachineInstr layers happy.
-foreach i = 0-4 in {
- def I#i : WebAssemblyReg<"%i."#i>; // i32
- def L#i : WebAssemblyReg<"%l."#i>; // i64
- def F#i : WebAssemblyReg<"%f."#i>; // f32
- def D#i : WebAssemblyReg<"%d."#i>; // f64
-}
+// The register allocation framework requires that register classes have at
+// least one register, so we define a few for the floating-point register
+// classes since we otherwise don't need a physical register in those classes.
+def F32_0 : WebAssemblyReg<"%f32.0">;
+def F64_0 : WebAssemblyReg<"%f64.0">;
+
+// The expression stack "register". This is an opaque entity which serves to
+// order uses and defs that must remain in LIFO order.
+def EXPR_STACK : WebAssemblyReg<"STACK">;
+
+// The incoming arguments "register". This is an opaque entity which serves to
+// order the ARGUMENT instructions that are emulating live-in registers and
+// must not be scheduled below other instructions.
+def ARGUMENTS : WebAssemblyReg<"ARGUMENTS">;
//===----------------------------------------------------------------------===//
// Register classes
//===----------------------------------------------------------------------===//
-def Int32 : WebAssemblyRegClass<[i32], 32, (add (sequence "I%u", 0, 4), SP32)>;
-def Int64 : WebAssemblyRegClass<[i64], 64, (add (sequence "L%u", 0, 4), SP64)>;
-def Float32 : WebAssemblyRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>;
-def Float64 : WebAssemblyRegClass<[f64], 64, (add (sequence "D%u", 0, 4))>;
+def I32 : WebAssemblyRegClass<[i32], 32, (add FP32, SP32)>;
+def I64 : WebAssemblyRegClass<[i64], 64, (add FP64, SP64)>;
+def F32 : WebAssemblyRegClass<[f32], 32, (add F32_0)>;
+def F64 : WebAssemblyRegClass<[f64], 64, (add F64_0)>;
diff --git a/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
new file mode 100644
index 000000000000..4e08b2b079eb
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
@@ -0,0 +1,124 @@
+//===-- WebAssemblyStoreResults.cpp - Optimize using store result values --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements an optimization pass using store result values.
+///
+/// WebAssembly's store instructions return the stored value. This is to enable
+/// an optimization wherein uses of the stored value can be replaced by uses of
+/// the store's result value, making the stored value register more likely to
+/// be single-use, thus more likely to be useful to register stackifying, and
+/// potentially also exposing the store to register stackifying. These both can
+/// reduce get_local/set_local traffic.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-store-results"
+
+namespace {
+class WebAssemblyStoreResults final : public MachineFunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyStoreResults() : MachineFunctionPass(ID) {}
+
+ const char *getPassName() const override {
+ return "WebAssembly Store Results";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addPreserved<MachineBlockFrequencyInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+};
+} // end anonymous namespace
+
+char WebAssemblyStoreResults::ID = 0;
+FunctionPass *llvm::createWebAssemblyStoreResults() {
+ return new WebAssemblyStoreResults();
+}
+
+bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG({
+ dbgs() << "********** Store Results **********\n"
+ << "********** Function: " << MF.getName() << '\n';
+ });
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
+ bool Changed = false;
+
+ assert(MRI.isSSA() && "StoreResults depends on SSA form");
+
+ for (auto &MBB : MF) {
+ DEBUG(dbgs() << "Basic Block: " << MBB.getName() << '\n');
+ for (auto &MI : MBB)
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ case WebAssembly::STORE8_I32:
+ case WebAssembly::STORE16_I32:
+ case WebAssembly::STORE8_I64:
+ case WebAssembly::STORE16_I64:
+ case WebAssembly::STORE32_I64:
+ case WebAssembly::STORE_F32:
+ case WebAssembly::STORE_F64:
+ case WebAssembly::STORE_I32:
+ case WebAssembly::STORE_I64:
+ unsigned ToReg = MI.getOperand(0).getReg();
+ unsigned FromReg = MI.getOperand(3).getReg();
+ for (auto I = MRI.use_begin(FromReg), E = MRI.use_end(); I != E;) {
+ MachineOperand &O = *I++;
+ MachineInstr *Where = O.getParent();
+ if (Where->getOpcode() == TargetOpcode::PHI) {
+ // PHIs use their operands on their incoming CFG edges rather than
+ // in their parent blocks. Get the basic block paired with this use
+ // of FromReg and check that MI's block dominates it.
+ MachineBasicBlock *Pred =
+ Where->getOperand(&O - &Where->getOperand(0) + 1).getMBB();
+ if (!MDT.dominates(&MBB, Pred))
+ continue;
+ } else {
+ // For a non-PHI, check that MI dominates the instruction in the
+ // normal way.
+ if (&MI == Where || !MDT.dominates(&MI, Where))
+ continue;
+ }
+ Changed = true;
+ DEBUG(dbgs() << "Setting operand " << O << " in " << *Where
+ << " from " << MI << "\n");
+ O.setReg(ToReg);
+ // If the store's def was previously dead, it is no longer. But the
+ // dead flag shouldn't be set yet.
+ assert(!MI.getOperand(0).isDead() && "Dead flag set on store result");
+ }
+ }
+ }
+
+ return Changed;
+}
diff --git a/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
index 3d9e7aacbfbf..cb2d5a63a19f 100644
--- a/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
+++ b/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
@@ -46,3 +46,4 @@ WebAssemblySubtarget::WebAssemblySubtarget(const Triple &TT,
TLInfo(TM, *this) {}
bool WebAssemblySubtarget::enableMachineScheduler() const { return true; }
+bool WebAssemblySubtarget::useAA() const { return true; }
diff --git a/lib/Target/WebAssembly/WebAssemblySubtarget.h b/lib/Target/WebAssembly/WebAssemblySubtarget.h
index 6f1761940930..f530a290fa0e 100644
--- a/lib/Target/WebAssembly/WebAssemblySubtarget.h
+++ b/lib/Target/WebAssembly/WebAssemblySubtarget.h
@@ -61,9 +61,15 @@ public:
const WebAssemblyTargetLowering *getTargetLowering() const override {
return &TLInfo;
}
+ const WebAssemblyInstrInfo *getInstrInfo() const override {
+ return &InstrInfo;
+ }
+ const WebAssemblyRegisterInfo *getRegisterInfo() const override {
+ return &getInstrInfo()->getRegisterInfo();
+ }
const Triple &getTargetTriple() const { return TargetTriple; }
bool enableMachineScheduler() const override;
- bool useAA() const override { return true; }
+ bool useAA() const override;
// Predicates used by WebAssemblyInstrInfo.td.
bool hasAddr64() const { return TargetTriple.isArch64Bit(); }
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index 6f93248bd13c..e31ea46de9f5 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -45,11 +45,16 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine(
const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL)
- : LLVMTargetMachine(T, TT.isArch64Bit()
- ? "e-p:64:64-i64:64-v128:8:128-n32:64-S128"
- : "e-p:32:32-i64:64-v128:8:128-n32:64-S128",
+ : LLVMTargetMachine(T, TT.isArch64Bit() ? "e-p:64:64-i64:64-n32:64-S128"
+ : "e-p:32:32-i64:64-n32:64-S128",
TT, CPU, FS, Options, RM, CM, OL),
TLOF(make_unique<WebAssemblyTargetObjectFile>()) {
+ // WebAssembly type-checks expressions, but a noreturn function with a return
+ // type that doesn't match the context will cause a check failure. So we lower
+ // LLVM 'unreachable' to ISD::TRAP and then lower that to WebAssembly's
+ // 'unreachable' expression which is meant for that case.
+ this->Options.TrapUnreachable = true;
+
initAsmInfo();
// We need a reducible CFG, so disable some optimizations which tend to
@@ -77,7 +82,7 @@ WebAssemblyTargetMachine::getSubtargetImpl(const Function &F) const {
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
- I = make_unique<WebAssemblySubtarget>(TargetTriple, CPU, FS, *this);
+ I = llvm::make_unique<WebAssemblySubtarget>(TargetTriple, CPU, FS, *this);
}
return I.get();
}
@@ -94,23 +99,18 @@ public:
}
FunctionPass *createTargetRegisterAllocator(bool) override;
- void addFastRegAlloc(FunctionPass *RegAllocPass) override;
- void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
void addIRPasses() override;
- bool addPreISel() override;
bool addInstSelector() override;
bool addILPOpts() override;
void addPreRegAlloc() override;
- void addRegAllocPasses(bool Optimized);
void addPostRegAlloc() override;
- void addPreSched2() override;
void addPreEmitPass() override;
};
} // end anonymous namespace
TargetIRAnalysis WebAssemblyTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](Function &F) {
+ return TargetIRAnalysis([this](const Function &F) {
return TargetTransformInfo(WebAssemblyTTIImpl(this, F));
});
}
@@ -124,50 +124,86 @@ FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) {
return nullptr; // No reg alloc
}
-void WebAssemblyPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
- assert(!RegAllocPass && "WebAssembly uses no regalloc!");
- addRegAllocPasses(false);
-}
-
-void WebAssemblyPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
- assert(!RegAllocPass && "WebAssembly uses no regalloc!");
- addRegAllocPasses(true);
-}
-
//===----------------------------------------------------------------------===//
// The following functions are called from lib/CodeGen/Passes.cpp to modify
// the CodeGen pass sequence.
//===----------------------------------------------------------------------===//
void WebAssemblyPassConfig::addIRPasses() {
- // FIXME: the default for this option is currently POSIX, whereas
- // WebAssembly's MVP should default to Single.
if (TM->Options.ThreadModel == ThreadModel::Single)
+ // In "single" mode, atomics get lowered to non-atomics.
addPass(createLowerAtomicPass());
else
// Expand some atomic operations. WebAssemblyTargetLowering has hooks which
// control specifically what gets lowered.
addPass(createAtomicExpandPass(TM));
+ // Optimize "returned" function attributes.
+ addPass(createWebAssemblyOptimizeReturned());
+
TargetPassConfig::addIRPasses();
}
-bool WebAssemblyPassConfig::addPreISel() { return false; }
-
bool WebAssemblyPassConfig::addInstSelector() {
+ (void)TargetPassConfig::addInstSelector();
addPass(
createWebAssemblyISelDag(getWebAssemblyTargetMachine(), getOptLevel()));
+ // Run the argument-move pass immediately after the ScheduleDAG scheduler
+ // so that we can fix up the ARGUMENT instructions before anything else
+ // sees them in the wrong place.
+ addPass(createWebAssemblyArgumentMove());
return false;
}
-bool WebAssemblyPassConfig::addILPOpts() { return true; }
+bool WebAssemblyPassConfig::addILPOpts() {
+ (void)TargetPassConfig::addILPOpts();
+ return true;
+}
+
+void WebAssemblyPassConfig::addPreRegAlloc() {
+ TargetPassConfig::addPreRegAlloc();
-void WebAssemblyPassConfig::addPreRegAlloc() {}
+ // Prepare store instructions for register stackifying.
+ addPass(createWebAssemblyStoreResults());
+}
-void WebAssemblyPassConfig::addRegAllocPasses(bool Optimized) {}
+void WebAssemblyPassConfig::addPostRegAlloc() {
+ // TODO: The following CodeGen passes don't currently support code containing
+ // virtual registers. Consider removing their restrictions and re-enabling
+ // them.
+ //
+ // We use our own PrologEpilogInserter which is very slightly modified to
+ // tolerate virtual registers.
+ disablePass(&PrologEpilogCodeInserterID);
+ // Fails with: should be run after register allocation.
+ disablePass(&MachineCopyPropagationID);
+
+ // Mark registers as representing wasm's expression stack.
+ addPass(createWebAssemblyRegStackify());
+
+ // Run the register coloring pass to reduce the total number of registers.
+ addPass(createWebAssemblyRegColoring());
+
+ TargetPassConfig::addPostRegAlloc();
+
+ // Run WebAssembly's version of the PrologEpilogInserter. Target-independent
+ // PEI runs after PostRegAlloc and after ShrinkWrap. Putting it here will run
+ // PEI before ShrinkWrap but otherwise in the same position in the order.
+ addPass(createWebAssemblyPEI());
+}
-void WebAssemblyPassConfig::addPostRegAlloc() {}
+void WebAssemblyPassConfig::addPreEmitPass() {
+ TargetPassConfig::addPreEmitPass();
-void WebAssemblyPassConfig::addPreSched2() {}
+ // Put the CFG in structured form; insert BLOCK and LOOP markers.
+ addPass(createWebAssemblyCFGStackify());
-void WebAssemblyPassConfig::addPreEmitPass() {}
+ // Lower br_unless into br_if.
+ addPass(createWebAssemblyLowerBrUnless());
+
+ // Create a mapping from LLVM CodeGen virtual registers to wasm registers.
+ addPass(createWebAssemblyRegNumbering());
+
+ // Perform the very last peephole optimizations on the code.
+ addPass(createWebAssemblyPeephole());
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp b/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp
new file mode 100644
index 000000000000..74e33b93e00d
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp
@@ -0,0 +1,24 @@
+//===-- WebAssemblyTargetObjectFile.cpp - WebAssembly Object Info ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file defines the functions of the WebAssembly-specific subclass
+/// of TargetLoweringObjectFile.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyTargetObjectFile.h"
+#include "WebAssemblyTargetMachine.h"
+using namespace llvm;
+
+void WebAssemblyTargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h b/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h
index ee78b945ada2..39e50c9c575d 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h
+++ b/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h
@@ -16,50 +16,13 @@
#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYTARGETOBJECTFILE_H
#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYTARGETOBJECTFILE_H
-#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
namespace llvm {
-class GlobalVariable;
-
-class WebAssemblyTargetObjectFile final : public TargetLoweringObjectFile {
+class WebAssemblyTargetObjectFile final : public TargetLoweringObjectFileELF {
public:
- WebAssemblyTargetObjectFile() {
- TextSection = nullptr;
- DataSection = nullptr;
- BSSSection = nullptr;
- ReadOnlySection = nullptr;
-
- StaticCtorSection = nullptr;
- StaticDtorSection = nullptr;
- LSDASection = nullptr;
- EHFrameSection = nullptr;
- DwarfAbbrevSection = nullptr;
- DwarfInfoSection = nullptr;
- DwarfLineSection = nullptr;
- DwarfFrameSection = nullptr;
- DwarfPubTypesSection = nullptr;
- DwarfDebugInlineSection = nullptr;
- DwarfStrSection = nullptr;
- DwarfLocSection = nullptr;
- DwarfARangesSection = nullptr;
- DwarfRangesSection = nullptr;
- }
-
- MCSection *getSectionForConstant(SectionKind Kind,
- const Constant *C) const override {
- return ReadOnlySection;
- }
-
- MCSection *getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
- Mangler &Mang,
- const TargetMachine &TM) const override {
- return DataSection;
- }
-
- MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
- Mangler &Mang,
- const TargetMachine &TM) const override;
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
};
} // end namespace llvm
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index fa88ed526df2..356631711921 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -21,8 +21,7 @@ using namespace llvm;
#define DEBUG_TYPE "wasmtti"
TargetTransformInfo::PopcntSupportKind
-WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) {
+WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) const {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
- // TODO: Make Math.popcount32 happen in WebAssembly.
- return TTI::PSK_Software;
+ return TargetTransformInfo::PSK_FastHardware;
}
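The PSK_Software to PSK_FastHardware switch above is what lets bit-counting idioms be rewritten as ctpop for WebAssembly: mid-end passes query this hook before forming @llvm.ctpop. A minimal sketch of the consumer side (the real call site is in LoopIdiomRecognize; the wrapper function here is hypothetical):

    // Sketch only: shouldFormPopcount() is a hypothetical wrapper around the
    // query that LoopIdiomRecognize performs before forming @llvm.ctpop.
    #include "llvm/Analysis/TargetTransformInfo.h"
    static bool shouldFormPopcount(const llvm::TargetTransformInfo &TTI) {
      return TTI.getPopcntSupport(32) ==
             llvm::TargetTransformInfo::PSK_FastHardware;
    }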
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index 7ffb6047b963..26dc388cc922 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -38,7 +38,7 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> {
const WebAssemblyTargetLowering *getTLI() const { return TLI; }
public:
- WebAssemblyTTIImpl(const WebAssemblyTargetMachine *TM, Function &F)
+ WebAssemblyTTIImpl(const WebAssemblyTargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
@@ -54,7 +54,7 @@ public:
// TODO: Implement more Scalar TTI for WebAssembly
- TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
+ TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const;
/// @}
diff --git a/lib/Target/WebAssembly/known_gcc_test_failures.txt b/lib/Target/WebAssembly/known_gcc_test_failures.txt
new file mode 100644
index 000000000000..ee9d060f339e
--- /dev/null
+++ b/lib/Target/WebAssembly/known_gcc_test_failures.txt
@@ -0,0 +1,311 @@
+# Tests which are known to fail from the GCC torture test suite.
+
+# Core dump.
+920908-1.c
+pr38151.c
+va-arg-22.c
+
+# TargetRegisterInfo.h:315: static unsigned int llvm::TargetRegisterInfo::virtReg2Index(unsigned int): Assertion `isVirtualRegister(Reg) && "Not a virtual register"' failed.
+struct-ret-1.c
+va-arg-11.c
+va-arg-21.c
+va-arg-24.c
+va-arg-trap-1.c
+
+# WebAssemblyCFGStackify.cpp:211: void SortBlocks(llvm::MachineFunction&, const llvm::MachineLoopInfo&): Assertion `L->contains( MLI.getLoopFor(&*prev(MachineFunction::iterator(&MBB)))) && "Loop isn't contiguous"' failed.
+20000815-1.c
+20010129-1.c
+930628-1.c
+980707-1.c
+
+# WebAssemblyISelLowering.cpp:316: virtual llvm::SDValue llvm::WebAssemblyTargetLowering::LowerCall(llvm::TargetLowering::CallLoweringInfo&, llvm::SmallVectorImpl<llvm::SDValue>&) const: Assertion `!Out.Flags.isByVal() && "byval is not valid for return values"' failed.
+20030914-2.c
+20040703-1.c
+20081117-1.c
+920625-1.c
+931004-11.c
+931004-13.c
+980223.c
+bitfld-5.c
+complex-7.c
+pr38969.c
+pr51323.c
+pr52129.c
+pr57130.c
+
+# These were previously "Cannot select FrameIndex." Now most of them fail
+# because they contain call frame pseudos (e.g. call a vararg func),
+# frame pointers, or similar. This list will be updated again soon.
+20000519-1.c
+20000706-4.c
+20000706-5.c
+20000801-2.c
+20000801-4.c
+20011126-2.c
+
+20020529-1.c
+20021024-1.c
+
+20030828-1.c
+20030914-1.c
+
+20040302-1.c
+20040625-1.c
+20040823-1.c
+
+20041113-1.c
+
+20041214-1.c
+
+20050826-2.c
+
+20071213-1.c
+
+20080506-2.c
+20080519-1.c
+
+20081103-1.c
+20090113-1.c
+20090113-2.c
+20090113-3.c
+
+20090623-1.c
+
+920501-6.c
+920501-8.c
+920726-1.c
+930518-1.c
+
+931004-10.c
+931004-12.c
+931004-14.c
+931004-2.c
+931004-4.c
+931004-6.c
+931004-8.c
+
+980205.c
+980608-1.c
+980709-1.c
+980716-1.c
+990127-1.c
+
+991216-2.c
+
+#cbrt.c
+complex-5.c
+complex-6.c
+
+enum-3.c
+fprintf-chk-1.c
+frame-address.c
+loop-15.c
+loop-ivopts-2.c
+mayalias-3.c
+
+multi-ix.c
+
+pr20466-1.c
+
+
+pr28778.c
+pr28982b.c
+
+pr30778.c
+pr31448-2.c
+pr31448.c
+
+pr33870-1.c
+pr33870.c
+
+pr38051.c
+
+pr39100.c
+
+pr39339.c
+pr40022.c
+pr40657.c
+
+pr43987.c
+
+pr44575.c
+
+pr44942.c
+pr46309.c
+pr47538.c
+pr47925.c
+
+pr49390.c
+pr49419.c
+
+#pr51877.c
+
+#pr52979-1.c
+#pr52979-2.c
+pr53645-2.c
+pr53645.c
+
+pr56205.c
+
+pr56866.c
+
+pr57876.c
+pr58277-1.c
+
+pr59643.c
+
+printf-chk-1.c
+pta-field-1.c
+pta-field-2.c
+
+stdarg-1.c
+stdarg-2.c
+stdarg-3.c
+stdarg-4.c
+strct-stdarg-1.c
+strct-varg-1.c
+
+va-arg-1.c
+va-arg-10.c
+va-arg-12.c
+va-arg-13.c
+va-arg-14.c
+va-arg-15.c
+va-arg-16.c
+va-arg-17.c
+va-arg-18.c
+va-arg-19.c
+va-arg-2.c
+va-arg-20.c
+va-arg-23.c
+va-arg-26.c
+va-arg-4.c
+va-arg-5.c
+va-arg-6.c
+va-arg-7.c
+va-arg-8.c
+va-arg-9.c
+va-arg-pack-1.c
+vfprintf-1.c
+vfprintf-chk-1.c
+vprintf-1.c
+vprintf-chk-1.c
+
+# Cannot select callseq_end.
+20040811-1.c
+pr43220.c
+vla-dealloc-1.c
+
+# Cannot select brind.
+20071210-1.c
+920501-4.c
+920501-5.c
+
+# Cannot select BlockAddress.
+comp-goto-1.c
+980526-1.c
+990208-1.c
+
+# WebAssembly hasn't implemented byval arguments.
+20000412-3.c
+20000419-1.c
+20000706-1.c
+20000706-2.c
+20000707-1.c
+20000717-1.c
+20000717-5.c
+20000808-1.c
+20010605-2.c
+20011113-1.c
+20020215-1.c
+20020810-1.c
+20021118-1.c
+20040707-1.c
+20040709-1.c
+20040709-2.c
+20041201-1.c
+20050713-1.c
+20070614-1.c
+920908-2.c
+921112-1.c
+921117-1.c
+921123-2.c
+921204-1.c
+930126-1.c
+930208-1.c
+931004-5.c
+931004-9.c
+931031-1.c
+950607-2.c
+960416-1.c
+990525-1.c
+991118-1.c
+bf64-1.c
+complex-1.c
+complex-2.c
+pr15262-2.c
+pr20621-1.c
+pr23135.c
+pr30185.c
+pr42248.c
+
+# unimplemented operation lowering.
+20010122-1.c
+20030323-1.c
+20030811-1.c
+pr17377.c
+
+# Error: invalid output constraint '=t' in asm.
+990413-2.c
+990826-0.c
+
+# Error: __builtin_setjmp / __builtin_longjmp is not supported for the current target.
+built-in-setjmp.c
+pr60003.c
+
+# Error in the program / unsupported by Clang.
+scal-to-vec1.c
+scal-to-vec2.c
+scal-to-vec3.c
+20000822-1.c
+20010209-1.c
+20010605-1.c
+20030501-1.c
+20040520-1.c
+20061220-1.c
+20090219-1.c
+920415-1.c
+920428-2.c
+920501-7.c
+920612-2.c
+920721-4.c
+921017-1.c
+921215-1.c
+931002-1.c
+comp-goto-2.c
+nest-align-1.c
+nest-stdar-1.c
+nestfunc-1.c
+nestfunc-2.c
+nestfunc-3.c
+nestfunc-5.c
+nestfunc-6.c
+nestfunc-7.c
+pr22061-3.c
+pr22061-4.c
+pr24135.c
+pr51447.c
+20020412-1.c
+20040308-1.c
+20040423-1.c
+20041218-2.c
+20070919-1.c
+align-nest.c
+pr41935.c
+20050107-1.c
+20050119-1.c
+20050119-2.c
+920302-1.c
+920501-3.c
+920728-1.c
+pr28865.c
diff --git a/lib/Target/X86/AsmParser/CMakeLists.txt b/lib/Target/X86/AsmParser/CMakeLists.txt
index 2c1926edc26b..b022a41b192f 100644
--- a/lib/Target/X86/AsmParser/CMakeLists.txt
+++ b/lib/Target/X86/AsmParser/CMakeLists.txt
@@ -1,7 +1,4 @@
add_llvm_library(LLVMX86AsmParser
X86AsmInstrumentation.cpp
X86AsmParser.cpp
-
- LINK_LIBS
- LLVMX86CodeGen
)
diff --git a/lib/Target/X86/AsmParser/LLVMBuild.txt b/lib/Target/X86/AsmParser/LLVMBuild.txt
index 284bfd052ccf..9f94d5d38864 100644
--- a/lib/Target/X86/AsmParser/LLVMBuild.txt
+++ b/lib/Target/X86/AsmParser/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = X86AsmParser
parent = X86
-required_libraries = MC MCParser Support X86CodeGen X86Desc X86Info
+required_libraries = MC MCParser Support X86Desc X86Info
add_to_library_groups = X86
diff --git a/lib/Target/X86/AsmParser/Makefile b/lib/Target/X86/AsmParser/Makefile
index fb9760796622..f834dfc300a1 100644
--- a/lib/Target/X86/AsmParser/Makefile
+++ b/lib/Target/X86/AsmParser/Makefile
@@ -9,7 +9,7 @@
LEVEL = ../../../..
LIBRARYNAME = LLVMX86AsmParser
-# Hack: we need to include 'main' x86 target directory to grab private headers
+# Hack: we need to include 'main' X86 target directory to grab private headers
CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
include $(LEVEL)/Makefile.common
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
index 9eee4a0f3d82..09cc53a8e6d3 100644
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
@@ -10,10 +10,8 @@
#include "MCTargetDesc/X86BaseInfo.h"
#include "X86AsmInstrumentation.h"
#include "X86Operand.h"
-#include "X86RegisterInfo.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInst.h"
@@ -118,11 +116,6 @@ bool IsStackReg(unsigned Reg) { return Reg == X86::RSP || Reg == X86::ESP; }
bool IsSmallMemAccess(unsigned AccessSize) { return AccessSize < 8; }
-std::string FuncName(unsigned AccessSize, bool IsWrite) {
- return std::string("__asan_report_") + (IsWrite ? "store" : "load") +
- utostr(AccessSize);
-}
-
class X86AddressSanitizer : public X86AsmInstrumentation {
public:
struct RegisterContext {
@@ -136,26 +129,26 @@ public:
public:
RegisterContext(unsigned AddressReg, unsigned ShadowReg,
unsigned ScratchReg) {
- BusyRegs.push_back(convReg(AddressReg, MVT::i64));
- BusyRegs.push_back(convReg(ShadowReg, MVT::i64));
- BusyRegs.push_back(convReg(ScratchReg, MVT::i64));
+ BusyRegs.push_back(convReg(AddressReg, 64));
+ BusyRegs.push_back(convReg(ShadowReg, 64));
+ BusyRegs.push_back(convReg(ScratchReg, 64));
}
- unsigned AddressReg(MVT::SimpleValueType VT) const {
- return convReg(BusyRegs[REG_OFFSET_ADDRESS], VT);
+ unsigned AddressReg(unsigned Size) const {
+ return convReg(BusyRegs[REG_OFFSET_ADDRESS], Size);
}
- unsigned ShadowReg(MVT::SimpleValueType VT) const {
- return convReg(BusyRegs[REG_OFFSET_SHADOW], VT);
+ unsigned ShadowReg(unsigned Size) const {
+ return convReg(BusyRegs[REG_OFFSET_SHADOW], Size);
}
- unsigned ScratchReg(MVT::SimpleValueType VT) const {
- return convReg(BusyRegs[REG_OFFSET_SCRATCH], VT);
+ unsigned ScratchReg(unsigned Size) const {
+ return convReg(BusyRegs[REG_OFFSET_SCRATCH], Size);
}
void AddBusyReg(unsigned Reg) {
if (Reg != X86::NoRegister)
- BusyRegs.push_back(convReg(Reg, MVT::i64));
+ BusyRegs.push_back(convReg(Reg, 64));
}
void AddBusyRegs(const X86Operand &Op) {
@@ -163,36 +156,36 @@ public:
AddBusyReg(Op.getMemIndexReg());
}
- unsigned ChooseFrameReg(MVT::SimpleValueType VT) const {
+ unsigned ChooseFrameReg(unsigned Size) const {
static const MCPhysReg Candidates[] = { X86::RBP, X86::RAX, X86::RBX,
X86::RCX, X86::RDX, X86::RDI,
X86::RSI };
for (unsigned Reg : Candidates) {
if (!std::count(BusyRegs.begin(), BusyRegs.end(), Reg))
- return convReg(Reg, VT);
+ return convReg(Reg, Size);
}
return X86::NoRegister;
}
private:
- unsigned convReg(unsigned Reg, MVT::SimpleValueType VT) const {
- return Reg == X86::NoRegister ? Reg : getX86SubSuperRegister(Reg, VT);
+ unsigned convReg(unsigned Reg, unsigned Size) const {
+ return Reg == X86::NoRegister ? Reg : getX86SubSuperRegister(Reg, Size);
}
std::vector<unsigned> BusyRegs;
};
- X86AddressSanitizer(const MCSubtargetInfo &STI)
+ X86AddressSanitizer(const MCSubtargetInfo *&STI)
: X86AsmInstrumentation(STI), RepPrefix(false), OrigSPOffset(0) {}
- virtual ~X86AddressSanitizer() {}
+ ~X86AddressSanitizer() override {}
// X86AsmInstrumentation implementation:
- virtual void InstrumentAndEmitInstruction(const MCInst &Inst,
- OperandVector &Operands,
- MCContext &Ctx,
- const MCInstrInfo &MII,
- MCStreamer &Out) override {
+ void InstrumentAndEmitInstruction(const MCInst &Inst,
+ OperandVector &Operands,
+ MCContext &Ctx,
+ const MCInstrInfo &MII,
+ MCStreamer &Out) override {
InstrumentMOVS(Inst, Operands, Ctx, MII, Out);
if (RepPrefix)
EmitInstruction(Out, MCInstBuilder(X86::REP_PREFIX));
@@ -240,17 +233,16 @@ public:
protected:
void EmitLabel(MCStreamer &Out, MCSymbol *Label) { Out.EmitLabel(Label); }
- void EmitLEA(X86Operand &Op, MVT::SimpleValueType VT, unsigned Reg,
- MCStreamer &Out) {
- assert(VT == MVT::i32 || VT == MVT::i64);
+ void EmitLEA(X86Operand &Op, unsigned Size, unsigned Reg, MCStreamer &Out) {
+ assert(Size == 32 || Size == 64);
MCInst Inst;
- Inst.setOpcode(VT == MVT::i32 ? X86::LEA32r : X86::LEA64r);
- Inst.addOperand(MCOperand::createReg(getX86SubSuperRegister(Reg, VT)));
+ Inst.setOpcode(Size == 32 ? X86::LEA32r : X86::LEA64r);
+ Inst.addOperand(MCOperand::createReg(getX86SubSuperRegister(Reg, Size)));
Op.addMemOperands(Inst, 5);
EmitInstruction(Out, Inst);
}
- void ComputeMemOperandAddress(X86Operand &Op, MVT::SimpleValueType VT,
+ void ComputeMemOperandAddress(X86Operand &Op, unsigned Size,
unsigned Reg, MCContext &Ctx, MCStreamer &Out);
// Creates new memory operand with Displacement added to an original
@@ -261,13 +253,13 @@ protected:
MCContext &Ctx, int64_t *Residue);
bool is64BitMode() const {
- return STI.getFeatureBits()[X86::Mode64Bit];
+ return STI->getFeatureBits()[X86::Mode64Bit];
}
bool is32BitMode() const {
- return STI.getFeatureBits()[X86::Mode32Bit];
+ return STI->getFeatureBits()[X86::Mode32Bit];
}
bool is16BitMode() const {
- return STI.getFeatureBits()[X86::Mode16Bit];
+ return STI->getFeatureBits()[X86::Mode16Bit];
}
unsigned getPointerWidth() {
@@ -437,7 +429,7 @@ void X86AddressSanitizer::InstrumentMOV(const MCInst &Inst,
}
void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op,
- MVT::SimpleValueType VT,
+ unsigned Size,
unsigned Reg, MCContext &Ctx,
MCStreamer &Out) {
int64_t Displacement = 0;
@@ -450,14 +442,14 @@ void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op,
// Emit Op as is.
if (Displacement == 0) {
- EmitLEA(Op, VT, Reg, Out);
+ EmitLEA(Op, Size, Reg, Out);
return;
}
int64_t Residue;
std::unique_ptr<X86Operand> NewOp =
AddDisplacement(Op, Displacement, Ctx, &Residue);
- EmitLEA(*NewOp, VT, Reg, Out);
+ EmitLEA(*NewOp, Size, Reg, Out);
while (Residue != 0) {
const MCConstantExpr *Disp =
@@ -465,7 +457,7 @@ void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op,
std::unique_ptr<X86Operand> DispOp =
X86Operand::CreateMem(getPointerWidth(), 0, Disp, Reg, 0, 1, SMLoc(),
SMLoc());
- EmitLEA(*DispOp, VT, Reg, Out);
+ EmitLEA(*DispOp, Size, Reg, Out);
Residue -= Disp->getValue();
}
}
@@ -503,16 +495,16 @@ class X86AddressSanitizer32 : public X86AddressSanitizer {
public:
static const long kShadowOffset = 0x20000000;
- X86AddressSanitizer32(const MCSubtargetInfo &STI)
+ X86AddressSanitizer32(const MCSubtargetInfo *&STI)
: X86AddressSanitizer(STI) {}
- virtual ~X86AddressSanitizer32() {}
+ ~X86AddressSanitizer32() override {}
unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) {
unsigned FrameReg = GetFrameRegGeneric(Ctx, Out);
if (FrameReg == X86::NoRegister)
return FrameReg;
- return getX86SubSuperRegister(FrameReg, MVT::i32);
+ return getX86SubSuperRegister(FrameReg, 32);
}
void SpillReg(MCStreamer &Out, unsigned Reg) {
@@ -535,10 +527,10 @@ public:
OrigSPOffset += 4;
}
- virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) override {
- unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i32);
+ void InstrumentMemOperandPrologue(const RegisterContext &RegCtx,
+ MCContext &Ctx,
+ MCStreamer &Out) override {
+ unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32);
assert(LocalFrameReg != X86::NoRegister);
const MCRegisterInfo *MRI = Ctx.getRegisterInfo();
@@ -558,24 +550,24 @@ public:
MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */));
}
- SpillReg(Out, RegCtx.AddressReg(MVT::i32));
- SpillReg(Out, RegCtx.ShadowReg(MVT::i32));
- if (RegCtx.ScratchReg(MVT::i32) != X86::NoRegister)
- SpillReg(Out, RegCtx.ScratchReg(MVT::i32));
+ SpillReg(Out, RegCtx.AddressReg(32));
+ SpillReg(Out, RegCtx.ShadowReg(32));
+ if (RegCtx.ScratchReg(32) != X86::NoRegister)
+ SpillReg(Out, RegCtx.ScratchReg(32));
StoreFlags(Out);
}
- virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) override {
- unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i32);
+ void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx,
+ MCContext &Ctx,
+ MCStreamer &Out) override {
+ unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32);
assert(LocalFrameReg != X86::NoRegister);
RestoreFlags(Out);
- if (RegCtx.ScratchReg(MVT::i32) != X86::NoRegister)
- RestoreReg(Out, RegCtx.ScratchReg(MVT::i32));
- RestoreReg(Out, RegCtx.ShadowReg(MVT::i32));
- RestoreReg(Out, RegCtx.AddressReg(MVT::i32));
+ if (RegCtx.ScratchReg(32) != X86::NoRegister)
+ RestoreReg(Out, RegCtx.ScratchReg(32));
+ RestoreReg(Out, RegCtx.ShadowReg(32));
+ RestoreReg(Out, RegCtx.AddressReg(32));
unsigned FrameReg = GetFrameReg(Ctx, Out);
if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) {
@@ -586,18 +578,18 @@ public:
}
}
- virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize,
- bool IsWrite,
- const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) override;
- virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize,
- bool IsWrite,
- const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) override;
- virtual void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx,
- MCStreamer &Out) override;
+ void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize,
+ bool IsWrite,
+ const RegisterContext &RegCtx,
+ MCContext &Ctx,
+ MCStreamer &Out) override;
+ void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize,
+ bool IsWrite,
+ const RegisterContext &RegCtx,
+ MCContext &Ctx,
+ MCStreamer &Out) override;
+ void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx,
+ MCStreamer &Out) override;
private:
void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx,
@@ -610,10 +602,11 @@ private:
.addReg(X86::ESP)
.addImm(-16));
EmitInstruction(
- Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(MVT::i32)));
+ Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(32)));
- const std::string &Fn = FuncName(AccessSize, IsWrite);
- MCSymbol *FnSym = Ctx.getOrCreateSymbol(StringRef(Fn));
+ MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") +
+ (IsWrite ? "store" : "load") +
+ llvm::Twine(AccessSize));
const MCSymbolRefExpr *FnExpr =
MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr));
@@ -623,14 +616,14 @@ private:
void X86AddressSanitizer32::InstrumentMemOperandSmall(
X86Operand &Op, unsigned AccessSize, bool IsWrite,
const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
- unsigned AddressRegI32 = RegCtx.AddressReg(MVT::i32);
- unsigned ShadowRegI32 = RegCtx.ShadowReg(MVT::i32);
- unsigned ShadowRegI8 = RegCtx.ShadowReg(MVT::i8);
+ unsigned AddressRegI32 = RegCtx.AddressReg(32);
+ unsigned ShadowRegI32 = RegCtx.ShadowReg(32);
+ unsigned ShadowRegI8 = RegCtx.ShadowReg(8);
- assert(RegCtx.ScratchReg(MVT::i32) != X86::NoRegister);
- unsigned ScratchRegI32 = RegCtx.ScratchReg(MVT::i32);
+ assert(RegCtx.ScratchReg(32) != X86::NoRegister);
+ unsigned ScratchRegI32 = RegCtx.ScratchReg(32);
- ComputeMemOperandAddress(Op, MVT::i32, AddressRegI32, Ctx, Out);
+ ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out);
EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg(
AddressRegI32));
@@ -673,7 +666,7 @@ void X86AddressSanitizer32::InstrumentMemOperandSmall(
std::unique_ptr<X86Operand> Op(
X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1,
SMLoc(), SMLoc()));
- EmitLEA(*Op, MVT::i32, ScratchRegI32, Out);
+ EmitLEA(*Op, 32, ScratchRegI32, Out);
break;
}
case 4:
@@ -698,10 +691,10 @@ void X86AddressSanitizer32::InstrumentMemOperandSmall(
void X86AddressSanitizer32::InstrumentMemOperandLarge(
X86Operand &Op, unsigned AccessSize, bool IsWrite,
const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
- unsigned AddressRegI32 = RegCtx.AddressReg(MVT::i32);
- unsigned ShadowRegI32 = RegCtx.ShadowReg(MVT::i32);
+ unsigned AddressRegI32 = RegCtx.AddressReg(32);
+ unsigned ShadowRegI32 = RegCtx.ShadowReg(32);
- ComputeMemOperandAddress(Op, MVT::i32, AddressRegI32, Ctx, Out);
+ ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out);
EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg(
AddressRegI32));
@@ -760,16 +753,16 @@ class X86AddressSanitizer64 : public X86AddressSanitizer {
public:
static const long kShadowOffset = 0x7fff8000;
- X86AddressSanitizer64(const MCSubtargetInfo &STI)
+ X86AddressSanitizer64(const MCSubtargetInfo *&STI)
: X86AddressSanitizer(STI) {}
- virtual ~X86AddressSanitizer64() {}
+ ~X86AddressSanitizer64() override {}
unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) {
unsigned FrameReg = GetFrameRegGeneric(Ctx, Out);
if (FrameReg == X86::NoRegister)
return FrameReg;
- return getX86SubSuperRegister(FrameReg, MVT::i64);
+ return getX86SubSuperRegister(FrameReg, 64);
}
void SpillReg(MCStreamer &Out, unsigned Reg) {
@@ -792,10 +785,10 @@ public:
OrigSPOffset += 8;
}
- virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) override {
- unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i64);
+ void InstrumentMemOperandPrologue(const RegisterContext &RegCtx,
+ MCContext &Ctx,
+ MCStreamer &Out) override {
+ unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64);
assert(LocalFrameReg != X86::NoRegister);
const MCRegisterInfo *MRI = Ctx.getRegisterInfo();
@@ -816,24 +809,24 @@ public:
}
EmitAdjustRSP(Ctx, Out, -128);
- SpillReg(Out, RegCtx.ShadowReg(MVT::i64));
- SpillReg(Out, RegCtx.AddressReg(MVT::i64));
- if (RegCtx.ScratchReg(MVT::i64) != X86::NoRegister)
- SpillReg(Out, RegCtx.ScratchReg(MVT::i64));
+ SpillReg(Out, RegCtx.ShadowReg(64));
+ SpillReg(Out, RegCtx.AddressReg(64));
+ if (RegCtx.ScratchReg(64) != X86::NoRegister)
+ SpillReg(Out, RegCtx.ScratchReg(64));
StoreFlags(Out);
}
- virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) override {
- unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i64);
+ void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx,
+ MCContext &Ctx,
+ MCStreamer &Out) override {
+ unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64);
assert(LocalFrameReg != X86::NoRegister);
RestoreFlags(Out);
- if (RegCtx.ScratchReg(MVT::i64) != X86::NoRegister)
- RestoreReg(Out, RegCtx.ScratchReg(MVT::i64));
- RestoreReg(Out, RegCtx.AddressReg(MVT::i64));
- RestoreReg(Out, RegCtx.ShadowReg(MVT::i64));
+ if (RegCtx.ScratchReg(64) != X86::NoRegister)
+ RestoreReg(Out, RegCtx.ScratchReg(64));
+ RestoreReg(Out, RegCtx.AddressReg(64));
+ RestoreReg(Out, RegCtx.ShadowReg(64));
EmitAdjustRSP(Ctx, Out, 128);
unsigned FrameReg = GetFrameReg(Ctx, Out);
@@ -845,18 +838,18 @@ public:
}
}
- virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize,
- bool IsWrite,
- const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) override;
- virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize,
- bool IsWrite,
- const RegisterContext &RegCtx,
- MCContext &Ctx,
- MCStreamer &Out) override;
- virtual void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx,
- MCStreamer &Out) override;
+ void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize,
+ bool IsWrite,
+ const RegisterContext &RegCtx,
+ MCContext &Ctx,
+ MCStreamer &Out) override;
+ void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize,
+ bool IsWrite,
+ const RegisterContext &RegCtx,
+ MCContext &Ctx,
+ MCStreamer &Out) override;
+ void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx,
+ MCStreamer &Out) override;
private:
void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) {
@@ -864,7 +857,7 @@ private:
std::unique_ptr<X86Operand> Op(
X86Operand::CreateMem(getPointerWidth(), 0, Disp, X86::RSP, 0, 1,
SMLoc(), SMLoc()));
- EmitLEA(*Op, MVT::i64, X86::RSP, Out);
+ EmitLEA(*Op, 64, X86::RSP, Out);
OrigSPOffset += Offset;
}
@@ -878,12 +871,13 @@ private:
.addReg(X86::RSP)
.addImm(-16));
- if (RegCtx.AddressReg(MVT::i64) != X86::RDI) {
+ if (RegCtx.AddressReg(64) != X86::RDI) {
EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RDI).addReg(
- RegCtx.AddressReg(MVT::i64)));
+ RegCtx.AddressReg(64)));
}
- const std::string &Fn = FuncName(AccessSize, IsWrite);
- MCSymbol *FnSym = Ctx.getOrCreateSymbol(StringRef(Fn));
+ MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") +
+ (IsWrite ? "store" : "load") +
+ llvm::Twine(AccessSize));
const MCSymbolRefExpr *FnExpr =
MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr));
@@ -893,16 +887,16 @@ private:
void X86AddressSanitizer64::InstrumentMemOperandSmall(
X86Operand &Op, unsigned AccessSize, bool IsWrite,
const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
- unsigned AddressRegI64 = RegCtx.AddressReg(MVT::i64);
- unsigned AddressRegI32 = RegCtx.AddressReg(MVT::i32);
- unsigned ShadowRegI64 = RegCtx.ShadowReg(MVT::i64);
- unsigned ShadowRegI32 = RegCtx.ShadowReg(MVT::i32);
- unsigned ShadowRegI8 = RegCtx.ShadowReg(MVT::i8);
+ unsigned AddressRegI64 = RegCtx.AddressReg(64);
+ unsigned AddressRegI32 = RegCtx.AddressReg(32);
+ unsigned ShadowRegI64 = RegCtx.ShadowReg(64);
+ unsigned ShadowRegI32 = RegCtx.ShadowReg(32);
+ unsigned ShadowRegI8 = RegCtx.ShadowReg(8);
- assert(RegCtx.ScratchReg(MVT::i32) != X86::NoRegister);
- unsigned ScratchRegI32 = RegCtx.ScratchReg(MVT::i32);
+ assert(RegCtx.ScratchReg(32) != X86::NoRegister);
+ unsigned ScratchRegI32 = RegCtx.ScratchReg(32);
- ComputeMemOperandAddress(Op, MVT::i64, AddressRegI64, Ctx, Out);
+ ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out);
EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg(
AddressRegI64));
@@ -944,7 +938,7 @@ void X86AddressSanitizer64::InstrumentMemOperandSmall(
std::unique_ptr<X86Operand> Op(
X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1,
SMLoc(), SMLoc()));
- EmitLEA(*Op, MVT::i32, ScratchRegI32, Out);
+ EmitLEA(*Op, 32, ScratchRegI32, Out);
break;
}
case 4:
@@ -969,10 +963,10 @@ void X86AddressSanitizer64::InstrumentMemOperandSmall(
void X86AddressSanitizer64::InstrumentMemOperandLarge(
X86Operand &Op, unsigned AccessSize, bool IsWrite,
const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
- unsigned AddressRegI64 = RegCtx.AddressReg(MVT::i64);
- unsigned ShadowRegI64 = RegCtx.ShadowReg(MVT::i64);
+ unsigned AddressRegI64 = RegCtx.AddressReg(64);
+ unsigned ShadowRegI64 = RegCtx.ShadowReg(64);
- ComputeMemOperandAddress(Op, MVT::i64, AddressRegI64, Ctx, Out);
+ ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out);
EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg(
AddressRegI64));
@@ -1030,7 +1024,7 @@ void X86AddressSanitizer64::InstrumentMOVSImpl(unsigned AccessSize,
} // End anonymous namespace
-X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo &STI)
+X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo *&STI)
: STI(STI), InitialFrameReg(0) {}
X86AsmInstrumentation::~X86AsmInstrumentation() {}
@@ -1043,7 +1037,7 @@ void X86AsmInstrumentation::InstrumentAndEmitInstruction(
void X86AsmInstrumentation::EmitInstruction(MCStreamer &Out,
const MCInst &Inst) {
- Out.EmitInstruction(Inst, STI);
+ Out.EmitInstruction(Inst, *STI);
}
unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx,
@@ -1067,17 +1061,17 @@ unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx,
X86AsmInstrumentation *
CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
- const MCContext &Ctx, const MCSubtargetInfo &STI) {
- Triple T(STI.getTargetTriple());
+ const MCContext &Ctx, const MCSubtargetInfo *&STI) {
+ Triple T(STI->getTargetTriple());
const bool hasCompilerRTSupport = T.isOSLinux();
if (ClAsanInstrumentAssembly && hasCompilerRTSupport &&
MCOptions.SanitizeAddress) {
- if (STI.getFeatureBits()[X86::Mode32Bit] != 0)
+ if (STI->getFeatureBits()[X86::Mode32Bit] != 0)
return new X86AddressSanitizer32(STI);
- if (STI.getFeatureBits()[X86::Mode64Bit] != 0)
+ if (STI->getFeatureBits()[X86::Mode64Bit] != 0)
return new X86AddressSanitizer64(STI);
}
return new X86AsmInstrumentation(STI);
}
-} // End llvm namespace
+} // end llvm namespace
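Two notes on the hunks above. First, the removed FuncName() helper is replaced by an llvm::Twine expression that builds the same runtime symbol names, e.g. __asan_report_load4 or __asan_report_store8. A small self-contained sketch of that shape (asanReportName() is a hypothetical helper, not part of the patch):

    // Sketch only: shows the names produced by the Twine expression used in
    // EmitCallAsanReport().
    #include "llvm/ADT/Twine.h"
    #include <string>
    static std::string asanReportName(bool IsWrite, unsigned AccessSize) {
      return (llvm::Twine("__asan_report_") + (IsWrite ? "store" : "load") +
              llvm::Twine(AccessSize)).str();
    }
    // asanReportName(false, 4) -> "__asan_report_load4"
    // asanReportName(true, 8)  -> "__asan_report_store8"

Second, the mechanical MVT::iN to N rewrites reflect getX86SubSuperRegister() now taking a register width in bits, which is what lets the assembler library drop the CodeGen-level MachineValueType header here and the LLVMX86CodeGen dependency in the build files above.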
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
index 19ebcc44f61e..470ceadb0aa6 100644
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
@@ -28,7 +28,8 @@ class X86AsmInstrumentation;
X86AsmInstrumentation *
CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
- const MCContext &Ctx, const MCSubtargetInfo &STI);
+ const MCContext &Ctx,
+ const MCSubtargetInfo *&STI);
class X86AsmInstrumentation {
public:
@@ -48,15 +49,16 @@ public:
protected:
friend X86AsmInstrumentation *
CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
- const MCContext &Ctx, const MCSubtargetInfo &STI);
+ const MCContext &Ctx,
+ const MCSubtargetInfo *&STI);
- X86AsmInstrumentation(const MCSubtargetInfo &STI);
+ X86AsmInstrumentation(const MCSubtargetInfo *&STI);
unsigned GetFrameRegGeneric(const MCContext &Ctx, MCStreamer &Out);
void EmitInstruction(MCStreamer &Out, const MCInst &Inst);
- const MCSubtargetInfo &STI;
+ const MCSubtargetInfo *&STI;
unsigned InitialFrameReg;
};
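The signature change from 'const MCSubtargetInfo &' to 'const MCSubtargetInfo *&' is the key to this file: the instrumentation now tracks the parser's current subtarget-info pointer, so when X86AsmParser::SwitchMode() calls copySTI() and the parser starts pointing at a fresh copy, the instrumentation observes the new mode bits automatically. A toy, self-contained illustration of why a reference to a pointer behaves that way (the types here are stand-ins, not LLVM API):

    // Toy example only: Info/Owner are stand-ins for MCSubtargetInfo and the
    // asm parser; they are not LLVM types.
    #include <cassert>
    struct Info { int Mode; };
    struct Owner {
      const Info *Cur;                      // reseated on a mode switch
      const Info *&view() { return Cur; }   // what the instrumentation holds
    };
    int main() {
      Info Bits16{16}, Bits64{64};
      Owner Parser{&Bits16};
      const Info *&STI = Parser.view();     // like X86AsmInstrumentation::STI
      Parser.Cur = &Bits64;                 // parser switches subtargets
      assert(STI->Mode == 64);              // the held reference follows it
      return 0;
    }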
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index bca059d8c383..4d8ffac1a82b 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -11,7 +11,6 @@
#include "X86AsmInstrumentation.h"
#include "X86AsmParserCommon.h"
#include "X86Operand.h"
-#include "X86ISelLowering.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
@@ -26,6 +25,7 @@
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
@@ -57,10 +57,10 @@ static const char OpPrecedence[] = {
};
class X86AsmParser : public MCTargetAsmParser {
- MCSubtargetInfo &STI;
const MCInstrInfo &MII;
ParseInstructionInfo *InstInfo;
std::unique_ptr<X86AsmInstrumentation> Instrumentation;
+
private:
SMLoc consumeToken() {
MCAsmParser &Parser = getParser();
@@ -154,6 +154,7 @@ private:
// Push the new operator.
InfixOperatorStack.push_back(Op);
}
+
int64_t execute() {
// Push any remaining operators onto the postfix stack.
while (!InfixOperatorStack.empty()) {
@@ -268,6 +269,7 @@ private:
bool StopOnLBrac, AddImmPrefix;
InfixCalculator IC;
InlineAsmIdentifierInfo Info;
+
public:
IntelExprStateMachine(int64_t imm, bool stoponlbrac, bool addimmprefix) :
State(IES_PLUS), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), TmpReg(0),
@@ -712,10 +714,10 @@ private:
SMLoc End, unsigned Size, StringRef Identifier,
InlineAsmIdentifierInfo &Info);
+ bool parseDirectiveEven(SMLoc L);
bool ParseDirectiveWord(unsigned Size, SMLoc L);
bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
- bool validateInstruction(MCInst &Inst, const OperandVector &Ops);
bool processInstruction(MCInst &Inst, const OperandVector &Ops);
/// Wrapper around MCStreamer::EmitInstruction(). Possibly adds
@@ -758,23 +760,24 @@ private:
bool is64BitMode() const {
// FIXME: Can tablegen auto-generate this?
- return STI.getFeatureBits()[X86::Mode64Bit];
+ return getSTI().getFeatureBits()[X86::Mode64Bit];
}
bool is32BitMode() const {
// FIXME: Can tablegen auto-generate this?
- return STI.getFeatureBits()[X86::Mode32Bit];
+ return getSTI().getFeatureBits()[X86::Mode32Bit];
}
bool is16BitMode() const {
// FIXME: Can tablegen auto-generate this?
- return STI.getFeatureBits()[X86::Mode16Bit];
+ return getSTI().getFeatureBits()[X86::Mode16Bit];
}
void SwitchMode(unsigned mode) {
+ MCSubtargetInfo &STI = copySTI();
FeatureBitset AllModes({X86::Mode64Bit, X86::Mode32Bit, X86::Mode16Bit});
FeatureBitset OldMode = STI.getFeatureBits() & AllModes;
unsigned FB = ComputeAvailableFeatures(
STI.ToggleFeature(OldMode.flip(mode)));
setAvailableFeatures(FB);
-
+
assert(FeatureBitset({mode}) == (STI.getFeatureBits() & AllModes));
}
@@ -798,12 +801,12 @@ private:
/// }
public:
- X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &Parser,
+ X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser,
const MCInstrInfo &mii, const MCTargetOptions &Options)
- : MCTargetAsmParser(), STI(sti), MII(mii), InstInfo(nullptr) {
+ : MCTargetAsmParser(Options, sti), MII(mii), InstInfo(nullptr) {
// Initialize the set of available features.
- setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+ setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
Instrumentation.reset(
CreateX86AsmInstrumentation(Options, Parser.getContext(), STI));
}
@@ -912,6 +915,11 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
if (RegNo == 0)
RegNo = MatchRegisterName(Tok.getString().lower());
+ // The "flags" register cannot be referenced directly.
+ // Treat it as an identifier instead.
+ if (isParsingInlineAsm() && isParsingIntelSyntax() && RegNo == X86::EFLAGS)
+ RegNo = 0;
+
if (!is64BitMode()) {
// FIXME: This should be done using Requires<Not64BitMode> and
// Requires<In64BitMode> so "eiz" usage in 64-bit instructions can be also
@@ -1042,8 +1050,11 @@ static unsigned getIntelMemOperandSize(StringRef OpStr) {
.Cases("BYTE", "byte", 8)
.Cases("WORD", "word", 16)
.Cases("DWORD", "dword", 32)
+ .Cases("FWORD", "fword", 48)
.Cases("QWORD", "qword", 64)
+ .Cases("MMWORD","mmword", 64)
.Cases("XWORD", "xword", 80)
+ .Cases("TBYTE", "tbyte", 80)
.Cases("XMMWORD", "xmmword", 128)
.Cases("YMMWORD", "ymmword", 256)
.Cases("ZMMWORD", "zmmword", 512)
@@ -1062,8 +1073,8 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm(
// Insert an explicit size if the user didn't have one.
if (!Size) {
Size = getPointerWidth();
- InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_SizeDirective, Start,
- /*Len=*/0, Size));
+ InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start,
+ /*Len=*/0, Size);
}
// Create an absolute memory reference in order to match against
@@ -1082,8 +1093,8 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm(
if (!Size) {
Size = Info.Type * 8; // Size is in terms of bits in this context.
if (Size)
- InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_SizeDirective, Start,
- /*Len=*/0, Size));
+ InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start,
+ /*Len=*/0, Size);
}
}
@@ -1097,13 +1108,13 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm(
}
static void
-RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites,
+RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> &AsmRewrites,
StringRef SymName, int64_t ImmDisp,
int64_t FinalImmDisp, SMLoc &BracLoc,
SMLoc &StartInBrac, SMLoc &End) {
// Remove the '[' and ']' from the IR string.
- AsmRewrites->push_back(AsmRewrite(AOK_Skip, BracLoc, 1));
- AsmRewrites->push_back(AsmRewrite(AOK_Skip, End, 1));
+ AsmRewrites.emplace_back(AOK_Skip, BracLoc, 1);
+ AsmRewrites.emplace_back(AOK_Skip, End, 1);
// If ImmDisp is non-zero, then we parsed a displacement before the
// bracketed expression (i.e., ImmDisp [ BaseReg + Scale*IndexReg + Disp])
@@ -1114,15 +1125,14 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites,
// We have an immediate displacement before the bracketed expression.
// Adjust this to match the final immediate displacement.
bool Found = false;
- for (SmallVectorImpl<AsmRewrite>::iterator I = AsmRewrites->begin(),
- E = AsmRewrites->end(); I != E; ++I) {
- if ((*I).Loc.getPointer() > BracLoc.getPointer())
+ for (AsmRewrite &AR : AsmRewrites) {
+ if (AR.Loc.getPointer() > BracLoc.getPointer())
continue;
- if ((*I).Kind == AOK_ImmPrefix || (*I).Kind == AOK_Imm) {
+ if (AR.Kind == AOK_ImmPrefix || AR.Kind == AOK_Imm) {
assert (!Found && "ImmDisp already rewritten.");
- (*I).Kind = AOK_Imm;
- (*I).Len = BracLoc.getPointer() - (*I).Loc.getPointer();
- (*I).Val = FinalImmDisp;
+ AR.Kind = AOK_Imm;
+ AR.Len = BracLoc.getPointer() - AR.Loc.getPointer();
+ AR.Val = FinalImmDisp;
Found = true;
break;
}
@@ -1133,28 +1143,27 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites,
// We have a symbolic and an immediate displacement, but no displacement
// before the bracketed expression. Put the immediate displacement
// before the bracketed expression.
- AsmRewrites->push_back(AsmRewrite(AOK_Imm, BracLoc, 0, FinalImmDisp));
+ AsmRewrites.emplace_back(AOK_Imm, BracLoc, 0, FinalImmDisp);
}
}
// Remove all the ImmPrefix rewrites within the brackets.
- for (SmallVectorImpl<AsmRewrite>::iterator I = AsmRewrites->begin(),
- E = AsmRewrites->end(); I != E; ++I) {
- if ((*I).Loc.getPointer() < StartInBrac.getPointer())
+ for (AsmRewrite &AR : AsmRewrites) {
+ if (AR.Loc.getPointer() < StartInBrac.getPointer())
continue;
- if ((*I).Kind == AOK_ImmPrefix)
- (*I).Kind = AOK_Delete;
+ if (AR.Kind == AOK_ImmPrefix)
+ AR.Kind = AOK_Delete;
}
const char *SymLocPtr = SymName.data();
// Skip everything before the symbol.
if (unsigned Len = SymLocPtr - StartInBrac.getPointer()) {
assert(Len > 0 && "Expected a non-negative length.");
- AsmRewrites->push_back(AsmRewrite(AOK_Skip, StartInBrac, Len));
+ AsmRewrites.emplace_back(AOK_Skip, StartInBrac, Len);
}
// Skip everything after the symbol.
if (unsigned Len = End.getPointer() - (SymLocPtr + SymName.size())) {
SMLoc Loc = SMLoc::getFromPointer(SymLocPtr + SymName.size());
assert(Len > 0 && "Expected a non-negative length.");
- AsmRewrites->push_back(AsmRewrite(AOK_Skip, Loc, Len));
+ AsmRewrites.emplace_back(AOK_Skip, Loc, Len);
}
}
@@ -1162,6 +1171,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
+ AsmToken::TokenKind PrevTK = AsmToken::Error;
bool Done = false;
while (!Done) {
bool UpdateLocLex = true;
@@ -1205,7 +1215,8 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
return Error(Tok.getLoc(), "Unexpected identifier!");
} else {
// This is a dot operator, not an adjacent identifier.
- if (Identifier.find('.') != StringRef::npos) {
+ if (Identifier.find('.') != StringRef::npos &&
+ PrevTK == AsmToken::RBrac) {
return false;
} else {
InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
@@ -1223,8 +1234,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
case AsmToken::Integer: {
StringRef ErrMsg;
if (isParsingInlineAsm() && SM.getAddImmPrefix())
- InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix,
- Tok.getLoc()));
+ InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Tok.getLoc());
// Look for 'b' or 'f' following an Integer as a directional label
SMLoc Loc = getTok().getLoc();
int64_t IntVal = getTok().getIntVal();
@@ -1237,7 +1247,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
getContext().getDirectionalLocalSymbol(IntVal, IDVal == "b");
MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
const MCExpr *Val =
- MCSymbolRefExpr::create(Sym, Variant, getContext());
+ MCSymbolRefExpr::create(Sym, Variant, getContext());
if (IDVal == "b" && Sym->isUndefined())
return Error(Loc, "invalid reference to undefined symbol");
StringRef Identifier = Sym->getName();
@@ -1275,6 +1285,8 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
if (!Done && UpdateLocLex)
End = consumeToken();
+
+ PrevTK = TK;
}
return false;
}
@@ -1302,7 +1314,7 @@ X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
// A symbolic displacement.
Disp = Sym;
if (isParsingInlineAsm())
- RewriteIntelBracExpression(InstInfo->AsmRewrites, SM.getSymName(),
+ RewriteIntelBracExpression(*InstInfo->AsmRewrites, SM.getSymName(),
ImmDisp, SM.getImm(), BracLoc, StartInBrac,
End);
}
@@ -1359,7 +1371,7 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
InlineAsmIdentifierInfo &Info,
bool IsUnevaluatedOperand, SMLoc &End) {
MCAsmParser &Parser = getParser();
- assert (isParsingInlineAsm() && "Expected to be parsing inline assembly.");
+ assert(isParsingInlineAsm() && "Expected to be parsing inline assembly.");
Val = nullptr;
StringRef LineBuf(Identifier.data());
@@ -1372,15 +1384,17 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
// Advance the token stream until the end of the current token is
// after the end of what the frontend claimed.
const char *EndPtr = Tok.getLoc().getPointer() + LineBuf.size();
- while (true) {
+ do {
End = Tok.getEndLoc();
getLexer().Lex();
-
- assert(End.getPointer() <= EndPtr && "frontend claimed part of a token?");
- if (End.getPointer() == EndPtr) break;
- }
+ } while (End.getPointer() < EndPtr);
Identifier = LineBuf;
+ // The frontend should end parsing on an assembler token boundary, unless it
+ // failed parsing.
+ assert((End.getPointer() == EndPtr || !Result) &&
+ "frontend claimed part of a token?");
+
// If the identifier lookup was unsuccessful, assume that we are dealing with
// a label.
if (!Result) {
@@ -1389,9 +1403,8 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
Loc, false);
assert(InternalName.size() && "We should have an internal name here.");
// Push a rewrite for replacing the identifier name with the internal name.
- InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Label, Loc,
- Identifier.size(),
- InternalName));
+ InstInfo->AsmRewrites->emplace_back(AOK_Label, Loc, Identifier.size(),
+ InternalName);
}
// Create the symbol reference.
@@ -1418,8 +1431,7 @@ X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start,
AsmToken ImmDispToken = Parser.Lex(); // Eat the integer.
if (isParsingInlineAsm())
- InstInfo->AsmRewrites->push_back(
- AsmRewrite(AOK_ImmPrefix, ImmDispToken.getLoc()));
+ InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, ImmDispToken.getLoc());
if (getLexer().isNot(AsmToken::LBrac)) {
// An immediate following a 'segment register', 'colon' token sequence can
@@ -1588,8 +1600,7 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data());
unsigned Len = DotDispStr.size();
unsigned Val = OrigDispVal + DotDispVal;
- InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_DotOperator, Loc, Len,
- Val));
+ InstInfo->AsmRewrites->emplace_back(AOK_DotOperator, Loc, Len, Val);
}
NewDisp = MCConstantExpr::create(OrigDispVal + DotDispVal, getContext());
@@ -1613,7 +1624,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() {
return nullptr;
// Don't emit the offset operator.
- InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Skip, OffsetOfLoc, 7));
+ InstInfo->AsmRewrites->emplace_back(AOK_Skip, OffsetOfLoc, 7);
// The offset operator will have an 'r' constraint, thus we need to create
// register operand to ensure proper matching. Just pick a GPR based on
@@ -1664,7 +1675,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) {
// Rewrite the type operator and the C or C++ type or variable in terms of an
// immediate. E.g. TYPE foo -> $$4
unsigned Len = End.getPointer() - TypeLoc.getPointer();
- InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, TypeLoc, Len, CVal));
+ InstInfo->AsmRewrites->emplace_back(AOK_Imm, TypeLoc, Len, CVal);
const MCExpr *Imm = MCConstantExpr::create(CVal, getContext());
return X86Operand::CreateImm(Imm, Start, End);
@@ -1688,12 +1699,14 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
return ParseIntelOperator(IOK_TYPE);
}
+ bool PtrInOperand = false;
unsigned Size = getIntelMemOperandSize(Tok.getString());
if (Size) {
Parser.Lex(); // Eat operand size (e.g., byte, word).
if (Tok.getString() != "PTR" && Tok.getString() != "ptr")
return ErrorOperand(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!");
Parser.Lex(); // Eat ptr.
+ PtrInOperand = true;
}
Start = Tok.getLoc();
@@ -1711,10 +1724,10 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
unsigned Len = Tok.getLoc().getPointer() - Start.getPointer();
if (StartTok.getString().size() == Len)
// Just add a prefix if this wasn't a complex immediate expression.
- InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix, Start));
+ InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Start);
else
// Otherwise, rewrite the complex expression as a single immediate.
- InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, Start, Len, Imm));
+ InstInfo->AsmRewrites->emplace_back(AOK_Imm, Start, Len, Imm);
}
if (getLexer().isNot(AsmToken::LBrac)) {
@@ -1740,7 +1753,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
}
// rounding mode token
- if (STI.getFeatureBits()[X86::FeatureAVX512] &&
+ if (getSTI().getFeatureBits()[X86::FeatureAVX512] &&
getLexer().is(AsmToken::LCurly))
return ParseRoundingModeOp(Start, End);
@@ -1749,9 +1762,16 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
if (!ParseRegister(RegNo, Start, End)) {
// If this is a segment register followed by a ':', then this is the start
// of a segment override, otherwise this is a normal register reference.
- if (getLexer().isNot(AsmToken::Colon))
+      // If this is a normal register and the operand included 'ptr', this is
+      // an error.
+ if (getLexer().isNot(AsmToken::Colon)){
+ if (PtrInOperand){
+ return ErrorOperand(Start, "expected memory operand after "
+ "'ptr', found register operand instead");
+ }
return X86Operand::CreateReg(RegNo, Start, End);
-
+ }
+
return ParseIntelSegmentOverride(/*SegReg=*/RegNo, Start, Size);
}
@@ -1798,7 +1818,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() {
}
case AsmToken::LCurly:{
SMLoc Start = Parser.getTok().getLoc(), End;
- if (STI.getFeatureBits()[X86::FeatureAVX512])
+ if (getSTI().getFeatureBits()[X86::FeatureAVX512])
return ParseRoundingModeOp(Start, End);
return ErrorOperand(Start, "unknown token in expression");
}
@@ -1808,7 +1828,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() {
bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands,
const MCParsedAsmOperand &Op) {
MCAsmParser &Parser = getParser();
- if(STI.getFeatureBits()[X86::FeatureAVX512]) {
+ if(getSTI().getFeatureBits()[X86::FeatureAVX512]) {
if (getLexer().is(AsmToken::LCurly)) {
// Eat "{" and mark the current place.
const SMLoc consumedToken = consumeToken();
@@ -1983,12 +2003,13 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
}
// Validate the scale amount.
- if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) &&
+ if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) &&
ScaleVal != 1) {
Error(Loc, "scale factor in 16-bit address must be 1");
return nullptr;
- }
- if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && ScaleVal != 8){
+ }
+ if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 &&
+ ScaleVal != 8) {
Error(Loc, "scale factor in address must be 1, 2, 4 or 8");
return nullptr;
}
@@ -2175,7 +2196,6 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
Name == "repne" || Name == "repnz" ||
Name == "rex64" || Name == "data16";
-
// This does the actual operand parsing. Don't parse any more if we have a
// prefix juxtaposed with an operation like "lock incl 4(%rax)", because we
// just want to parse the "lock" as the first instruction and the "incl" as
@@ -2213,6 +2233,20 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
(isPrefix && getLexer().is(AsmToken::Slash)))
Parser.Lex();
+ // This is for gas compatibility and cannot be done in td.
+  // Append "p" to certain floating-point mnemonics when no argument is given.
+ // For example: fsub --> fsubp
+ bool IsFp =
+ Name == "fsub" || Name == "fdiv" || Name == "fsubr" || Name == "fdivr";
+ if (IsFp && Operands.size() == 1) {
+ const char *Repl = StringSwitch<const char *>(Name)
+ .Case("fsub", "fsubp")
+ .Case("fdiv", "fdivp")
+ .Case("fsubr", "fsubrp")
+ .Case("fdivr", "fdivrp");
+ static_cast<X86Operand &>(*Operands[0]).setTokenValue(Repl);
+ }
+
// This is a terrible hack to handle "out[bwl]? %al, (%dx)" ->
// "outb %al, %dx". Out doesn't take a memory form, but this is a widely
// documented form in various unofficial manuals, so a lot of code uses it.
@@ -2242,9 +2276,8 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// Append default arguments to "ins[bwld]"
if (Name.startswith("ins") && Operands.size() == 1 &&
- (Name == "insb" || Name == "insw" || Name == "insl" ||
- Name == "insd" )) {
- AddDefaultSrcDestOperands(Operands,
+ (Name == "insb" || Name == "insw" || Name == "insl" || Name == "insd")) {
+ AddDefaultSrcDestOperands(Operands,
X86Operand::CreateReg(X86::DX, NameLoc, NameLoc),
DefaultMemDIOperand(NameLoc));
}
@@ -2346,98 +2379,21 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// instalias with an immediate operand yet.
if (Name == "int" && Operands.size() == 2) {
X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]);
- if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) &&
- cast<MCConstantExpr>(Op1.getImm())->getValue() == 3) {
- Operands.erase(Operands.begin() + 1);
- static_cast<X86Operand &>(*Operands[0]).setTokenValue("int3");
- }
+ if (Op1.isImm())
+ if (auto *CE = dyn_cast<MCConstantExpr>(Op1.getImm()))
+ if (CE->getValue() == 3) {
+ Operands.erase(Operands.begin() + 1);
+ static_cast<X86Operand &>(*Operands[0]).setTokenValue("int3");
+ }
}
return false;
}
-static bool convertToSExti8(MCInst &Inst, unsigned Opcode, unsigned Reg,
- bool isCmp) {
- MCInst TmpInst;
- TmpInst.setOpcode(Opcode);
- if (!isCmp)
- TmpInst.addOperand(MCOperand::createReg(Reg));
- TmpInst.addOperand(MCOperand::createReg(Reg));
- TmpInst.addOperand(Inst.getOperand(0));
- Inst = TmpInst;
- return true;
-}
-
-static bool convert16i16to16ri8(MCInst &Inst, unsigned Opcode,
- bool isCmp = false) {
- if (!Inst.getOperand(0).isImm() ||
- !isImmSExti16i8Value(Inst.getOperand(0).getImm()))
- return false;
-
- return convertToSExti8(Inst, Opcode, X86::AX, isCmp);
-}
-
-static bool convert32i32to32ri8(MCInst &Inst, unsigned Opcode,
- bool isCmp = false) {
- if (!Inst.getOperand(0).isImm() ||
- !isImmSExti32i8Value(Inst.getOperand(0).getImm()))
- return false;
-
- return convertToSExti8(Inst, Opcode, X86::EAX, isCmp);
-}
-
-static bool convert64i32to64ri8(MCInst &Inst, unsigned Opcode,
- bool isCmp = false) {
- if (!Inst.getOperand(0).isImm() ||
- !isImmSExti64i8Value(Inst.getOperand(0).getImm()))
- return false;
-
- return convertToSExti8(Inst, Opcode, X86::RAX, isCmp);
-}
-
-bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) {
- switch (Inst.getOpcode()) {
- default: return true;
- case X86::INT:
- X86Operand &Op = static_cast<X86Operand &>(*Ops[1]);
- assert(Op.isImm() && "expected immediate");
- int64_t Res;
- if (!Op.getImm()->evaluateAsAbsolute(Res) || Res > 255) {
- Error(Op.getStartLoc(), "interrupt vector must be in range [0-255]");
- return false;
- }
- return true;
- }
- llvm_unreachable("handle the instruction appropriately");
-}
-
bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
switch (Inst.getOpcode()) {
default: return false;
- case X86::AND16i16: return convert16i16to16ri8(Inst, X86::AND16ri8);
- case X86::AND32i32: return convert32i32to32ri8(Inst, X86::AND32ri8);
- case X86::AND64i32: return convert64i32to64ri8(Inst, X86::AND64ri8);
- case X86::XOR16i16: return convert16i16to16ri8(Inst, X86::XOR16ri8);
- case X86::XOR32i32: return convert32i32to32ri8(Inst, X86::XOR32ri8);
- case X86::XOR64i32: return convert64i32to64ri8(Inst, X86::XOR64ri8);
- case X86::OR16i16: return convert16i16to16ri8(Inst, X86::OR16ri8);
- case X86::OR32i32: return convert32i32to32ri8(Inst, X86::OR32ri8);
- case X86::OR64i32: return convert64i32to64ri8(Inst, X86::OR64ri8);
- case X86::CMP16i16: return convert16i16to16ri8(Inst, X86::CMP16ri8, true);
- case X86::CMP32i32: return convert32i32to32ri8(Inst, X86::CMP32ri8, true);
- case X86::CMP64i32: return convert64i32to64ri8(Inst, X86::CMP64ri8, true);
- case X86::ADD16i16: return convert16i16to16ri8(Inst, X86::ADD16ri8);
- case X86::ADD32i32: return convert32i32to32ri8(Inst, X86::ADD32ri8);
- case X86::ADD64i32: return convert64i32to64ri8(Inst, X86::ADD64ri8);
- case X86::SUB16i16: return convert16i16to16ri8(Inst, X86::SUB16ri8);
- case X86::SUB32i32: return convert32i32to32ri8(Inst, X86::SUB32ri8);
- case X86::SUB64i32: return convert64i32to64ri8(Inst, X86::SUB64ri8);
- case X86::ADC16i16: return convert16i16to16ri8(Inst, X86::ADC16ri8);
- case X86::ADC32i32: return convert32i32to32ri8(Inst, X86::ADC32ri8);
- case X86::ADC64i32: return convert64i32to64ri8(Inst, X86::ADC64ri8);
- case X86::SBB16i16: return convert16i16to16ri8(Inst, X86::SBB16ri8);
- case X86::SBB32i32: return convert32i32to32ri8(Inst, X86::SBB32ri8);
- case X86::SBB64i32: return convert64i32to64ri8(Inst, X86::SBB64ri8);
+ case X86::VMOVZPQILo2PQIrr:
case X86::VMOVAPDrr:
case X86::VMOVAPDYrr:
case X86::VMOVAPSrr:
@@ -2457,18 +2413,19 @@ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
unsigned NewOpc;
switch (Inst.getOpcode()) {
default: llvm_unreachable("Invalid opcode");
- case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break;
- case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break;
- case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break;
- case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break;
- case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break;
- case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break;
- case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break;
- case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break;
- case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break;
- case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break;
- case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break;
- case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break;
+ case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break;
+ case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break;
+ case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break;
+ case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break;
+ case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break;
+ case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break;
+ case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break;
+ case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break;
+ case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break;
+ case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break;
+ case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break;
+ case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break;
+ case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break;
}
Inst.setOpcode(NewOpc);
return true;
@@ -2573,9 +2530,6 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
isParsingIntelSyntax())) {
default: llvm_unreachable("Unexpected match result!");
case Match_Success:
- if (!validateInstruction(Inst, Operands))
- return true;
-
// Some instructions need post-processing to, for example, tweak which
// encoding is selected. Loop on it while changes happen so the
// individual transformations can chain off each other.
@@ -2819,9 +2773,6 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
unsigned NumSuccessfulMatches =
std::count(std::begin(Match), std::end(Match), Match_Success);
if (NumSuccessfulMatches == 1) {
- if (!validateInstruction(Inst, Operands))
- return true;
-
// Some instructions need post-processing to, for example, tweak which
// encoding is selected. Loop on it while changes happen so the individual
// transformations can chain off each other.
@@ -2898,10 +2849,29 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
"a '%' prefix in .intel_syntax");
}
return false;
- }
+ } else if (IDVal == ".even")
+ return parseDirectiveEven(DirectiveID.getLoc());
return true;
}
+/// parseDirectiveEven
+/// ::= .even
+bool X86AsmParser::parseDirectiveEven(SMLoc L) {
+ const MCSection *Section = getStreamer().getCurrentSection().first;
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ TokError("unexpected token in directive");
+ return false;
+ }
+ if (!Section) {
+ getStreamer().InitSections(false);
+ Section = getStreamer().getCurrentSection().first;
+ }
+ if (Section->UseCodeAlign())
+ getStreamer().EmitCodeAlignment(2, 0);
+ else
+ getStreamer().EmitValueToAlignment(2, 0, 1, 0);
+ return false;
+}
/// ParseDirectiveWord
/// ::= .word [ expression (, expression)* ]
bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
@@ -2909,10 +2879,19 @@ bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
if (getLexer().isNot(AsmToken::EndOfStatement)) {
for (;;) {
const MCExpr *Value;
+ SMLoc ExprLoc = getLexer().getLoc();
if (getParser().parseExpression(Value))
return false;
- getParser().getStreamer().EmitValue(Value, Size);
+ if (const auto *MCE = dyn_cast<MCConstantExpr>(Value)) {
+ assert(Size <= 8 && "Invalid size");
+ uint64_t IntValue = MCE->getValue();
+ if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue))
+ return Error(ExprLoc, "literal value out of range for directive");
+ getStreamer().EmitIntValue(IntValue, Size);
+ } else {
+ getStreamer().EmitValue(Value, Size, ExprLoc);
+ }
if (getLexer().is(AsmToken::EndOfStatement))
break;
diff --git a/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/lib/Target/X86/AsmParser/X86AsmParserCommon.h
index 7610806c4578..54538c804a03 100644
--- a/lib/Target/X86/AsmParser/X86AsmParserCommon.h
+++ b/lib/Target/X86/AsmParser/X86AsmParserCommon.h
@@ -13,30 +13,25 @@
namespace llvm {
inline bool isImmSExti16i8Value(uint64_t Value) {
- return (( Value <= 0x000000000000007FULL)||
- (0x000000000000FF80ULL <= Value && Value <= 0x000000000000FFFFULL)||
- (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+ return isInt<8>(Value) ||
+ (isUInt<16>(Value) && isInt<8>(static_cast<int16_t>(Value)));
}
inline bool isImmSExti32i8Value(uint64_t Value) {
- return (( Value <= 0x000000000000007FULL)||
- (0x00000000FFFFFF80ULL <= Value && Value <= 0x00000000FFFFFFFFULL)||
- (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+ return isInt<8>(Value) ||
+ (isUInt<32>(Value) && isInt<8>(static_cast<int32_t>(Value)));
}
inline bool isImmSExti64i8Value(uint64_t Value) {
- return (( Value <= 0x000000000000007FULL)||
- (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+ return isInt<8>(Value);
}
inline bool isImmSExti64i32Value(uint64_t Value) {
- return (( Value <= 0x000000007FFFFFFFULL)||
- (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+ return isInt<32>(Value);
}
inline bool isImmUnsignedi8Value(uint64_t Value) {
- return (( Value <= 0x00000000000000FFULL)||
- (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+ return isUInt<8>(Value) || isInt<8>(Value);
}
} // End of namespace llvm
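
For reference, here is a rough, standalone sketch of the classification the rewritten helpers above perform. The isIntN/isUIntN templates and the immSExti16i8 name are local stand-ins invented for this illustration, not the llvm::isInt<N>/llvm::isUInt<N> definitions from MathExtras.h:

    #include <cassert>
    #include <cstdint>

    // Local stand-ins for llvm::isInt<N>() / llvm::isUInt<N>().
    template <unsigned N> bool isIntN(int64_t X) {
      return X >= -(INT64_C(1) << (N - 1)) && X < (INT64_C(1) << (N - 1));
    }
    template <unsigned N> bool isUIntN(uint64_t X) {
      return X < (UINT64_C(1) << N);
    }

    // Same shape as the new isImmSExti16i8Value: the immediate either already
    // fits in a signed byte, or is a 16-bit pattern whose low byte sign-extends
    // back to the same 16-bit value.
    bool immSExti16i8(uint64_t Value) {
      return isIntN<8>(static_cast<int64_t>(Value)) ||
             (isUIntN<16>(Value) && isIntN<8>(static_cast<int16_t>(Value)));
    }

    int main() {
      assert(immSExti16i8(0x7F));                  // 127: already an imm8
      assert(immSExti16i8(0xFF80));                // 16-bit pattern of -128
      assert(immSExti16i8(0xFFFFFFFFFFFFFF80ULL)); // sign-extended -128
      assert(!immSExti16i8(0x80));                 // 128 needs the full imm16
      return 0;
    }
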
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index fba2c280d200..b23f5c353013 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -34,18 +34,9 @@ set(sources
X86VZeroUpper.cpp
X86FixupLEAs.cpp
X86WinEHState.cpp
+ X86OptimizeLEAs.cpp
)
-if( CMAKE_CL_64 )
- enable_language(ASM_MASM)
- ADD_CUSTOM_COMMAND(
- OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj
- MAIN_DEPENDENCY X86CompilationCallback_Win64.asm
- COMMAND ${CMAKE_ASM_MASM_COMPILER} /nologo /Fo ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj /c ${CMAKE_CURRENT_SOURCE_DIR}/X86CompilationCallback_Win64.asm
- )
- set(sources ${sources} ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj)
-endif()
-
add_llvm_target(X86CodeGen ${sources})
add_subdirectory(AsmParser)
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index cfc3ee2fb08f..ce8fcf164668 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -95,11 +95,13 @@ X86GenericDisassembler::X86GenericDisassembler(
llvm_unreachable("Invalid CPU mode");
}
+namespace {
struct Region {
ArrayRef<uint8_t> Bytes;
uint64_t Base;
Region(ArrayRef<uint8_t> Bytes, uint64_t Base) : Bytes(Bytes), Base(Base) {}
};
+} // end anonymous namespace
/// A callback function that wraps the readByte method from Region.
///
@@ -831,8 +833,12 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
case TYPE_XMM256:
case TYPE_XMM512:
case TYPE_VK1:
+ case TYPE_VK2:
+ case TYPE_VK4:
case TYPE_VK8:
case TYPE_VK16:
+ case TYPE_VK32:
+ case TYPE_VK64:
case TYPE_DEBUGREG:
case TYPE_CONTROLREG:
case TYPE_BNDR:
@@ -962,6 +968,7 @@ static bool translateInstruction(MCInst &mcInst,
return true;
}
+ mcInst.clear();
mcInst.setOpcode(insn.instructionID);
// If when reading the prefix bytes we determined the overlapping 0xf2 or 0xf3
// prefix bytes should be disassembled as xrelease and xacquire then set the
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
index f73fa75f888e..040143b15587 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
@@ -361,7 +361,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
* then it should be disassembled as a xacquire/xrelease not repne/rep.
*/
if ((byte == 0xf2 || byte == 0xf3) &&
- ((nextByte == 0xf0) |
+ ((nextByte == 0xf0) ||
((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90)))
insn->xAcquireRelease = true;
/*
@@ -980,6 +980,47 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
insn->opcode == 0xE3)
attrMask ^= ATTR_ADSIZE;
+ /*
+   * In 64-bit mode, all "f64"-superscripted opcodes ignore the operand-size
+   * prefix: CALL/JMP/Jcc instructions must ignore 0x66 and consume a 4-byte
+   * immediate/displacement.
+ */
+
+ if (insn->mode == MODE_64BIT &&
+ isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) {
+ switch (insn->opcode) {
+ case 0xE8:
+ case 0xE9:
+ // Take care of psubsb and other mmx instructions.
+ if (insn->opcodeType == ONEBYTE) {
+ attrMask ^= ATTR_OPSIZE;
+ insn->immediateSize = 4;
+ insn->displacementSize = 4;
+ }
+ break;
+ case 0x82:
+ case 0x83:
+ case 0x84:
+ case 0x85:
+ case 0x86:
+ case 0x87:
+ case 0x88:
+ case 0x89:
+ case 0x8A:
+ case 0x8B:
+ case 0x8C:
+ case 0x8D:
+ case 0x8E:
+ case 0x8F:
+ // Take care of lea and three byte ops.
+ if (insn->opcodeType == TWOBYTE) {
+ attrMask ^= ATTR_OPSIZE;
+ insn->immediateSize = 4;
+ insn->displacementSize = 4;
+ }
+ break;
+ }
+ }
+
if (getIDWithAttrMask(&instructionID, insn, attrMask))
return -1;
@@ -1447,8 +1488,12 @@ static int readModRM(struct InternalInstruction* insn) {
case TYPE_XMM: \
return prefix##_XMM0 + index; \
case TYPE_VK1: \
+ case TYPE_VK2: \
+ case TYPE_VK4: \
case TYPE_VK8: \
case TYPE_VK16: \
+ case TYPE_VK32: \
+ case TYPE_VK64: \
if (index > 7) \
*valid = 0; \
return prefix##_K0 + index; \
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index a79a923ac525..28a628e5066b 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -572,8 +572,6 @@ struct InternalInstruction {
// The last byte of the opcode, not counting any ModR/M extension
uint8_t opcode;
- // The ModR/M byte of the instruction, if it is an opcode extension
- uint8_t modRMExtension;
// decode state
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index ea727e6e82fb..b4c0bc4cd4d9 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -21,6 +21,7 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/FormattedStream.h"
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
index 62b6b73e7864..bbb309076610 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -15,12 +15,9 @@
#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H
#include "llvm/MC/MCInstPrinter.h"
-#include "llvm/MC/MCSubtargetInfo.h"
namespace llvm {
-class MCOperand;
-
class X86ATTInstPrinter final : public MCInstPrinter {
public:
X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp
index 91b144a44824..82f0ee5a5ebc 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -21,6 +21,27 @@
using namespace llvm;
+static unsigned getVectorRegSize(unsigned RegNo) {
+ if (X86::ZMM0 <= RegNo && RegNo <= X86::ZMM31)
+ return 512;
+ if (X86::YMM0 <= RegNo && RegNo <= X86::YMM31)
+ return 256;
+ if (X86::XMM0 <= RegNo && RegNo <= X86::XMM31)
+ return 128;
+ if (X86::MM0 <= RegNo && RegNo <= X86::MM7)
+ return 64;
+
+ llvm_unreachable("Unknown vector reg!");
+ return 0;
+}
+
+static MVT getRegOperandVectorVT(const MCInst *MI, const MVT &ScalarVT,
+ unsigned OperandIndex) {
+ unsigned OpReg = MI->getOperand(OperandIndex).getReg();
+ return MVT::getVectorVT(ScalarVT,
+ getVectorRegSize(OpReg)/ScalarVT.getSizeInBits());
+}
+
/// \brief Extracts the src/dst types for a given zero extension instruction.
/// \note While the number of elements in the DstVT type is correct, the
/// number in the SrcVT type is expanded to fill the src xmm register and the
@@ -107,6 +128,75 @@ static void getZeroExtensionTypes(const MCInst *MI, MVT &SrcVT, MVT &DstVT) {
}
}
+#define CASE_MASK_INS_COMMON(Inst, Suffix, src) \
+ case X86::V##Inst##Suffix##src: \
+ case X86::V##Inst##Suffix##src##k: \
+ case X86::V##Inst##Suffix##src##kz:
+
+#define CASE_SSE_INS_COMMON(Inst, src) \
+ case X86::Inst##src:
+
+#define CASE_AVX_INS_COMMON(Inst, Suffix, src) \
+ case X86::V##Inst##Suffix##src:
+
+#define CASE_MOVDUP(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z256, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z128, r##src) \
+ CASE_AVX_INS_COMMON(Inst, , r##src) \
+ CASE_AVX_INS_COMMON(Inst, Y, r##src) \
+ CASE_SSE_INS_COMMON(Inst, r##src) \
+
+#define CASE_UNPCK(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z256, r##src) \
+ CASE_MASK_INS_COMMON(Inst, Z128, r##src) \
+ CASE_AVX_INS_COMMON(Inst, , r##src) \
+ CASE_AVX_INS_COMMON(Inst, Y, r##src) \
+ CASE_SSE_INS_COMMON(Inst, r##src) \
+
+#define CASE_SHUF(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, r##src##i) \
+ CASE_MASK_INS_COMMON(Inst, Z256, r##src##i) \
+ CASE_MASK_INS_COMMON(Inst, Z128, r##src##i) \
+ CASE_AVX_INS_COMMON(Inst, , r##src##i) \
+ CASE_AVX_INS_COMMON(Inst, Y, r##src##i) \
+ CASE_SSE_INS_COMMON(Inst, r##src##i) \
+
+#define CASE_VPERM(Inst, src) \
+ CASE_MASK_INS_COMMON(Inst, Z, src##i) \
+ CASE_MASK_INS_COMMON(Inst, Z256, src##i) \
+ CASE_MASK_INS_COMMON(Inst, Z128, src##i) \
+ CASE_AVX_INS_COMMON(Inst, , src##i) \
+ CASE_AVX_INS_COMMON(Inst, Y, src##i) \
+
+#define CASE_VSHUF(Inst, src) \
+ CASE_MASK_INS_COMMON(SHUFF##Inst, Z, r##src##i) \
+ CASE_MASK_INS_COMMON(SHUFI##Inst, Z, r##src##i) \
+ CASE_MASK_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \
+ CASE_MASK_INS_COMMON(SHUFI##Inst, Z256, r##src##i) \
+
+/// \brief Extracts the vector type, and whether there is a memory operand,
+/// for a given (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) instruction.
+static void getVSHUF64x2FamilyInfo(const MCInst *MI, MVT &VT, bool &HasMemOp) {
+ HasMemOp = false;
+ switch (MI->getOpcode()) {
+ default:
+ llvm_unreachable("Unknown VSHUF64x2 family instructions.");
+ break;
+ CASE_VSHUF(64X2, m)
+ HasMemOp = true; // FALL THROUGH.
+ CASE_VSHUF(64X2, r)
+ VT = getRegOperandVectorVT(MI, MVT::i64, 0);
+ break;
+ CASE_VSHUF(32X4, m)
+ HasMemOp = true; // FALL THROUGH.
+ CASE_VSHUF(32X4, r)
+ VT = getRegOperandVectorVT(MI, MVT::i32, 0);
+ break;
+ }
+}
+
//===----------------------------------------------------------------------===//
// Top Level Entrypoint
//===----------------------------------------------------------------------===//
@@ -127,23 +217,14 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::BLENDPDrri:
case X86::VBLENDPDrri:
+ case X86::VBLENDPDYrri:
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
case X86::BLENDPDrmi:
case X86::VBLENDPDrmi:
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodeBLENDMask(MVT::v2f64,
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
- ShuffleMask);
- Src1Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- break;
- case X86::VBLENDPDYrri:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
case X86::VBLENDPDYrmi:
if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodeBLENDMask(MVT::v4f64,
+ DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f64, 0),
MI->getOperand(MI->getNumOperands() - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -152,23 +233,14 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::BLENDPSrri:
case X86::VBLENDPSrri:
+ case X86::VBLENDPSYrri:
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
case X86::BLENDPSrmi:
case X86::VBLENDPSrmi:
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodeBLENDMask(MVT::v4f32,
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
- ShuffleMask);
- Src1Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- break;
- case X86::VBLENDPSYrri:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
case X86::VBLENDPSYrmi:
if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodeBLENDMask(MVT::v8f32,
+ DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f32, 0),
MI->getOperand(MI->getNumOperands() - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -177,23 +249,14 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::PBLENDWrri:
case X86::VPBLENDWrri:
+ case X86::VPBLENDWYrri:
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
case X86::PBLENDWrmi:
case X86::VPBLENDWrmi:
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodeBLENDMask(MVT::v8i16,
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
- ShuffleMask);
- Src1Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- break;
- case X86::VPBLENDWYrri:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
case X86::VPBLENDWYrmi:
if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodeBLENDMask(MVT::v16i16,
+ DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i16, 0),
MI->getOperand(MI->getNumOperands() - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -201,23 +264,13 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
break;
case X86::VPBLENDDrri:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::VPBLENDDrmi:
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodeBLENDMask(MVT::v4i32,
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
- ShuffleMask);
- Src1Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- break;
-
case X86::VPBLENDDYrri:
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
+ case X86::VPBLENDDrmi:
case X86::VPBLENDDYrmi:
if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodeBLENDMask(MVT::v8i32,
+ DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i32, 0),
MI->getOperand(MI->getNumOperands() - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -239,6 +292,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MOVLHPSrr:
case X86::VMOVLHPSrr:
+ case X86::VMOVLHPSZrr:
Src2Name = getRegName(MI->getOperand(2).getReg());
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
@@ -247,569 +301,327 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MOVHLPSrr:
case X86::VMOVHLPSrr:
+ case X86::VMOVHLPSZrr:
Src2Name = getRegName(MI->getOperand(2).getReg());
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
DecodeMOVHLPSMask(2, ShuffleMask);
break;
- case X86::MOVSLDUPrr:
- case X86::VMOVSLDUPrr:
- Src1Name = getRegName(MI->getOperand(1).getReg());
- // FALL THROUGH.
- case X86::MOVSLDUPrm:
- case X86::VMOVSLDUPrm:
- DestName = getRegName(MI->getOperand(0).getReg());
- DecodeMOVSLDUPMask(MVT::v4f32, ShuffleMask);
- break;
-
- case X86::VMOVSHDUPYrr:
- Src1Name = getRegName(MI->getOperand(1).getReg());
- // FALL THROUGH.
- case X86::VMOVSHDUPYrm:
- DestName = getRegName(MI->getOperand(0).getReg());
- DecodeMOVSHDUPMask(MVT::v8f32, ShuffleMask);
- break;
-
- case X86::VMOVSLDUPYrr:
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ CASE_MOVDUP(MOVSLDUP, r)
+ Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg());
// FALL THROUGH.
- case X86::VMOVSLDUPYrm:
+ CASE_MOVDUP(MOVSLDUP, m)
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeMOVSLDUPMask(MVT::v8f32, ShuffleMask);
+ DecodeMOVSLDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
break;
- case X86::MOVSHDUPrr:
- case X86::VMOVSHDUPrr:
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ CASE_MOVDUP(MOVSHDUP, r)
+ Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg());
// FALL THROUGH.
- case X86::MOVSHDUPrm:
- case X86::VMOVSHDUPrm:
+ CASE_MOVDUP(MOVSHDUP, m)
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeMOVSHDUPMask(MVT::v4f32, ShuffleMask);
+ DecodeMOVSHDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
break;
- case X86::VMOVDDUPYrr:
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ CASE_MOVDUP(MOVDDUP, r)
+ Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg());
// FALL THROUGH.
- case X86::VMOVDDUPYrm:
+ CASE_MOVDUP(MOVDDUP, m)
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeMOVDDUPMask(MVT::v4f64, ShuffleMask);
- break;
-
- case X86::MOVDDUPrr:
- case X86::VMOVDDUPrr:
- Src1Name = getRegName(MI->getOperand(1).getReg());
- // FALL THROUGH.
- case X86::MOVDDUPrm:
- case X86::VMOVDDUPrm:
- DestName = getRegName(MI->getOperand(0).getReg());
- DecodeMOVDDUPMask(MVT::v2f64, ShuffleMask);
+ DecodeMOVDDUPMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask);
break;
case X86::PSLLDQri:
case X86::VPSLLDQri:
- Src1Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodePSLLDQMask(MVT::v16i8,
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
- ShuffleMask);
- break;
-
case X86::VPSLLDQYri:
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodePSLLDQMask(MVT::v32i8,
+ DecodePSLLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0),
MI->getOperand(MI->getNumOperands() - 1).getImm(),
ShuffleMask);
break;
case X86::PSRLDQri:
case X86::VPSRLDQri:
- Src1Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodePSRLDQMask(MVT::v16i8,
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
- ShuffleMask);
- break;
-
case X86::VPSRLDQYri:
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodePSRLDQMask(MVT::v32i8,
+ DecodePSRLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0),
MI->getOperand(MI->getNumOperands() - 1).getImm(),
ShuffleMask);
break;
case X86::PALIGNR128rr:
case X86::VPALIGNR128rr:
+ case X86::VPALIGNR256rr:
Src1Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
case X86::PALIGNR128rm:
case X86::VPALIGNR128rm:
- Src2Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodePALIGNRMask(MVT::v16i8,
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
- ShuffleMask);
- break;
- case X86::VPALIGNR256rr:
- Src1Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
case X86::VPALIGNR256rm:
Src2Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodePALIGNRMask(MVT::v32i8,
+ DecodePALIGNRMask(getRegOperandVectorVT(MI, MVT::i8, 0),
MI->getOperand(MI->getNumOperands() - 1).getImm(),
ShuffleMask);
break;
case X86::PSHUFDri:
case X86::VPSHUFDri:
+ case X86::VPSHUFDYri:
Src1Name = getRegName(MI->getOperand(1).getReg());
// FALL THROUGH.
case X86::PSHUFDmi:
case X86::VPSHUFDmi:
- DestName = getRegName(MI->getOperand(0).getReg());
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodePSHUFMask(MVT::v4i32,
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
- ShuffleMask);
- break;
- case X86::VPSHUFDYri:
- Src1Name = getRegName(MI->getOperand(1).getReg());
- // FALL THROUGH.
case X86::VPSHUFDYmi:
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodePSHUFMask(MVT::v8i32,
+ DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::i32, 0),
MI->getOperand(MI->getNumOperands() - 1).getImm(),
ShuffleMask);
break;
case X86::PSHUFHWri:
case X86::VPSHUFHWri:
+ case X86::VPSHUFHWYri:
Src1Name = getRegName(MI->getOperand(1).getReg());
// FALL THROUGH.
case X86::PSHUFHWmi:
case X86::VPSHUFHWmi:
- DestName = getRegName(MI->getOperand(0).getReg());
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodePSHUFHWMask(MVT::v8i16,
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
- ShuffleMask);
- break;
- case X86::VPSHUFHWYri:
- Src1Name = getRegName(MI->getOperand(1).getReg());
- // FALL THROUGH.
case X86::VPSHUFHWYmi:
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodePSHUFHWMask(MVT::v16i16,
+ DecodePSHUFHWMask(getRegOperandVectorVT(MI, MVT::i16, 0),
MI->getOperand(MI->getNumOperands() - 1).getImm(),
ShuffleMask);
break;
+
case X86::PSHUFLWri:
case X86::VPSHUFLWri:
+ case X86::VPSHUFLWYri:
Src1Name = getRegName(MI->getOperand(1).getReg());
// FALL THROUGH.
case X86::PSHUFLWmi:
case X86::VPSHUFLWmi:
- DestName = getRegName(MI->getOperand(0).getReg());
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodePSHUFLWMask(MVT::v8i16,
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
- ShuffleMask);
- break;
- case X86::VPSHUFLWYri:
- Src1Name = getRegName(MI->getOperand(1).getReg());
- // FALL THROUGH.
case X86::VPSHUFLWYmi:
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodePSHUFLWMask(MVT::v16i16,
+ DecodePSHUFLWMask(getRegOperandVectorVT(MI, MVT::i16, 0),
MI->getOperand(MI->getNumOperands() - 1).getImm(),
ShuffleMask);
break;
- case X86::PUNPCKHBWrr:
- case X86::VPUNPCKHBWrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::PUNPCKHBWrm:
- case X86::VPUNPCKHBWrm:
- Src1Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKHMask(MVT::v16i8, ShuffleMask);
- break;
- case X86::VPUNPCKHBWYrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::VPUNPCKHBWYrm:
- Src1Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKHMask(MVT::v32i8, ShuffleMask);
- break;
- case X86::PUNPCKHWDrr:
- case X86::VPUNPCKHWDrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::PUNPCKHWDrm:
- case X86::VPUNPCKHWDrm:
- Src1Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKHMask(MVT::v8i16, ShuffleMask);
- break;
- case X86::VPUNPCKHWDYrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::VPUNPCKHWDYrm:
- Src1Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKHMask(MVT::v16i16, ShuffleMask);
- break;
- case X86::PUNPCKHDQrr:
- case X86::VPUNPCKHDQrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::PUNPCKHDQrm:
- case X86::VPUNPCKHDQrm:
- Src1Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKHMask(MVT::v4i32, ShuffleMask);
- break;
- case X86::VPUNPCKHDQYrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::VPUNPCKHDQYrm:
+ case X86::MMX_PSHUFWri:
Src1Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKHMask(MVT::v8i32, ShuffleMask);
- break;
- case X86::VPUNPCKHDQZrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
- case X86::VPUNPCKHDQZrm:
- Src1Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKHMask(MVT::v16i32, ShuffleMask);
- break;
- case X86::PUNPCKHQDQrr:
- case X86::VPUNPCKHQDQrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::PUNPCKHQDQrm:
- case X86::VPUNPCKHQDQrm:
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ case X86::MMX_PSHUFWmi:
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKHMask(MVT::v2i64, ShuffleMask);
- break;
- case X86::VPUNPCKHQDQYrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::VPUNPCKHQDQYrm:
- Src1Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKHMask(MVT::v4i64, ShuffleMask);
- break;
- case X86::VPUNPCKHQDQZrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::VPUNPCKHQDQZrm:
- Src1Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKHMask(MVT::v8i64, ShuffleMask);
+ if (MI->getOperand(MI->getNumOperands() - 1).isImm())
+ DecodePSHUFMask(MVT::v4i16,
+ MI->getOperand(MI->getNumOperands() - 1).getImm(),
+ ShuffleMask);
break;
- case X86::PUNPCKLBWrr:
- case X86::VPUNPCKLBWrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::PUNPCKLBWrm:
- case X86::VPUNPCKLBWrm:
+ case X86::PSWAPDrr:
Src1Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKLMask(MVT::v16i8, ShuffleMask);
- break;
- case X86::VPUNPCKLBWYrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::VPUNPCKLBWYrm:
- Src1Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKLMask(MVT::v32i8, ShuffleMask);
- break;
- case X86::PUNPCKLWDrr:
- case X86::VPUNPCKLWDrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
- case X86::PUNPCKLWDrm:
- case X86::VPUNPCKLWDrm:
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ case X86::PSWAPDrm:
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKLMask(MVT::v8i16, ShuffleMask);
+ DecodePSWAPMask(MVT::v2i32, ShuffleMask);
break;
- case X86::VPUNPCKLWDYrr:
+
+ CASE_UNPCK(PUNPCKHBW, r)
+ case X86::MMX_PUNPCKHBWirr:
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
- case X86::VPUNPCKLWDYrm:
+ CASE_UNPCK(PUNPCKHBW, m)
+ case X86::MMX_PUNPCKHBWirm:
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKLMask(MVT::v16i16, ShuffleMask);
+ DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask);
break;
- case X86::PUNPCKLDQrr:
- case X86::VPUNPCKLDQrr:
+
+ CASE_UNPCK(PUNPCKHWD, r)
+ case X86::MMX_PUNPCKHWDirr:
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
- case X86::PUNPCKLDQrm:
- case X86::VPUNPCKLDQrm:
+ CASE_UNPCK(PUNPCKHWD, m)
+ case X86::MMX_PUNPCKHWDirm:
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKLMask(MVT::v4i32, ShuffleMask);
+ DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask);
break;
- case X86::VPUNPCKLDQYrr:
+
+ CASE_UNPCK(PUNPCKHDQ, r)
+ case X86::MMX_PUNPCKHDQirr:
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
- case X86::VPUNPCKLDQYrm:
+ CASE_UNPCK(PUNPCKHDQ, m)
+ case X86::MMX_PUNPCKHDQirm:
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKLMask(MVT::v8i32, ShuffleMask);
+ DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask);
break;
- case X86::VPUNPCKLDQZrr:
+
+ CASE_UNPCK(PUNPCKHQDQ, r)
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
- case X86::VPUNPCKLDQZrm:
+ CASE_UNPCK(PUNPCKHQDQ, m)
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKLMask(MVT::v16i32, ShuffleMask);
+ DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask);
break;
- case X86::PUNPCKLQDQrr:
- case X86::VPUNPCKLQDQrr:
+
+ CASE_UNPCK(PUNPCKLBW, r)
+ case X86::MMX_PUNPCKLBWirr:
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
- case X86::PUNPCKLQDQrm:
- case X86::VPUNPCKLQDQrm:
+ CASE_UNPCK(PUNPCKLBW, m)
+ case X86::MMX_PUNPCKLBWirm:
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKLMask(MVT::v2i64, ShuffleMask);
+ DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask);
break;
- case X86::VPUNPCKLQDQYrr:
+
+ CASE_UNPCK(PUNPCKLWD, r)
+ case X86::MMX_PUNPCKLWDirr:
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
- case X86::VPUNPCKLQDQYrm:
+ CASE_UNPCK(PUNPCKLWD, m)
+ case X86::MMX_PUNPCKLWDirm:
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKLMask(MVT::v4i64, ShuffleMask);
+ DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask);
break;
- case X86::VPUNPCKLQDQZrr:
+
+ CASE_UNPCK(PUNPCKLDQ, r)
+ case X86::MMX_PUNPCKLDQirr:
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
- case X86::VPUNPCKLQDQZrm:
+ CASE_UNPCK(PUNPCKLDQ, m)
+ case X86::MMX_PUNPCKLDQirm:
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKLMask(MVT::v8i64, ShuffleMask);
+ DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask);
break;
- case X86::SHUFPDrri:
- case X86::VSHUFPDrri:
+ CASE_UNPCK(PUNPCKLQDQ, r)
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
- case X86::SHUFPDrmi:
- case X86::VSHUFPDrmi:
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodeSHUFPMask(MVT::v2f64,
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
- ShuffleMask);
- Src1Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- break;
- case X86::VSHUFPDYrri:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::VSHUFPDYrmi:
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodeSHUFPMask(MVT::v4f64,
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
- ShuffleMask);
+ CASE_UNPCK(PUNPCKLQDQ, m)
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
+ DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask);
break;
- case X86::SHUFPSrri:
- case X86::VSHUFPSrri:
+ CASE_SHUF(SHUFPD, r)
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
- case X86::SHUFPSrmi:
- case X86::VSHUFPSrmi:
+ CASE_SHUF(SHUFPD, m)
if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodeSHUFPMask(MVT::v4f32,
+ DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f64, 0),
MI->getOperand(MI->getNumOperands() - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
- case X86::VSHUFPSYrri:
+
+ CASE_SHUF(SHUFPS, r)
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
- case X86::VSHUFPSYrmi:
+ CASE_SHUF(SHUFPS, m)
if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodeSHUFPMask(MVT::v8f32,
+ DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f32, 0),
MI->getOperand(MI->getNumOperands() - 1).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
- case X86::UNPCKLPDrr:
- case X86::VUNPCKLPDrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::UNPCKLPDrm:
- case X86::VUNPCKLPDrm:
- DecodeUNPCKLMask(MVT::v2f64, ShuffleMask);
- Src1Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- break;
- case X86::VUNPCKLPDYrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::VUNPCKLPDYrm:
- DecodeUNPCKLMask(MVT::v4f64, ShuffleMask);
- Src1Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- break;
- case X86::VUNPCKLPDZrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::VUNPCKLPDZrm:
- DecodeUNPCKLMask(MVT::v8f64, ShuffleMask);
- Src1Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- break;
- case X86::UNPCKLPSrr:
- case X86::VUNPCKLPSrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::UNPCKLPSrm:
- case X86::VUNPCKLPSrm:
- DecodeUNPCKLMask(MVT::v4f32, ShuffleMask);
- Src1Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- break;
- case X86::VUNPCKLPSYrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::VUNPCKLPSYrm:
- DecodeUNPCKLMask(MVT::v8f32, ShuffleMask);
- Src1Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- break;
- case X86::VUNPCKLPSZrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::VUNPCKLPSZrm:
- DecodeUNPCKLMask(MVT::v16f32, ShuffleMask);
- Src1Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- break;
- case X86::UNPCKHPDrr:
- case X86::VUNPCKHPDrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::UNPCKHPDrm:
- case X86::VUNPCKHPDrm:
- DecodeUNPCKHMask(MVT::v2f64, ShuffleMask);
- Src1Name = getRegName(MI->getOperand(1).getReg());
- DestName = getRegName(MI->getOperand(0).getReg());
- break;
- case X86::VUNPCKHPDYrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::VUNPCKHPDYrm:
- DecodeUNPCKHMask(MVT::v4f64, ShuffleMask);
- Src1Name = getRegName(MI->getOperand(1).getReg());
+ CASE_VSHUF(64X2, r)
+ CASE_VSHUF(64X2, m)
+ CASE_VSHUF(32X4, r)
+ CASE_VSHUF(32X4, m) {
+ MVT VT;
+ bool HasMemOp;
+ unsigned NumOp = MI->getNumOperands();
+ getVSHUF64x2FamilyInfo(MI, VT, HasMemOp);
+ decodeVSHUF64x2FamilyMask(VT, MI->getOperand(NumOp - 1).getImm(),
+ ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
+ if (HasMemOp) {
+ assert((NumOp >= 8) && "Expected at least 8 operands!");
+ Src1Name = getRegName(MI->getOperand(NumOp - 7).getReg());
+ } else {
+ assert((NumOp >= 4) && "Expected at least 4 operands!");
+ Src2Name = getRegName(MI->getOperand(NumOp - 2).getReg());
+ Src1Name = getRegName(MI->getOperand(NumOp - 3).getReg());
+ }
break;
- case X86::VUNPCKHPDZrr:
+ }
+
+ CASE_UNPCK(UNPCKLPD, r)
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
- case X86::VUNPCKHPDZrm:
- DecodeUNPCKHMask(MVT::v8f64, ShuffleMask);
+ CASE_UNPCK(UNPCKLPD, m)
+ DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
- case X86::UNPCKHPSrr:
- case X86::VUNPCKHPSrr:
+
+ CASE_UNPCK(UNPCKLPS, r)
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
- case X86::UNPCKHPSrm:
- case X86::VUNPCKHPSrm:
- DecodeUNPCKHMask(MVT::v4f32, ShuffleMask);
+ CASE_UNPCK(UNPCKLPS, m)
+ DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
- case X86::VUNPCKHPSYrr:
+
+ CASE_UNPCK(UNPCKHPD, r)
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
- case X86::VUNPCKHPSYrm:
- DecodeUNPCKHMask(MVT::v8f32, ShuffleMask);
+ CASE_UNPCK(UNPCKHPD, m)
+ DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
- case X86::VUNPCKHPSZrr:
+
+ CASE_UNPCK(UNPCKHPS, r)
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
- case X86::VUNPCKHPSZrm:
- DecodeUNPCKHMask(MVT::v16f32, ShuffleMask);
+ CASE_UNPCK(UNPCKHPS, m)
+ DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
- case X86::VPERMILPSri:
- Src1Name = getRegName(MI->getOperand(1).getReg());
- // FALL THROUGH.
- case X86::VPERMILPSmi:
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodePSHUFMask(MVT::v4f32,
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
- ShuffleMask);
- DestName = getRegName(MI->getOperand(0).getReg());
- break;
- case X86::VPERMILPSYri:
- Src1Name = getRegName(MI->getOperand(1).getReg());
- // FALL THROUGH.
- case X86::VPERMILPSYmi:
- if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodePSHUFMask(MVT::v8f32,
- MI->getOperand(MI->getNumOperands() - 1).getImm(),
- ShuffleMask);
- DestName = getRegName(MI->getOperand(0).getReg());
- break;
- case X86::VPERMILPDri:
+
+ CASE_VPERM(PERMILPS, r)
Src1Name = getRegName(MI->getOperand(1).getReg());
// FALL THROUGH.
- case X86::VPERMILPDmi:
+ CASE_VPERM(PERMILPS, m)
if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodePSHUFMask(MVT::v2f64,
+ DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f32, 0),
MI->getOperand(MI->getNumOperands() - 1).getImm(),
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
- case X86::VPERMILPDYri:
+
+ CASE_VPERM(PERMILPD, r)
Src1Name = getRegName(MI->getOperand(1).getReg());
// FALL THROUGH.
- case X86::VPERMILPDYmi:
+ CASE_VPERM(PERMILPD, m)
if (MI->getOperand(MI->getNumOperands() - 1).isImm())
- DecodePSHUFMask(MVT::v4f64,
+ DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f64, 0),
MI->getOperand(MI->getNumOperands() - 1).getImm(),
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
+
case X86::VPERM2F128rr:
case X86::VPERM2I128rr:
Src2Name = getRegName(MI->getOperand(2).getReg());
@@ -824,6 +636,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
+
case X86::VPERMQYri:
case X86::VPERMPDYri:
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -846,6 +659,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
DecodeScalarMoveMask(MVT::v2f64, nullptr == Src2Name, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
+
case X86::MOVSSrr:
case X86::VMOVSSrr:
Src2Name = getRegName(MI->getOperand(2).getReg());
@@ -861,6 +675,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MOVZPQILo2PQIrr:
case X86::VMOVPQI2QIrr:
case X86::VMOVZPQILo2PQIrr:
+ case X86::VMOVZPQILo2PQIZrr:
Src1Name = getRegName(MI->getOperand(1).getReg());
// FALL THROUGH.
case X86::MOVQI2PQIrm:
@@ -869,9 +684,11 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VMOVQI2PQIrm:
case X86::VMOVZQI2PQIrm:
case X86::VMOVZPQILo2PQIrm:
+ case X86::VMOVZPQILo2PQIZrm:
DecodeZeroMoveLowMask(MVT::v2i64, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
+
case X86::MOVDI2PDIrm:
case X86::VMOVDI2PDIrm:
DecodeZeroMoveLowMask(MVT::v4i32, ShuffleMask);
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
index 6e371da37290..20cd7ffb2e63 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -19,8 +19,6 @@
namespace llvm {
-class MCOperand;
-
class X86IntelInstPrinter final : public MCInstPrinter {
public:
X86IntelInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 629802f5dc5e..133bd0e1772a 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -69,15 +69,19 @@ public:
class X86AsmBackend : public MCAsmBackend {
const StringRef CPU;
bool HasNopl;
- const uint64_t MaxNopLength;
+ uint64_t MaxNopLength;
public:
- X86AsmBackend(const Target &T, StringRef CPU)
- : MCAsmBackend(), CPU(CPU), MaxNopLength(CPU == "slm" ? 7 : 15) {
+ X86AsmBackend(const Target &T, StringRef CPU) : MCAsmBackend(), CPU(CPU) {
HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" &&
CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" &&
CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" &&
CPU != "geode" && CPU != "winchip-c6" && CPU != "winchip2" &&
CPU != "c3" && CPU != "c3-2";
+    // The maximum length of a true long nop instruction is 15 bytes, and the
+    // longest long-nop replacement sequence is 7 bytes. On Silvermont the nop
+    // length is also capped at 7 bytes, which gives better performance on
+    // that architecture.
+ MaxNopLength = (!HasNopl || CPU == "slm") ? 7 : 15;
}
unsigned getNumFixupKinds() const override {
@@ -200,6 +204,14 @@ static unsigned getRelaxedOpcodeArith(unsigned Op) {
case X86::ADD64ri8: return X86::ADD64ri32;
case X86::ADD64mi8: return X86::ADD64mi32;
+ // ADC
+ case X86::ADC16ri8: return X86::ADC16ri;
+ case X86::ADC16mi8: return X86::ADC16mi;
+ case X86::ADC32ri8: return X86::ADC32ri;
+ case X86::ADC32mi8: return X86::ADC32mi;
+ case X86::ADC64ri8: return X86::ADC64ri32;
+ case X86::ADC64mi8: return X86::ADC64mi32;
+
// SUB
case X86::SUB16ri8: return X86::SUB16ri;
case X86::SUB16mi8: return X86::SUB16mi;
@@ -208,6 +220,14 @@ static unsigned getRelaxedOpcodeArith(unsigned Op) {
case X86::SUB64ri8: return X86::SUB64ri32;
case X86::SUB64mi8: return X86::SUB64mi32;
+ // SBB
+ case X86::SBB16ri8: return X86::SBB16ri;
+ case X86::SBB16mi8: return X86::SBB16mi;
+ case X86::SBB32ri8: return X86::SBB32ri;
+ case X86::SBB32mi8: return X86::SBB32mi;
+ case X86::SBB64ri8: return X86::SBB64ri32;
+ case X86::SBB64mi8: return X86::SBB64mi32;
+
// CMP
case X86::CMP16ri8: return X86::CMP16ri;
case X86::CMP16mi8: return X86::CMP16mi;
@@ -279,7 +299,7 @@ void X86AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const {
/// bytes.
/// \return - true on success, false on failure
bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
- static const uint8_t Nops[10][10] = {
+ static const uint8_t TrueNops[10][10] = {
// nop
{0x90},
// xchg %ax,%ax
@@ -302,17 +322,31 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
{0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
};
- // This CPU doesn't support long nops. If needed add more.
- // FIXME: Can we get this from the subtarget somehow?
- // FIXME: We could generated something better than plain 0x90.
- if (!HasNopl) {
- for (uint64_t i = 0; i < Count; ++i)
- OW->write8(0x90);
- return true;
- }
+ // Alternative nop instructions for CPUs which don't support long nops.
+ static const uint8_t AltNops[7][10] = {
+ // nop
+ {0x90},
+ // xchg %ax,%ax
+ {0x66, 0x90},
+ // lea 0x0(%esi),%esi
+ {0x8d, 0x76, 0x00},
+ // lea 0x0(%esi),%esi
+ {0x8d, 0x74, 0x26, 0x00},
+ // nop + lea 0x0(%esi),%esi
+ {0x90, 0x8d, 0x74, 0x26, 0x00},
+ // lea 0x0(%esi),%esi
+    {0x8d, 0xb6, 0x00, 0x00, 0x00, 0x00},
+ // lea 0x0(%esi),%esi
+ {0x8d, 0xb4, 0x26, 0x00, 0x00, 0x00, 0x00},
+ };
+
+ // Select the right NOP table.
+  // FIXME: Can we determine from the subtarget whether the CPU supports
+  // long nops?
+ const uint8_t (*Nops)[10] = HasNopl ? TrueNops : AltNops;
+ assert(HasNopl || MaxNopLength <= 7);
- // 15 is the longest single nop instruction. Emit as many 15-byte nops as
- // needed, then emit a nop of the remaining length.
+  // Emit as many maximal-length nops as needed, then a final nop covering the
+  // remaining length.
do {
const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength);
const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10;
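
The padding loop above amounts to a simple decomposition: emit nops of MaxNopLength bytes until fewer than MaxNopLength bytes remain, then one nop covering the remainder (prefix bytes handle lengths above 10). A rough standalone sketch of just that arithmetic, with the nop tables elided and the splitNopPadding name made up for the example:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Split a padding request into per-nop lengths, mirroring the shape of the
    // writeNopData loop; an illustration only, not the emitter itself.
    std::vector<uint8_t> splitNopPadding(uint64_t Count, uint64_t MaxNopLength) {
      std::vector<uint8_t> Lengths;
      while (Count != 0) {
        uint8_t This = static_cast<uint8_t>(std::min(Count, MaxNopLength));
        Lengths.push_back(This);
        Count -= This;
      }
      return Lengths;
    }

    int main() {
      // With long nops available (MaxNopLength = 15), 37 bytes -> 15 + 15 + 7.
      for (uint8_t L : splitNopPadding(37, 15))
        std::cout << unsigned(L) << ' ';
      std::cout << '\n';
      return 0;
    }
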
@@ -359,6 +393,17 @@ public:
}
};
+class ELFX86_IAMCUAsmBackend : public ELFX86AsmBackend {
+public:
+ ELFX86_IAMCUAsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+ : ELFX86AsmBackend(T, OSABI, CPU) {}
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI,
+ ELF::EM_IAMCU);
+ }
+};
+
class ELFX86_64AsmBackend : public ELFX86AsmBackend {
public:
ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
@@ -610,13 +655,13 @@ private:
/// \brief Get the compact unwind number for a given register. The number
/// corresponds to the enum lists in compact_unwind_encoding.h.
int getCompactUnwindRegNum(unsigned Reg) const {
- static const uint16_t CU32BitRegs[7] = {
+ static const MCPhysReg CU32BitRegs[7] = {
X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
};
- static const uint16_t CU64BitRegs[] = {
+ static const MCPhysReg CU64BitRegs[] = {
X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
};
- const uint16_t *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs;
+ const MCPhysReg *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs;
for (int Idx = 1; *CURegs; ++CURegs, ++Idx)
if (*CURegs == Reg)
return Idx;
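
The compact unwind lookup above is a linear scan of a zero-terminated register table that returns a 1-based position. A minimal sketch of the same lookup shape, using made-up register codes and an illustrative not-found value rather than the real X86 enums:

    #include <cassert>
    #include <cstdint>

    // 1-based index of Reg in a zero-terminated table, -1 if absent.
    int indexInTable(const uint16_t *Table, uint16_t Reg) {
      for (int Idx = 1; *Table; ++Table, ++Idx)
        if (*Table == Reg)
          return Idx;
      return -1;
    }

    int main() {
      static const uint16_t Regs[] = {11, 12, 13, 14, 15, 6, 0}; // placeholders
      assert(indexInTable(Regs, 13) == 3);
      assert(indexInTable(Regs, 99) == -1);
      return 0;
    }
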
@@ -780,6 +825,10 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T,
return new WindowsX86AsmBackend(T, false, CPU);
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+
+ if (TheTriple.isOSIAMCU())
+ return new ELFX86_IAMCUAsmBackend(T, OSABI, CPU);
+
return new ELFX86_32AsmBackend(T, OSABI, CPU);
}
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index f0d00b0c1bc3..9ff85b9154f8 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -41,6 +41,16 @@ namespace X86 {
/// AddrNumOperands - Total number of operands in a memory reference.
AddrNumOperands = 5
};
+
+ /// AVX512 static rounding constants. These need to match the values in
+ /// avx512fintrin.h.
+ enum STATIC_ROUNDING {
+ TO_NEAREST_INT = 0,
+ TO_NEG_INF = 1,
+ TO_POS_INF = 2,
+ TO_ZERO = 3,
+ CUR_DIRECTION = 4
+ };
} // end namespace X86;
/// X86II - This namespace holds all of the target specific flags that
@@ -675,7 +685,7 @@ namespace X86II {
case X86II::RawFrmSrc:
case X86II::RawFrmDst:
case X86II::RawFrmDstSrc:
- return -1;
+ return -1;
case X86II::MRMDestMem:
return 0;
case X86II::MRMSrcMem:
@@ -696,23 +706,27 @@ namespace X86II {
// Start from 0, skip registers encoded in VEX_VVVV or a mask register.
return 0 + HasVEX_4V + HasEVEX_K;
case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
- case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8:
+ case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5:
+ case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8:
case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB:
+ case X86II::MRM_CC: case X86II::MRM_CD: case X86II::MRM_CE:
case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1:
- case X86II::MRM_D4: case X86II::MRM_D5: case X86II::MRM_D6:
- case X86II::MRM_D7: case X86II::MRM_D8: case X86II::MRM_D9:
- case X86II::MRM_DA: case X86II::MRM_DB: case X86II::MRM_DC:
- case X86II::MRM_DD: case X86II::MRM_DE: case X86II::MRM_DF:
- case X86II::MRM_E0: case X86II::MRM_E1: case X86II::MRM_E2:
- case X86II::MRM_E3: case X86II::MRM_E4: case X86II::MRM_E5:
- case X86II::MRM_E8: case X86II::MRM_E9: case X86II::MRM_EA:
- case X86II::MRM_EB: case X86II::MRM_EC: case X86II::MRM_ED:
- case X86II::MRM_EE: case X86II::MRM_F0: case X86II::MRM_F1:
- case X86II::MRM_F2: case X86II::MRM_F3: case X86II::MRM_F4:
- case X86II::MRM_F5: case X86II::MRM_F6: case X86II::MRM_F7:
- case X86II::MRM_F8: case X86II::MRM_F9: case X86II::MRM_FA:
- case X86II::MRM_FB: case X86II::MRM_FC: case X86II::MRM_FD:
- case X86II::MRM_FE: case X86II::MRM_FF:
+ case X86II::MRM_D2: case X86II::MRM_D3: case X86II::MRM_D4:
+ case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D7:
+ case X86II::MRM_D8: case X86II::MRM_D9: case X86II::MRM_DA:
+ case X86II::MRM_DB: case X86II::MRM_DC: case X86II::MRM_DD:
+ case X86II::MRM_DE: case X86II::MRM_DF: case X86II::MRM_E0:
+ case X86II::MRM_E1: case X86II::MRM_E2: case X86II::MRM_E3:
+ case X86II::MRM_E4: case X86II::MRM_E5: case X86II::MRM_E6:
+ case X86II::MRM_E7: case X86II::MRM_E8: case X86II::MRM_E9:
+ case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC:
+ case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_EF:
+ case X86II::MRM_F0: case X86II::MRM_F1: case X86II::MRM_F2:
+ case X86II::MRM_F3: case X86II::MRM_F4: case X86II::MRM_F5:
+ case X86II::MRM_F6: case X86II::MRM_F7: case X86II::MRM_F8:
+ case X86II::MRM_F9: case X86II::MRM_FA: case X86II::MRM_FB:
+ case X86II::MRM_FC: case X86II::MRM_FD: case X86II::MRM_FE:
+ case X86II::MRM_FF:
return -1;
}
}
@@ -740,7 +754,7 @@ namespace X86II {
case X86::R12B: case X86::R13B: case X86::R14B: case X86::R15B:
case X86::CR8: case X86::CR9: case X86::CR10: case X86::CR11:
case X86::CR12: case X86::CR13: case X86::CR14: case X86::CR15:
- return true;
+ return true;
}
return false;
}
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index a33468dc4769..736c39dfb6f1 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -32,9 +32,11 @@ namespace {
X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI,
uint16_t EMachine)
- : MCELFObjectTargetWriter(IsELF64, OSABI, EMachine,
- // Only i386 uses Rel instead of RelA.
- /*HasRelocationAddend*/ EMachine != ELF::EM_386) {}
+ : MCELFObjectTargetWriter(IsELF64, OSABI, EMachine,
+ // Only i386 and IAMCU use Rel instead of RelA.
+ /*HasRelocationAddend*/
+ (EMachine != ELF::EM_386) &&
+ (EMachine != ELF::EM_IAMCU)) {}
X86ELFObjectWriter::~X86ELFObjectWriter()
{}
@@ -246,7 +248,8 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
if (getEMachine() == ELF::EM_X86_64)
return getRelocType64(Modifier, Type, IsPCRel);
- assert(getEMachine() == ELF::EM_386 && "Unsupported ELF machine type.");
+ assert((getEMachine() == ELF::EM_386 || getEMachine() == ELF::EM_IAMCU) &&
+ "Unsupported ELF machine type.");
return getRelocType32(Modifier, getType32(Type), IsPCRel);
}
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
index deaad2a5b8e8..30d5c802d1ed 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
@@ -20,39 +20,42 @@
#include "llvm/MC/MCAsmInfoELF.h"
namespace llvm {
- class Triple;
-
- class X86MCAsmInfoDarwin : public MCAsmInfoDarwin {
- virtual void anchor();
-
- public:
- explicit X86MCAsmInfoDarwin(const Triple &Triple);
- };
-
- struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin {
- explicit X86_64MCAsmInfoDarwin(const Triple &Triple);
- const MCExpr *
- getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
- MCStreamer &Streamer) const override;
- };
-
- class X86ELFMCAsmInfo : public MCAsmInfoELF {
- void anchor() override;
- public:
- explicit X86ELFMCAsmInfo(const Triple &Triple);
- };
-
- class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft {
- void anchor() override;
- public:
- explicit X86MCAsmInfoMicrosoft(const Triple &Triple);
- };
-
- class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF {
- void anchor() override;
- public:
- explicit X86MCAsmInfoGNUCOFF(const Triple &Triple);
- };
+class Triple;
+
+class X86MCAsmInfoDarwin : public MCAsmInfoDarwin {
+ virtual void anchor();
+
+public:
+ explicit X86MCAsmInfoDarwin(const Triple &Triple);
+};
+
+struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin {
+ explicit X86_64MCAsmInfoDarwin(const Triple &Triple);
+ const MCExpr *
+ getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
+ MCStreamer &Streamer) const override;
+};
+
+class X86ELFMCAsmInfo : public MCAsmInfoELF {
+ void anchor() override;
+
+public:
+ explicit X86ELFMCAsmInfo(const Triple &Triple);
+};
+
+class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft {
+ void anchor() override;
+
+public:
+ explicit X86MCAsmInfoMicrosoft(const Triple &Triple);
+};
+
+class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF {
+ void anchor() override;
+
+public:
+ explicit X86MCAsmInfoGNUCOFF(const Triple &Triple);
+};
} // namespace llvm
#endif
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 10c434c8b1b4..dfab6ec10775 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -510,8 +510,8 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
// Otherwise, emit the most general non-SIB encoding: [REG+disp32]
EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS);
- EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), CurByte, OS,
- Fixups);
+ EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte),
+ CurByte, OS, Fixups);
return;
}
@@ -988,6 +988,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
const MCInstrDesc &Desc) {
unsigned REX = 0;
+ bool UsesHighByteReg = false;
+
if (TSFlags & X86II::REX_W)
REX |= 1 << 3; // set REX.W
@@ -1004,6 +1006,8 @@ static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
const MCOperand &MO = MI.getOperand(i);
if (!MO.isReg()) continue;
unsigned Reg = MO.getReg();
+ if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH)
+ UsesHighByteReg = true;
if (!X86II::isX86_64NonExtLowByteReg(Reg)) continue;
// FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything
// that returns non-zero.
@@ -1073,6 +1077,9 @@ static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
}
break;
}
+ if (REX && UsesHighByteReg)
+ report_fatal_error("Cannot encode high byte register in REX-prefixed instruction");
+
return REX;
}
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 83b4091d7665..53a6550acdd5 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -122,7 +122,8 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI,
} else if (TheTriple.isOSBinFormatELF()) {
// Force the use of an ELF container.
MAI = new X86ELFMCAsmInfo(TheTriple);
- } else if (TheTriple.isWindowsMSVCEnvironment()) {
+ } else if (TheTriple.isWindowsMSVCEnvironment() ||
+ TheTriple.isWindowsCoreCLREnvironment()) {
MAI = new X86MCAsmInfoMicrosoft(TheTriple);
} else if (TheTriple.isOSCygMing() ||
TheTriple.isWindowsItaniumEnvironment()) {
@@ -267,3 +268,184 @@ extern "C" void LLVMInitializeX86TargetMC() {
TargetRegistry::RegisterMCAsmBackend(TheX86_64Target,
createX86_64AsmBackend);
}
+
+unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size,
+ bool High) {
+ switch (Size) {
+ default: return 0;
+ case 8:
+ if (High) {
+ switch (Reg) {
+ default: return getX86SubSuperRegisterOrZero(Reg, 64);
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::SI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::DI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::BP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::SP;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AH;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DH;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CH;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BH;
+ }
+ } else {
+ switch (Reg) {
+ default: return 0;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AL;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DL;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CL;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BL;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::SIL;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::DIL;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::BPL;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::SPL;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8B;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9B;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10B;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11B;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12B;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13B;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14B;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15B;
+ }
+ }
+ case 16:
+ switch (Reg) {
+ default: return 0;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::SI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::DI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::BP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::SP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8W;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9W;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10W;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11W;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12W;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13W;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14W;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15W;
+ }
+ case 32:
+ switch (Reg) {
+ default: return 0;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::EAX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::EDX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::ECX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::EBX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::ESI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::EDI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::EBP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::ESP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8D;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9D;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10D;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11D;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12D;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13D;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14D;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15D;
+ }
+ case 64:
+ switch (Reg) {
+ default: return 0;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::RAX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::RDX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::RCX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::RBX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::RSI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::RDI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::RBP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::RSP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15;
+ }
+ }
+}
+
+unsigned llvm::getX86SubSuperRegister(unsigned Reg, unsigned Size, bool High) {
+ unsigned Res = getX86SubSuperRegisterOrZero(Reg, Size, High);
+ assert(Res != 0 && "Unexpected register or VT");
+ return Res;
+}
+
+
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 6221baba1793..2d2836ff07c5 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -79,7 +79,7 @@ MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
/// Takes ownership of \p AB and \p CE.
MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB,
raw_pwrite_stream &OS, MCCodeEmitter *CE,
- bool RelaxAll);
+ bool RelaxAll, bool IncrementalLinkerCompatible);
/// Construct an X86 Mach-O object writer.
MCObjectWriter *createX86MachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
@@ -98,6 +98,17 @@ MCRelocationInfo *createX86_64MachORelocationInfo(MCContext &Ctx);
/// Construct X86-64 ELF relocation info.
MCRelocationInfo *createX86_64ELFRelocationInfo(MCContext &Ctx);
+
+/// Returns the sub or super register of a specific X86 register.
+/// e.g. getX86SubSuperRegister(X86::EAX, 16) returns X86::AX.
+/// Aborts on error.
+unsigned getX86SubSuperRegister(unsigned, unsigned, bool High=false);
+
+/// Returns the sub or super register of a specific X86 register.
+/// Like getX86SubSuperRegister() but returns 0 on error.
+unsigned getX86SubSuperRegisterOrZero(unsigned, unsigned,
+ bool High = false);
+
} // End llvm namespace
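A minimal usage sketch (not part of the patch), assuming the behaviour of the switch added in X86MCTargetDesc.cpp earlier in this diff; the register/size values are only illustrative:

    // getX86SubSuperRegister asserts on unsupported inputs;
    // getX86SubSuperRegisterOrZero returns 0 instead.
    unsigned Reg16 = llvm::getX86SubSuperRegister(X86::EAX, 16);              // X86::AX
    unsigned High8 = llvm::getX86SubSuperRegister(X86::RAX, 8, /*High=*/true); // X86::AH
    unsigned None  = llvm::getX86SubSuperRegisterOrZero(X86::XMM0, 16);        // 0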
diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index 9e801fc8f191..191ebeac7265 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -149,14 +149,19 @@ void X86MachObjectWriter::RecordX86_64Relocation(
// Neither symbol can be modified.
if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
- Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None)
- report_fatal_error("unsupported relocation of modified symbol", false);
+ Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "unsupported relocation of modified symbol");
+ return;
+ }
// We don't support PCrel relocations of differences. Darwin 'as' doesn't
// implement most of these correctly.
- if (IsPCRel)
- report_fatal_error("unsupported pc-relative relocation of difference",
- false);
+ if (IsPCRel) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "unsupported pc-relative relocation of difference");
+ return;
+ }
// The support for the situation where one or both of the symbols would
// require a local relocation is handled just like if the symbols were
@@ -168,16 +173,20 @@ void X86MachObjectWriter::RecordX86_64Relocation(
// Darwin 'as' doesn't emit correct relocations for this (it ends up with a
// single SIGNED relocation); reject it for now. Except the case where both
// symbols don't have a base, equal but both NULL.
- if (A_Base == B_Base && A_Base)
- report_fatal_error("unsupported relocation with identical base", false);
+ if (A_Base == B_Base && A_Base) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "unsupported relocation with identical base");
+ return;
+ }
// A subtraction expression where either symbol is undefined is a
// non-relocatable expression.
if (A->isUndefined() || B->isUndefined()) {
StringRef Name = A->isUndefined() ? A->getName() : B->getName();
- Asm.getContext().reportFatalError(Fixup.getLoc(),
+ Asm.getContext().reportError(Fixup.getLoc(),
"unsupported relocation with subtraction expression, symbol '" +
Name + "' can not be undefined in a subtraction expression");
+ return;
}
Value += Writer->getSymbolAddress(*A, Layout) -
@@ -244,12 +253,16 @@ void X86MachObjectWriter::RecordX86_64Relocation(
FixedValue = Res;
return;
} else {
- report_fatal_error("unsupported relocation of variable '" +
- Symbol->getName() + "'", false);
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "unsupported relocation of variable '" +
+ Symbol->getName() + "'");
+ return;
}
} else {
- report_fatal_error("unsupported relocation of undefined symbol '" +
- Symbol->getName() + "'", false);
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "unsupported relocation of undefined symbol '" +
+ Symbol->getName() + "'");
+ return;
}
MCSymbolRefExpr::VariantKind Modifier = Target.getSymA()->getKind();
@@ -266,8 +279,9 @@ void X86MachObjectWriter::RecordX86_64Relocation(
} else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
Type = MachO::X86_64_RELOC_TLV;
} else if (Modifier != MCSymbolRefExpr::VK_None) {
- report_fatal_error("unsupported symbol modifier in relocation",
- false);
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "unsupported symbol modifier in relocation");
+ return;
} else {
Type = MachO::X86_64_RELOC_SIGNED;
@@ -292,9 +306,12 @@ void X86MachObjectWriter::RecordX86_64Relocation(
}
}
} else {
- if (Modifier != MCSymbolRefExpr::VK_None)
- report_fatal_error("unsupported symbol modifier in branch "
- "relocation", false);
+ if (Modifier != MCSymbolRefExpr::VK_None) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(),
+ "unsupported symbol modifier in branch relocation");
+ return;
+ }
Type = MachO::X86_64_RELOC_BRANCH;
}
@@ -309,16 +326,22 @@ void X86MachObjectWriter::RecordX86_64Relocation(
Type = MachO::X86_64_RELOC_GOT;
IsPCRel = 1;
} else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
- report_fatal_error("TLVP symbol modifier should have been rip-rel",
- false);
- } else if (Modifier != MCSymbolRefExpr::VK_None)
- report_fatal_error("unsupported symbol modifier in relocation", false);
- else {
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "TLVP symbol modifier should have been rip-rel");
+ return;
+ } else if (Modifier != MCSymbolRefExpr::VK_None) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "unsupported symbol modifier in relocation");
+ return;
+ } else {
Type = MachO::X86_64_RELOC_UNSIGNED;
unsigned Kind = Fixup.getKind();
- if (Kind == X86::reloc_signed_4byte)
- report_fatal_error("32-bit absolute addressing is not supported in "
- "64-bit mode", false);
+ if (Kind == X86::reloc_signed_4byte) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(),
+ "32-bit absolute addressing is not supported in 64-bit mode");
+ return;
+ }
}
}
}
@@ -350,10 +373,13 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer,
// See <reloc.h>.
const MCSymbol *A = &Target.getSymA()->getSymbol();
- if (!A->getFragment())
- report_fatal_error("symbol '" + A->getName() +
- "' can not be undefined in a subtraction expression",
- false);
+ if (!A->getFragment()) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(),
+ "symbol '" + A->getName() +
+ "' can not be undefined in a subtraction expression");
+ return false;
+ }
uint32_t Value = Writer->getSymbolAddress(*A, Layout);
uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent());
@@ -363,10 +389,13 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer,
if (const MCSymbolRefExpr *B = Target.getSymB()) {
const MCSymbol *SB = &B->getSymbol();
- if (!SB->getFragment())
- report_fatal_error("symbol '" + B->getSymbol().getName() +
- "' can not be undefined in a subtraction expression",
- false);
+ if (!SB->getFragment()) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(),
+ "symbol '" + B->getSymbol().getName() +
+ "' can not be undefined in a subtraction expression");
+ return false;
+ }
// Select the appropriate difference relocation type.
//
@@ -387,12 +416,12 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer,
if (FixupOffset > 0xffffff) {
char Buffer[32];
format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer));
- Asm.getContext().reportFatalError(Fixup.getLoc(),
+ Asm.getContext().reportError(Fixup.getLoc(),
Twine("Section too large, can't encode "
"r_address (") + Buffer +
") into 24 bits of scattered "
"relocation entry.");
- llvm_unreachable("fatal error returned?!");
+ return false;
}
MachO::any_relocation_info MRE;
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
index 92f42b68ae51..d04511873b46 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
@@ -50,9 +50,11 @@ void X86WinCOFFStreamer::FinishImpl() {
MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB,
raw_pwrite_stream &OS,
- MCCodeEmitter *CE, bool RelaxAll) {
+ MCCodeEmitter *CE, bool RelaxAll,
+ bool IncrementalLinkerCompatible) {
X86WinCOFFStreamer *S = new X86WinCOFFStreamer(C, AB, CE, OS);
S->getAssembler().setRelaxAll(RelaxAll);
+ S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible);
return S;
}
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index cae865a40819..4fdd527d87c8 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -140,13 +140,14 @@ void DecodePALIGNRMask(MVT VT, unsigned Imm,
}
}
-/// DecodePSHUFMask - This decodes the shuffle masks for pshufd, and vpermilp*.
+/// DecodePSHUFMask - This decodes the shuffle masks for pshufw, pshufd, and vpermilp*.
/// VT indicates the type of the vector allowing it to handle different
/// datatypes and vector widths.
void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
unsigned NumElts = VT.getVectorNumElements();
unsigned NumLanes = VT.getSizeInBits() / 128;
+ if (NumLanes == 0) NumLanes = 1; // Handle MMX
unsigned NumLaneElts = NumElts / NumLanes;
unsigned NewImm = Imm;
@@ -191,6 +192,16 @@ void DecodePSHUFLWMask(MVT VT, unsigned Imm,
}
}
+void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumHalfElts = NumElts / 2;
+
+ for (unsigned l = 0; l != NumHalfElts; ++l)
+ ShuffleMask.push_back(l + NumHalfElts);
+ for (unsigned h = 0; h != NumHalfElts; ++h)
+ ShuffleMask.push_back(h);
+}
+
/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
/// the type of the vector allowing it to handle different datatypes and vector
/// widths.
@@ -222,7 +233,7 @@ void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
// Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
// independently on 128-bit lanes.
unsigned NumLanes = VT.getSizeInBits() / 128;
- if (NumLanes == 0 ) NumLanes = 1; // Handle MMX
+ if (NumLanes == 0) NumLanes = 1; // Handle MMX
unsigned NumLaneElts = NumElts / NumLanes;
for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
@@ -253,6 +264,26 @@ void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
}
}
+/// \brief Decode a shuffle of packed values at 128-bit granularity
+/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2)
+/// immediate mask into a shuffle mask.
+void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumElementsInLane = 128 / VT.getScalarSizeInBits();
+ unsigned ControlBitsMask = NumLanes - 1;
+ unsigned NumControlBits = NumLanes / 2;
+
+ for (unsigned l = 0; l != NumLanes; ++l) {
+ unsigned LaneMask = (Imm >> (l * NumControlBits)) & ControlBitsMask;
+ // We actually need the other source.
+ if (l >= NumLanes / 2)
+ LaneMask += NumLanes;
+ for (unsigned i = 0; i != NumElementsInLane; ++i)
+ ShuffleMask.push_back(LaneMask * NumElementsInLane + i);
+ }
+}
+
void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask) {
unsigned HalfSize = VT.getVectorNumElements() / 2;
@@ -277,10 +308,10 @@ void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
// <4 x i32> <i32 -2147483648, i32 -2147483648,
// i32 -2147483648, i32 -2147483648>
+#ifndef NDEBUG
unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
-
- if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512.
- return;
+ assert(MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512);
+#endif
// This is a straightforward byte vector.
if (MaskTy->isVectorTy() && MaskTy->getVectorElementType()->isIntegerTy(8)) {
@@ -290,7 +321,7 @@ void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
for (int i = 0; i < NumElements; ++i) {
// For AVX vectors with 32 bytes the base of the shuffle is the 16-byte
// lane of the vector we're inside.
- int Base = i < 16 ? 0 : 16;
+ int Base = i & ~0xf;
Constant *COp = C->getAggregateElement(i);
if (!COp) {
ShuffleMask.clear();
@@ -357,44 +388,66 @@ void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
}
}
-void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
+void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
+ SmallVectorImpl<int> &ShuffleMask) {
Type *MaskTy = C->getType();
- assert(MaskTy->isVectorTy() && "Expected a vector constant mask!");
- assert(MaskTy->getVectorElementType()->isIntegerTy() &&
- "Expected integer constant mask elements!");
- int ElementBits = MaskTy->getScalarSizeInBits();
- int NumElements = MaskTy->getVectorNumElements();
+  // It is not an error for the mask to not be a vector of the expected type
+  // because the constant pool uniques constants by their bit representation.
+ // e.g. the following take up the same space in the constant pool:
+ // i128 -170141183420855150465331762880109871104
+ //
+ // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160>
+ //
+ // <4 x i32> <i32 -2147483648, i32 -2147483648,
+ // i32 -2147483648, i32 -2147483648>
+
+ unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+
+ if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512.
+ return;
+
+ // Only support vector types.
+ if (!MaskTy->isVectorTy())
+ return;
+
+ // Make sure its an integer type.
+ Type *VecEltTy = MaskTy->getVectorElementType();
+ if (!VecEltTy->isIntegerTy())
+ return;
+
+ // Support any element type from byte up to element size.
+  // This is necessary primarily because 64-bit elements get split to 32-bit
+  // in the constant pool on 32-bit targets.
+ unsigned EltTySize = VecEltTy->getIntegerBitWidth();
+ if (EltTySize < 8 || EltTySize > ElSize)
+ return;
+
+ unsigned NumElements = MaskTySize / ElSize;
assert((NumElements == 2 || NumElements == 4 || NumElements == 8) &&
"Unexpected number of vector elements.");
ShuffleMask.reserve(NumElements);
- if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
- assert((unsigned)NumElements == CDS->getNumElements() &&
- "Constant mask has a different number of elements!");
-
- for (int i = 0; i < NumElements; ++i) {
- int Base = (i * ElementBits / 128) * (128 / ElementBits);
- uint64_t Element = CDS->getElementAsInteger(i);
- // Only the least significant 2 bits of the integer are used.
- int Index = Base + (Element & 0x3);
- ShuffleMask.push_back(Index);
- }
- } else if (auto *CV = dyn_cast<ConstantVector>(C)) {
- assert((unsigned)NumElements == C->getNumOperands() &&
- "Constant mask has a different number of elements!");
-
- for (int i = 0; i < NumElements; ++i) {
- int Base = (i * ElementBits / 128) * (128 / ElementBits);
- Constant *COp = CV->getOperand(i);
- if (isa<UndefValue>(COp)) {
- ShuffleMask.push_back(SM_SentinelUndef);
- continue;
- }
- uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
- // Only the least significant 2 bits of the integer are used.
- int Index = Base + (Element & 0x3);
- ShuffleMask.push_back(Index);
+ unsigned NumElementsPerLane = 128 / ElSize;
+ unsigned Factor = ElSize / EltTySize;
+
+ for (unsigned i = 0; i < NumElements; ++i) {
+ Constant *COp = C->getAggregateElement(i * Factor);
+ if (!COp) {
+ ShuffleMask.clear();
+ return;
+ } else if (isa<UndefValue>(COp)) {
+ ShuffleMask.push_back(SM_SentinelUndef);
+ continue;
}
+ int Index = i & ~(NumElementsPerLane - 1);
+ uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
+ if (ElSize == 64)
+ Index += (Element >> 1) & 0x1;
+ else
+ Index += Element & 0x3;
+ ShuffleMask.push_back(Index);
}
+
+ // TODO: Handle funny-looking vectors too.
}
void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT, SmallVectorImpl<int> &Mask) {
@@ -503,4 +556,74 @@ void DecodeINSERTQIMask(int Len, int Idx,
ShuffleMask.push_back(SM_SentinelUndef);
}
+void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask) {
+ for (int i = 0, e = RawMask.size(); i < e; ++i) {
+ uint64_t M = RawMask[i];
+ ShuffleMask.push_back((int)M);
+ }
+}
+
+void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask) {
+ for (int i = 0, e = RawMask.size(); i < e; ++i) {
+ uint64_t M = RawMask[i];
+ ShuffleMask.push_back((int)M);
+ }
+}
+
+void DecodeVPERMVMask(const Constant *C, MVT VT,
+ SmallVectorImpl<int> &ShuffleMask) {
+ Type *MaskTy = C->getType();
+ if (MaskTy->isVectorTy()) {
+ unsigned NumElements = MaskTy->getVectorNumElements();
+ if (NumElements == VT.getVectorNumElements()) {
+ for (unsigned i = 0; i < NumElements; ++i) {
+ Constant *COp = C->getAggregateElement(i);
+ if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) {
+ ShuffleMask.clear();
+ return;
+ }
+ if (isa<UndefValue>(COp))
+ ShuffleMask.push_back(SM_SentinelUndef);
+ else {
+ uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
+ Element &= (1 << NumElements) - 1;
+ ShuffleMask.push_back(Element);
+ }
+ }
+ }
+ return;
+ }
+ // Scalar value; just broadcast it
+ if (!isa<ConstantInt>(C))
+ return;
+ uint64_t Element = cast<ConstantInt>(C)->getZExtValue();
+ int NumElements = VT.getVectorNumElements();
+ Element &= (1 << NumElements) - 1;
+ for (int i = 0; i < NumElements; ++i)
+ ShuffleMask.push_back(Element);
+}
+
+void DecodeVPERMV3Mask(const Constant *C, MVT VT,
+ SmallVectorImpl<int> &ShuffleMask) {
+ Type *MaskTy = C->getType();
+ unsigned NumElements = MaskTy->getVectorNumElements();
+ if (NumElements == VT.getVectorNumElements()) {
+ for (unsigned i = 0; i < NumElements; ++i) {
+ Constant *COp = C->getAggregateElement(i);
+ if (!COp) {
+ ShuffleMask.clear();
+ return;
+ }
+ if (isa<UndefValue>(COp))
+ ShuffleMask.push_back(SM_SentinelUndef);
+ else {
+ uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
+ Element &= (1 << NumElements*2) - 1;
+ ShuffleMask.push_back(Element);
+ }
+ }
+ }
+}
} // llvm namespace
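A minimal sketch (not part of the patch) of what the new DecodePSWAPMask produces; PSWAPD operates on a 64-bit MMX register viewed as v2i32, so the decoded mask simply swaps the two halves:

    SmallVector<int, 2> Mask;
    DecodePSWAPMask(MVT::v2i32, Mask);   // Mask == {1, 0}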
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h
index 3d10d18e860e..ab18e6438ec9 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -54,6 +54,9 @@ void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
void DecodePSHUFLWMask(MVT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+/// \brief Decodes a PSWAPD 3DNow! instruction.
+void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
/// the type of the vector allowing it to handle different datatypes and vector
/// widths.
@@ -83,12 +86,18 @@ void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask);
+/// \brief Decode a shuffle of packed values at 128-bit granularity
+/// immediate mask into a shuffle mask.
+void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
+
/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
/// No VT provided since it only works on 256-bit, 4 element vectors.
void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
/// \brief Decode a VPERMILP variable mask from an IR-level vector constant.
-void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
+void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
+ SmallVectorImpl<int> &ShuffleMask);
/// \brief Decode a zero extension instruction as a shuffle mask.
void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT,
@@ -108,6 +117,22 @@ void DecodeEXTRQIMask(int Len, int Idx,
/// \brief Decode a SSE4A INSERTQ instruction as a v16i8 shuffle mask.
void DecodeINSERTQIMask(int Len, int Idx,
SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant.
+void DecodeVPERMVMask(const Constant *C, MVT VT,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
+void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a VPERMT2 W/D/Q/PS/PD mask from an IR-level vector constant.
+void DecodeVPERMV3Mask(const Constant *C, MVT VT,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
+void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask,
+ SmallVectorImpl<int> &ShuffleMask);
} // llvm namespace
#endif
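A worked example (not part of the patch; values illustrative) for the decodeVSHUF64x2FamilyMask declaration above, following the loop defined in X86ShuffleDecode.cpp: for a 512-bit v8i64 shuffle each 2-bit immediate field selects one 128-bit lane, and lanes 2-3 are taken from the second source.

    SmallVector<int, 8> Mask;
    decodeVSHUF64x2FamilyMask(MVT::v8i64, 0x44, Mask);
    // Mask == {0, 1, 2, 3, 8, 9, 10, 11}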
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index 8403ae6101df..fbec6626d99d 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -23,56 +23,47 @@ class FunctionPass;
class ImmutablePass;
class X86TargetMachine;
-/// createX86ISelDag - This pass converts a legalized DAG into a
-/// X86-specific DAG, ready for instruction scheduling.
-///
+/// This pass converts a legalized DAG into a X86-specific DAG, ready for
+/// instruction scheduling.
FunctionPass *createX86ISelDag(X86TargetMachine &TM,
CodeGenOpt::Level OptLevel);
-/// createX86GlobalBaseRegPass - This pass initializes a global base
-/// register for PIC on x86-32.
+/// This pass initializes a global base register for PIC on x86-32.
FunctionPass* createX86GlobalBaseRegPass();
-/// createCleanupLocalDynamicTLSPass() - This pass combines multiple accesses
-/// to local-dynamic TLS variables so that the TLS base address for the module
-/// is only fetched once per execution path through the function.
+/// This pass combines multiple accesses to local-dynamic TLS variables so that
+/// the TLS base address for the module is only fetched once per execution path
+/// through the function.
FunctionPass *createCleanupLocalDynamicTLSPass();
-/// createX86FloatingPointStackifierPass - This function returns a pass which
-/// converts floating point register references and pseudo instructions into
-/// floating point stack references and physical instructions.
-///
+/// This function returns a pass which converts floating-point register
+/// references and pseudo instructions into floating-point stack references and
+/// physical instructions.
FunctionPass *createX86FloatingPointStackifierPass();
-/// createX86IssueVZeroUpperPass - This pass inserts AVX vzeroupper instructions
-/// before each call to avoid transition penalty between functions encoded with
-/// AVX and SSE.
+/// This pass inserts AVX vzeroupper instructions before each call to avoid
+/// transition penalty between functions encoded with AVX and SSE.
FunctionPass *createX86IssueVZeroUpperPass();
-/// createX86EmitCodeToMemory - Returns a pass that converts a register
-/// allocated function into raw machine code in a dynamically
-/// allocated chunk of memory.
-///
-FunctionPass *createEmitX86CodeToMemory();
-
-/// createX86PadShortFunctions - Return a pass that pads short functions
-/// with NOOPs. This will prevent a stall when returning on the Atom.
+/// Return a pass that pads short functions with NOOPs.
+/// This will prevent a stall when returning on the Atom.
FunctionPass *createX86PadShortFunctions();
-/// createX86FixupLEAs - Return a a pass that selectively replaces
-/// certain instructions (like add, sub, inc, dec, some shifts,
-/// and some multiplies) by equivalent LEA instructions, in order
-/// to eliminate execution delays in some Atom processors.
+
+/// Return a pass that selectively replaces certain instructions (like add,
+/// sub, inc, dec, some shifts, and some multiplies) by equivalent LEA
+/// instructions, in order to eliminate execution delays in some processors.
FunctionPass *createX86FixupLEAs();
-/// createX86CallFrameOptimization - Return a pass that optimizes
-/// the code-size of x86 call sequences. This is done by replacing
-/// esp-relative movs with pushes.
+/// Return a pass that removes redundant address recalculations.
+FunctionPass *createX86OptimizeLEAs();
+
+/// Return a pass that optimizes the code-size of x86 call sequences. This is
+/// done by replacing esp-relative movs with pushes.
FunctionPass *createX86CallFrameOptimization();
-/// createX86WinEHStatePass - Return an IR pass that inserts EH registration
-/// stack objects and explicit EH state updates. This pass must run after EH
-/// preparation, which does Windows-specific but architecture-neutral
-/// preparation.
+/// Return an IR pass that inserts EH registration stack objects and explicit
+/// EH state updates. This pass must run after EH preparation, which does
+/// Windows-specific but architecture-neutral preparation.
FunctionPass *createX86WinEHStatePass();
/// Return a Machine IR pass that expands X86-specific pseudo
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 852267400bba..8902a8534256 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -37,14 +37,26 @@ def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true",
def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true",
"Support POPCNT instruction">;
+def FeatureFXSR : SubtargetFeature<"fxsr", "HasFXSR", "true",
+ "Support fxsave/fxrestore instructions">;
+
+def FeatureXSAVE : SubtargetFeature<"xsave", "HasXSAVE", "true",
+ "Support xsave instructions">;
+
+def FeatureXSAVEOPT: SubtargetFeature<"xsaveopt", "HasXSAVEOPT", "true",
+ "Support xsaveopt instructions">;
+
+def FeatureXSAVEC : SubtargetFeature<"xsavec", "HasXSAVEC", "true",
+ "Support xsavec instructions">;
+
+def FeatureXSAVES : SubtargetFeature<"xsaves", "HasXSAVES", "true",
+ "Support xsaves instructions">;
-def FeatureMMX : SubtargetFeature<"mmx","X86SSELevel", "MMX",
- "Enable MMX instructions">;
def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
"Enable SSE instructions",
// SSE codegen depends on cmovs, and all
// SSE1+ processors support them.
- [FeatureMMX, FeatureCMOV]>;
+ [FeatureCMOV]>;
def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2",
"Enable SSE2 instructions",
[FeatureSSE1]>;
@@ -60,6 +72,11 @@ def FeatureSSE41 : SubtargetFeature<"sse4.1", "X86SSELevel", "SSE41",
def FeatureSSE42 : SubtargetFeature<"sse4.2", "X86SSELevel", "SSE42",
"Enable SSE 4.2 instructions",
[FeatureSSE41]>;
+// The MMX subtarget feature is separate from the rest of the SSE features
+// because it's important (for odd compatibility reasons) to be able to
+// turn it off explicitly while allowing SSE+ to be on.
+def FeatureMMX : SubtargetFeature<"mmx","X863DNowLevel", "MMX",
+ "Enable MMX instructions">;
def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
"Enable 3DNow! instructions",
[FeatureMMX]>;
@@ -79,16 +96,13 @@ def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
"Bit testing of memory is slow">;
def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
"SHLD instruction is slow">;
-// FIXME: This is a 16-byte (SSE/AVX) feature; we should rename it to make that
-// explicit. Also, it seems this would be the default state for most chips
-// going forward, so it would probably be better to negate the logic and
-// match the 32-byte "slow mem" feature below.
-def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem",
- "IsUAMemFast", "true",
- "Fast unaligned memory access">;
+// FIXME: This should not apply to CPUs that do not have SSE.
+def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16",
+ "IsUAMem16Slow", "true",
+ "Slow unaligned 16-byte memory access">;
def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32",
- "IsUAMem32Slow", "true",
- "Slow unaligned 32-byte memory access">;
+ "IsUAMem32Slow", "true",
+ "Slow unaligned 32-byte memory access">;
def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true",
"Support SSE 4a instructions",
[FeatureSSE3]>;
@@ -120,6 +134,8 @@ def FeatureBWI : SubtargetFeature<"avx512bw", "HasBWI", "true",
def FeatureVLX : SubtargetFeature<"avx512vl", "HasVLX", "true",
"Enable AVX-512 Vector Length eXtensions",
[FeatureAVX512]>;
+def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true",
+ "Enable protection keys">;
def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
"Enable packed carry-less multiplication instructions",
[FeatureSSE2]>;
@@ -168,9 +184,11 @@ def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
"Support PRFCHW instructions">;
def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true",
"Support RDSEED instruction">;
+def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF", "true",
+ "Support LAHF and SAHF instructions">;
def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true",
"Support MPX instructions">;
-def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
+def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
"Use LEA for adjusting the stack pointer">;
def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb",
"HasSlowDivide32", "true",
@@ -181,6 +199,11 @@ def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divw",
def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
"PadShortFunctions", "true",
"Pad short functions">;
+// TODO: This feature ought to be renamed.
+// What it really refers to are CPUs for which certain instructions
+// (which ones besides the example below?) are microcoded.
+// The best examples of this are the memory forms of CALL and PUSH
+// instructions, which should be avoided in favor of a MOV + register CALL/PUSH.
def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect",
"CallRegIndirect", "true",
"Call register indirect">;
@@ -208,278 +231,473 @@ def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM",
class Proc<string Name, list<SubtargetFeature> Features>
: ProcessorModel<Name, GenericModel, Features>;
-def : Proc<"generic", []>;
-def : Proc<"i386", []>;
-def : Proc<"i486", []>;
-def : Proc<"i586", []>;
-def : Proc<"pentium", []>;
-def : Proc<"pentium-mmx", [FeatureMMX]>;
-def : Proc<"i686", []>;
-def : Proc<"pentiumpro", [FeatureCMOV]>;
-def : Proc<"pentium2", [FeatureMMX, FeatureCMOV]>;
-def : Proc<"pentium3", [FeatureSSE1]>;
-def : Proc<"pentium3m", [FeatureSSE1, FeatureSlowBTMem]>;
-def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>;
-def : Proc<"pentium4", [FeatureSSE2]>;
-def : Proc<"pentium4m", [FeatureSSE2, FeatureSlowBTMem]>;
+def : Proc<"generic", [FeatureSlowUAMem16]>;
+def : Proc<"i386", [FeatureSlowUAMem16]>;
+def : Proc<"i486", [FeatureSlowUAMem16]>;
+def : Proc<"i586", [FeatureSlowUAMem16]>;
+def : Proc<"pentium", [FeatureSlowUAMem16]>;
+def : Proc<"pentium-mmx", [FeatureSlowUAMem16, FeatureMMX]>;
+def : Proc<"i686", [FeatureSlowUAMem16]>;
+def : Proc<"pentiumpro", [FeatureSlowUAMem16, FeatureCMOV]>;
+def : Proc<"pentium2", [FeatureSlowUAMem16, FeatureMMX, FeatureCMOV,
+ FeatureFXSR]>;
+def : Proc<"pentium3", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1,
+ FeatureFXSR]>;
+def : Proc<"pentium3m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1,
+ FeatureFXSR, FeatureSlowBTMem]>;
+def : Proc<"pentium-m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2,
+ FeatureFXSR, FeatureSlowBTMem]>;
+def : Proc<"pentium4", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2,
+ FeatureFXSR]>;
+def : Proc<"pentium4m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2,
+ FeatureFXSR, FeatureSlowBTMem]>;
// Intel Core Duo.
def : ProcessorModel<"yonah", SandyBridgeModel,
- [FeatureSSE3, FeatureSlowBTMem]>;
+ [FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, FeatureFXSR,
+ FeatureSlowBTMem]>;
// NetBurst.
-def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>;
-def : Proc<"nocona", [FeatureSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>;
+def : Proc<"prescott",
+ [FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, FeatureFXSR,
+ FeatureSlowBTMem]>;
+def : Proc<"nocona", [
+ FeatureSlowUAMem16,
+ FeatureMMX,
+ FeatureSSE3,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureSlowBTMem
+]>;
// Intel Core 2 Solo/Duo.
-def : ProcessorModel<"core2", SandyBridgeModel,
- [FeatureSSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>;
-def : ProcessorModel<"penryn", SandyBridgeModel,
- [FeatureSSE41, FeatureCMPXCHG16B, FeatureSlowBTMem]>;
+def : ProcessorModel<"core2", SandyBridgeModel, [
+ FeatureSlowUAMem16,
+ FeatureMMX,
+ FeatureSSSE3,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureSlowBTMem,
+ FeatureLAHFSAHF
+]>;
+def : ProcessorModel<"penryn", SandyBridgeModel, [
+ FeatureSlowUAMem16,
+ FeatureMMX,
+ FeatureSSE41,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureSlowBTMem,
+ FeatureLAHFSAHF
+]>;
// Atom CPUs.
class BonnellProc<string Name> : ProcessorModel<Name, AtomModel, [
- ProcIntelAtom,
- FeatureSSSE3,
- FeatureCMPXCHG16B,
- FeatureMOVBE,
- FeatureSlowBTMem,
- FeatureLeaForSP,
- FeatureSlowDivide32,
- FeatureSlowDivide64,
- FeatureCallRegIndirect,
- FeatureLEAUsesAG,
- FeaturePadShortFunctions
- ]>;
+ ProcIntelAtom,
+ FeatureSlowUAMem16,
+ FeatureMMX,
+ FeatureSSSE3,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureMOVBE,
+ FeatureSlowBTMem,
+ FeatureLEAForSP,
+ FeatureSlowDivide32,
+ FeatureSlowDivide64,
+ FeatureCallRegIndirect,
+ FeatureLEAUsesAG,
+ FeaturePadShortFunctions,
+ FeatureLAHFSAHF
+]>;
def : BonnellProc<"bonnell">;
def : BonnellProc<"atom">; // Pin the generic name to the baseline.
class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [
- ProcIntelSLM,
- FeatureSSE42,
- FeatureCMPXCHG16B,
- FeatureMOVBE,
- FeaturePOPCNT,
- FeaturePCLMUL,
- FeatureAES,
- FeatureSlowDivide64,
- FeatureCallRegIndirect,
- FeaturePRFCHW,
- FeatureSlowLEA,
- FeatureSlowIncDec,
- FeatureSlowBTMem,
- FeatureFastUAMem
- ]>;
+ ProcIntelSLM,
+ FeatureMMX,
+ FeatureSSE42,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureMOVBE,
+ FeaturePOPCNT,
+ FeaturePCLMUL,
+ FeatureAES,
+ FeatureSlowDivide64,
+ FeatureCallRegIndirect,
+ FeaturePRFCHW,
+ FeatureSlowLEA,
+ FeatureSlowIncDec,
+ FeatureSlowBTMem,
+ FeatureLAHFSAHF
+]>;
def : SilvermontProc<"silvermont">;
def : SilvermontProc<"slm">; // Legacy alias.
// "Arrandale" along with corei3 and corei5
class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
- FeatureSSE42,
- FeatureCMPXCHG16B,
- FeatureSlowBTMem,
- FeatureFastUAMem,
- FeaturePOPCNT
- ]>;
+ FeatureMMX,
+ FeatureSSE42,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureSlowBTMem,
+ FeaturePOPCNT,
+ FeatureLAHFSAHF
+]>;
def : NehalemProc<"nehalem">;
def : NehalemProc<"corei7">;
// Westmere is a similar machine to nehalem with some additional features.
// Westmere is the corei3/i5/i7 path from nehalem to sandybridge
class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
- FeatureSSE42,
- FeatureCMPXCHG16B,
- FeatureSlowBTMem,
- FeatureFastUAMem,
- FeaturePOPCNT,
- FeatureAES,
- FeaturePCLMUL
- ]>;
+ FeatureMMX,
+ FeatureSSE42,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureSlowBTMem,
+ FeaturePOPCNT,
+ FeatureAES,
+ FeaturePCLMUL,
+ FeatureLAHFSAHF
+]>;
def : WestmereProc<"westmere">;
// SSE is not listed here since llvm treats AVX as a reimplementation of SSE,
// rather than a superset.
class SandyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
- FeatureAVX,
- FeatureCMPXCHG16B,
- FeatureFastUAMem,
- FeatureSlowUAMem32,
- FeaturePOPCNT,
- FeatureAES,
- FeaturePCLMUL
- ]>;
+ FeatureMMX,
+ FeatureAVX,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureSlowBTMem,
+ FeatureSlowUAMem32,
+ FeaturePOPCNT,
+ FeatureAES,
+ FeaturePCLMUL,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureLAHFSAHF
+]>;
def : SandyBridgeProc<"sandybridge">;
def : SandyBridgeProc<"corei7-avx">; // Legacy alias.
class IvyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
- FeatureAVX,
- FeatureCMPXCHG16B,
- FeatureFastUAMem,
- FeatureSlowUAMem32,
- FeaturePOPCNT,
- FeatureAES,
- FeaturePCLMUL,
- FeatureRDRAND,
- FeatureF16C,
- FeatureFSGSBase
- ]>;
+ FeatureMMX,
+ FeatureAVX,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureSlowBTMem,
+ FeatureSlowUAMem32,
+ FeaturePOPCNT,
+ FeatureAES,
+ FeaturePCLMUL,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureRDRAND,
+ FeatureF16C,
+ FeatureFSGSBase,
+ FeatureLAHFSAHF
+]>;
def : IvyBridgeProc<"ivybridge">;
def : IvyBridgeProc<"core-avx-i">; // Legacy alias.
class HaswellProc<string Name> : ProcessorModel<Name, HaswellModel, [
- FeatureAVX2,
- FeatureCMPXCHG16B,
- FeatureFastUAMem,
- FeaturePOPCNT,
- FeatureAES,
- FeaturePCLMUL,
- FeatureRDRAND,
- FeatureF16C,
- FeatureFSGSBase,
- FeatureMOVBE,
- FeatureLZCNT,
- FeatureBMI,
- FeatureBMI2,
- FeatureFMA,
- FeatureRTM,
- FeatureHLE,
- FeatureSlowIncDec
- ]>;
+ FeatureMMX,
+ FeatureAVX2,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureSlowBTMem,
+ FeaturePOPCNT,
+ FeatureAES,
+ FeaturePCLMUL,
+ FeatureRDRAND,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureF16C,
+ FeatureFSGSBase,
+ FeatureMOVBE,
+ FeatureLZCNT,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureFMA,
+ FeatureRTM,
+ FeatureHLE,
+ FeatureSlowIncDec,
+ FeatureLAHFSAHF
+]>;
def : HaswellProc<"haswell">;
def : HaswellProc<"core-avx2">; // Legacy alias.
class BroadwellProc<string Name> : ProcessorModel<Name, HaswellModel, [
- FeatureAVX2,
- FeatureCMPXCHG16B,
- FeatureFastUAMem,
- FeaturePOPCNT,
- FeatureAES,
- FeaturePCLMUL,
- FeatureRDRAND,
- FeatureF16C,
- FeatureFSGSBase,
- FeatureMOVBE,
- FeatureLZCNT,
- FeatureBMI,
- FeatureBMI2,
- FeatureFMA,
- FeatureRTM,
- FeatureHLE,
- FeatureADX,
- FeatureRDSEED,
- FeatureSlowIncDec
- ]>;
+ FeatureMMX,
+ FeatureAVX2,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureSlowBTMem,
+ FeaturePOPCNT,
+ FeatureAES,
+ FeaturePCLMUL,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureRDRAND,
+ FeatureF16C,
+ FeatureFSGSBase,
+ FeatureMOVBE,
+ FeatureLZCNT,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureFMA,
+ FeatureRTM,
+ FeatureHLE,
+ FeatureADX,
+ FeatureRDSEED,
+ FeatureSlowIncDec,
+ FeatureLAHFSAHF
+]>;
def : BroadwellProc<"broadwell">;
// FIXME: define KNL model
-class KnightsLandingProc<string Name> : ProcessorModel<Name, HaswellModel,
- [FeatureAVX512, FeatureERI, FeatureCDI, FeaturePFI,
- FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT,
- FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C,
- FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI,
- FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE,
- FeatureSlowIncDec, FeatureMPX]>;
+class KnightsLandingProc<string Name> : ProcessorModel<Name, HaswellModel, [
+ FeatureMMX,
+ FeatureAVX512,
+ FeatureFXSR,
+ FeatureERI,
+ FeatureCDI,
+ FeaturePFI,
+ FeatureCMPXCHG16B,
+ FeaturePOPCNT,
+ FeatureAES,
+ FeaturePCLMUL,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureRDRAND,
+ FeatureF16C,
+ FeatureFSGSBase,
+ FeatureMOVBE,
+ FeatureLZCNT,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureFMA,
+ FeatureRTM,
+ FeatureHLE,
+ FeatureSlowIncDec,
+ FeatureMPX,
+ FeatureLAHFSAHF
+]>;
def : KnightsLandingProc<"knl">;
// FIXME: define SKX model
-class SkylakeProc<string Name> : ProcessorModel<Name, HaswellModel,
- [FeatureAVX512, FeatureCDI,
- FeatureDQI, FeatureBWI, FeatureVLX,
- FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT,
- FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C,
- FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI,
- FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE,
- FeatureSlowIncDec, FeatureMPX]>;
+class SkylakeProc<string Name> : ProcessorModel<Name, HaswellModel, [
+ FeatureMMX,
+ FeatureAVX512,
+ FeatureFXSR,
+ FeatureCDI,
+ FeatureDQI,
+ FeatureBWI,
+ FeatureVLX,
+ FeaturePKU,
+ FeatureCMPXCHG16B,
+ FeatureSlowBTMem,
+ FeaturePOPCNT,
+ FeatureAES,
+ FeaturePCLMUL,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureRDRAND,
+ FeatureF16C,
+ FeatureFSGSBase,
+ FeatureMOVBE,
+ FeatureLZCNT,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureFMA,
+ FeatureRTM,
+ FeatureHLE,
+ FeatureADX,
+ FeatureRDSEED,
+ FeatureSlowIncDec,
+ FeatureMPX,
+ FeatureXSAVEC,
+ FeatureXSAVES,
+ FeatureLAHFSAHF
+]>;
def : SkylakeProc<"skylake">;
def : SkylakeProc<"skx">; // Legacy alias.
// AMD CPUs.
-def : Proc<"k6", [FeatureMMX]>;
-def : Proc<"k6-2", [Feature3DNow]>;
-def : Proc<"k6-3", [Feature3DNow]>;
-def : Proc<"athlon", [Feature3DNowA, FeatureSlowBTMem,
+def : Proc<"k6", [FeatureSlowUAMem16, FeatureMMX]>;
+def : Proc<"k6-2", [FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"k6-3", [FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"athlon", [FeatureSlowUAMem16, Feature3DNowA,
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"athlon-tbird", [FeatureSlowUAMem16, Feature3DNowA,
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"athlon-4", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA,
+ FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"athlon-xp", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA,
+ FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"athlon-mp", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA,
+ FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"k8", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA,
+ FeatureFXSR, Feature64Bit, FeatureSlowBTMem,
FeatureSlowSHLD]>;
-def : Proc<"athlon-tbird", [Feature3DNowA, FeatureSlowBTMem,
+def : Proc<"opteron", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA,
+ FeatureFXSR, Feature64Bit, FeatureSlowBTMem,
FeatureSlowSHLD]>;
-def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem,
+def : Proc<"athlon64", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA,
+ FeatureFXSR, Feature64Bit, FeatureSlowBTMem,
FeatureSlowSHLD]>;
-def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem,
+def : Proc<"athlon-fx", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA,
+ FeatureFXSR, Feature64Bit, FeatureSlowBTMem,
FeatureSlowSHLD]>;
-def : Proc<"athlon-mp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem,
+def : Proc<"k8-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA,
+ FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem,
FeatureSlowSHLD]>;
-def : Proc<"k8", [FeatureSSE2, Feature3DNowA, Feature64Bit,
- FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"opteron", [FeatureSSE2, Feature3DNowA, Feature64Bit,
- FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"athlon64", [FeatureSSE2, Feature3DNowA, Feature64Bit,
- FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"athlon-fx", [FeatureSSE2, Feature3DNowA, Feature64Bit,
- FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"k8-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
- FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
- FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
- FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"amdfam10", [FeatureSSE4A,
- Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT,
- FeaturePOPCNT, FeatureSlowBTMem,
+def : Proc<"opteron-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA,
+ FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem,
FeatureSlowSHLD]>;
-def : Proc<"barcelona", [FeatureSSE4A,
- Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT,
- FeaturePOPCNT, FeatureSlowBTMem,
+def : Proc<"athlon64-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA,
+ FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem,
FeatureSlowSHLD]>;
+def : Proc<"amdfam10", [FeatureSSE4A, Feature3DNowA, FeatureFXSR,
+ FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT,
+ FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>;
+def : Proc<"barcelona", [FeatureSSE4A, Feature3DNowA, FeatureFXSR,
+ FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT,
+ FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>;
+
// Bobcat
-def : Proc<"btver1", [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B,
- FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT,
- FeatureSlowSHLD]>;
+def : Proc<"btver1", [
+ FeatureMMX,
+ FeatureSSSE3,
+ FeatureSSE4A,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeaturePRFCHW,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureXSAVE,
+ FeatureSlowSHLD,
+ FeatureLAHFSAHF
+]>;
// Jaguar
-def : ProcessorModel<"btver2", BtVer2Model,
- [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B,
- FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
- FeatureBMI, FeatureF16C, FeatureMOVBE,
- FeatureLZCNT, FeaturePOPCNT, FeatureFastUAMem,
- FeatureSlowSHLD]>;
-
-// TODO: We should probably add 'FeatureFastUAMem' to all of the AMD chips.
+def : ProcessorModel<"btver2", BtVer2Model, [
+ FeatureMMX,
+ FeatureAVX,
+ FeatureFXSR,
+ FeatureSSE4A,
+ FeatureCMPXCHG16B,
+ FeaturePRFCHW,
+ FeatureAES,
+ FeaturePCLMUL,
+ FeatureBMI,
+ FeatureF16C,
+ FeatureMOVBE,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureSlowSHLD,
+ FeatureLAHFSAHF
+]>;
// Bulldozer
-def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
- FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
- FeatureAVX, FeatureSSE4A, FeatureLZCNT,
- FeaturePOPCNT, FeatureSlowSHLD]>;
+def : Proc<"bdver1", [
+ FeatureXOP,
+ FeatureFMA4,
+ FeatureCMPXCHG16B,
+ FeatureAES,
+ FeaturePRFCHW,
+ FeaturePCLMUL,
+ FeatureMMX,
+ FeatureAVX,
+ FeatureFXSR,
+ FeatureSSE4A,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureXSAVE,
+ FeatureSlowSHLD,
+ FeatureLAHFSAHF
+]>;
// Piledriver
-def : Proc<"bdver2", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
- FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
- FeatureAVX, FeatureSSE4A, FeatureF16C,
- FeatureLZCNT, FeaturePOPCNT, FeatureBMI,
- FeatureTBM, FeatureFMA, FeatureSlowSHLD]>;
+def : Proc<"bdver2", [
+ FeatureXOP,
+ FeatureFMA4,
+ FeatureCMPXCHG16B,
+ FeatureAES,
+ FeaturePRFCHW,
+ FeaturePCLMUL,
+ FeatureMMX,
+ FeatureAVX,
+ FeatureFXSR,
+ FeatureSSE4A,
+ FeatureF16C,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureXSAVE,
+ FeatureBMI,
+ FeatureTBM,
+ FeatureFMA,
+ FeatureSlowSHLD,
+ FeatureLAHFSAHF
+]>;
// Steamroller
-def : Proc<"bdver3", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
- FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
- FeatureAVX, FeatureSSE4A, FeatureF16C,
- FeatureLZCNT, FeaturePOPCNT, FeatureBMI,
- FeatureTBM, FeatureFMA, FeatureSlowSHLD,
- FeatureFSGSBase]>;
+def : Proc<"bdver3", [
+ FeatureXOP,
+ FeatureFMA4,
+ FeatureCMPXCHG16B,
+ FeatureAES,
+ FeaturePRFCHW,
+ FeaturePCLMUL,
+ FeatureMMX,
+ FeatureAVX,
+ FeatureFXSR,
+ FeatureSSE4A,
+ FeatureF16C,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureXSAVE,
+ FeatureBMI,
+ FeatureTBM,
+ FeatureFMA,
+ FeatureXSAVEOPT,
+ FeatureSlowSHLD,
+ FeatureFSGSBase,
+ FeatureLAHFSAHF
+]>;
// Excavator
-def : Proc<"bdver4", [FeatureAVX2, FeatureXOP, FeatureFMA4,
- FeatureCMPXCHG16B, FeatureAES, FeaturePRFCHW,
- FeaturePCLMUL, FeatureF16C, FeatureLZCNT,
- FeaturePOPCNT, FeatureBMI, FeatureBMI2,
- FeatureTBM, FeatureFMA, FeatureSSE4A,
- FeatureFSGSBase]>;
-
-def : Proc<"geode", [Feature3DNowA]>;
-
-def : Proc<"winchip-c6", [FeatureMMX]>;
-def : Proc<"winchip2", [Feature3DNow]>;
-def : Proc<"c3", [Feature3DNow]>;
-def : Proc<"c3-2", [FeatureSSE1]>;
+def : Proc<"bdver4", [
+ FeatureMMX,
+ FeatureAVX2,
+ FeatureFXSR,
+ FeatureXOP,
+ FeatureFMA4,
+ FeatureCMPXCHG16B,
+ FeatureAES,
+ FeaturePRFCHW,
+ FeaturePCLMUL,
+ FeatureF16C,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureXSAVE,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureTBM,
+ FeatureFMA,
+ FeatureXSAVEOPT,
+ FeatureFSGSBase,
+ FeatureLAHFSAHF
+]>;
+
+def : Proc<"geode", [FeatureSlowUAMem16, Feature3DNowA]>;
+
+def : Proc<"winchip-c6", [FeatureSlowUAMem16, FeatureMMX]>;
+def : Proc<"winchip2", [FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"c3", [FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"c3-2", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, FeatureFXSR]>;
// We also provide a generic 64-bit specific x86 processor model which tries to
// be good for modern chips without enabling instruction set encodings past the
@@ -492,8 +710,8 @@ def : Proc<"c3-2", [FeatureSSE1]>;
// knobs which need to be tuned differently for AMD chips, we might consider
// forming a common base for them.
def : ProcessorModel<"x86-64", SandyBridgeModel,
- [FeatureSSE2, Feature64Bit, FeatureSlowBTMem,
- FeatureFastUAMem]>;
+ [FeatureMMX, FeatureSSE2, FeatureFXSR, Feature64Bit,
+ FeatureSlowBTMem ]>;
//===----------------------------------------------------------------------===//
// Register File Description
@@ -520,10 +738,6 @@ include "X86CallingConv.td"
// Assembly Parser
//===----------------------------------------------------------------------===//
-def ATTAsmParser : AsmParser {
- string AsmParserClassName = "AsmParser";
-}
-
def ATTAsmParserVariant : AsmParserVariant {
int Variant = 0;
@@ -568,7 +782,6 @@ def IntelAsmWriter : AsmWriter {
def X86 : Target {
// Information about the instructions...
let InstructionSet = X86InstrInfo;
- let AssemblyParsers = [ATTAsmParser];
let AssemblyParserVariants = [ATTAsmParserVariant, IntelAsmParserVariant];
let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter];
}
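
The processor definitions above mainly spell out features that were previously implied (MMX, FXSR, XSAVE, LAHF/SAHF) and tag older chips with FeatureSlowUAMem16. As a rough illustration of how such per-CPU feature lists end up being consumed, here is a minimal standalone C++ sketch that models CPUs as feature bitmasks and answers hasFeature-style queries; the enum, table, and helper are made up for illustration and are not the TableGen-generated code.

// Illustrative sketch only: models how explicit per-CPU feature lists end up
// as boolean subtarget queries. Feature names mirror the .td file; the enum
// and CPU table here are hypothetical, not generated TableGen output.
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>

enum Feature : uint64_t {
  MMX = 1ull << 0, SSE1 = 1ull << 1, SSE2 = 1ull << 2, SSE4A = 1ull << 3,
  AVX = 1ull << 4, FXSR = 1ull << 5, XSAVE = 1ull << 6, LAHFSAHF = 1ull << 7,
  SlowUAMem16 = 1ull << 8,
};

static const std::unordered_map<std::string, uint64_t> CPUFeatures = {
  {"btver2", MMX | AVX | FXSR | SSE4A | XSAVE | LAHFSAHF},
  {"c3-2",   SlowUAMem16 | MMX | SSE1 | FXSR},
  {"x86-64", MMX | SSE2 | FXSR},
};

static bool hasFeature(const std::string &CPU, uint64_t F) {
  auto It = CPUFeatures.find(CPU);
  return It != CPUFeatures.end() && (It->second & F) != 0;
}

int main() {
  std::cout << "btver2 has AVX: " << hasFeature("btver2", AVX) << '\n';
  std::cout << "c3-2 slow unaligned 16B: " << hasFeature("c3-2", SlowUAMem16) << '\n';
}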
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index ba33248d2039..2170e62e30fd 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -217,10 +217,10 @@ static void printOperand(X86AsmPrinter &P, const MachineInstr *MI,
if (AsmVariant == 0) O << '%';
unsigned Reg = MO.getReg();
if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
- MVT::SimpleValueType VT = (strcmp(Modifier+6,"64") == 0) ?
- MVT::i64 : ((strcmp(Modifier+6, "32") == 0) ? MVT::i32 :
- ((strcmp(Modifier+6,"16") == 0) ? MVT::i16 : MVT::i8));
- Reg = getX86SubSuperRegister(Reg, VT);
+ unsigned Size = (strcmp(Modifier+6,"64") == 0) ? 64 :
+ (strcmp(Modifier+6,"32") == 0) ? 32 :
+ (strcmp(Modifier+6,"16") == 0) ? 16 : 8;
+ Reg = getX86SubSuperRegister(Reg, Size);
}
O << X86ATTInstPrinter::getRegisterName(Reg);
return;
@@ -361,22 +361,21 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
switch (Mode) {
default: return true; // Unknown mode.
case 'b': // Print QImode register
- Reg = getX86SubSuperRegister(Reg, MVT::i8);
+ Reg = getX86SubSuperRegister(Reg, 8);
break;
case 'h': // Print QImode high register
- Reg = getX86SubSuperRegister(Reg, MVT::i8, true);
+ Reg = getX86SubSuperRegister(Reg, 8, true);
break;
case 'w': // Print HImode register
- Reg = getX86SubSuperRegister(Reg, MVT::i16);
+ Reg = getX86SubSuperRegister(Reg, 16);
break;
case 'k': // Print SImode register
- Reg = getX86SubSuperRegister(Reg, MVT::i32);
+ Reg = getX86SubSuperRegister(Reg, 32);
break;
case 'q':
// Print 64-bit register names if 64-bit integer registers are available.
// Otherwise, print 32-bit register names.
- MVT::SimpleValueType Ty = P.getSubtarget().is64Bit() ? MVT::i64 : MVT::i32;
- Reg = getX86SubSuperRegister(Reg, Ty);
+ Reg = getX86SubSuperRegister(Reg, P.getSubtarget().is64Bit() ? 64 : 32);
break;
}
@@ -535,6 +534,7 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
S, MCConstantExpr::create(int64_t(1), MMI->getContext()));
}
}
+ OutStreamer->EmitSyntaxDirective();
}
static void
@@ -565,10 +565,11 @@ MCSymbol *X86AsmPrinter::GetCPISymbol(unsigned CPID) const {
const MachineConstantPoolEntry &CPE =
MF->getConstantPool()->getConstants()[CPID];
if (!CPE.isMachineConstantPoolEntry()) {
- SectionKind Kind = CPE.getSectionKind(TM.getDataLayout());
+ const DataLayout &DL = MF->getDataLayout();
+ SectionKind Kind = CPE.getSectionKind(&DL);
const Constant *C = CPE.Val.ConstVal;
if (const MCSectionCOFF *S = dyn_cast<MCSectionCOFF>(
- getObjFileLowering().getSectionForConstant(Kind, C))) {
+ getObjFileLowering().getSectionForConstant(DL, Kind, C))) {
if (MCSymbol *Sym = S->getCOMDATSymbol()) {
if (Sym->isUndefined())
OutStreamer->EmitSymbolAttribute(Sym, MCSA_Global);
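
The printOperand/printAsmMRegister changes above switch getX86SubSuperRegister from taking an MVT to taking a register width in bits. The following standalone sketch mirrors the new modifier parsing and maps the width onto a sub-register name; raxSubRegForSize is a hypothetical stand-in covering only the RAX family, not the real helper.

// Sketch of the "subreg" asm-operand modifier parsing after the change:
// the suffix selects a register width in bits rather than an MVT. The
// sub-register table below is a tiny illustrative subset, not LLVM's.
#include <cassert>
#include <cstring>
#include <iostream>

static unsigned modifierToSize(const char *Suffix) {
  return (std::strcmp(Suffix, "64") == 0) ? 64 :
         (std::strcmp(Suffix, "32") == 0) ? 32 :
         (std::strcmp(Suffix, "16") == 0) ? 16 : 8;
}

// Hypothetical stand-in for getX86SubSuperRegister, covering only RAX.
static const char *raxSubRegForSize(unsigned Size) {
  switch (Size) {
  case 64: return "rax";
  case 32: return "eax";
  case 16: return "ax";
  case 8:  return "al";
  }
  assert(false && "unexpected size");
  return nullptr;
}

int main() {
  const char *Modifier = "subreg32";
  unsigned Size = modifierToSize(Modifier + std::strlen("subreg"));
  std::cout << '%' << raxSubRegForSize(Size) << '\n'; // prints %eax
}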
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
index 7f5d127c68d5..9c8bd98dbade 100644
--- a/lib/Target/X86/X86AsmPrinter.h
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -78,8 +78,6 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
// outputting it to the OutStream. This allows the shadow tracker to minimise
// the number of NOPs used for stackmap padding.
void EmitAndCountInstruction(MCInst &Inst);
-
- void InsertStackMapShadows(MachineFunction &MF);
void LowerSTACKMAP(const MachineInstr &MI);
void LowerPATCHPOINT(const MachineInstr &MI, X86MCInstLower &MCIL);
void LowerSTATEPOINT(const MachineInstr &MI, X86MCInstLower &MCIL);
diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp
index 031ba4ba9e66..fc6ee1752f1f 100644
--- a/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -26,6 +26,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
@@ -53,10 +54,13 @@ private:
// Information we know about a particular call site
struct CallContext {
CallContext()
- : Call(nullptr), SPCopy(nullptr), ExpectedDist(0),
- MovVector(4, nullptr), NoStackParams(false), UsePush(false){};
+ : FrameSetup(nullptr), Call(nullptr), SPCopy(nullptr), ExpectedDist(0),
+ MovVector(4, nullptr), NoStackParams(false), UsePush(false){}
- // Actuall call instruction
+ // Iterator referring to the frame setup instruction
+ MachineBasicBlock::iterator FrameSetup;
+
+ // Actual call instruction
MachineInstr *Call;
// A copy of the stack pointer
@@ -75,17 +79,16 @@ private:
bool UsePush;
};
- typedef DenseMap<MachineInstr *, CallContext> ContextMap;
+ typedef SmallVector<CallContext, 8> ContextVector;
bool isLegal(MachineFunction &MF);
- bool isProfitable(MachineFunction &MF, ContextMap &CallSeqMap);
+ bool isProfitable(MachineFunction &MF, ContextVector &CallSeqMap);
void collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, CallContext &Context);
- bool adjustCallSequence(MachineFunction &MF, MachineBasicBlock::iterator I,
- const CallContext &Context);
+ bool adjustCallSequence(MachineFunction &MF, const CallContext &Context);
MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
unsigned Reg);
@@ -100,7 +103,8 @@ private:
const char *getPassName() const override { return "X86 Optimize Call Frame"; }
const TargetInstrInfo *TII;
- const TargetFrameLowering *TFL;
+ const X86FrameLowering *TFL;
+ const X86Subtarget *STI;
const MachineRegisterInfo *MRI;
static char ID;
};
@@ -124,8 +128,15 @@ bool X86CallFrameOptimization::isLegal(MachineFunction &MF) {
// No point in running this in 64-bit mode, since some arguments are
// passed in-register in all common calling conventions, so the pattern
// we're looking for will never match.
- const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
- if (STI.is64Bit())
+ if (STI->is64Bit())
+ return false;
+
+ // We can't encode multiple DW_CFA_GNU_args_size or DW_CFA_def_cfa_offset
+ // in the compact unwind encoding that Darwin uses. So, bail if there
+ // is a danger of that being generated.
+ if (STI->isTargetDarwin() &&
+ (!MF.getMMI().getLandingPads().empty() ||
+ (MF.getFunction()->needsUnwindTableEntry() && !TFL->hasFP(MF))))
return false;
// You would expect straight-line code between call-frame setup and
@@ -161,7 +172,7 @@ bool X86CallFrameOptimization::isLegal(MachineFunction &MF) {
// Check whether this transformation is profitable for a particular
// function - in terms of code size.
bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
- ContextMap &CallSeqMap) {
+ ContextVector &CallSeqVector) {
// This transformation is always a win when we do not expect to have
// a reserved call frame. Under other circumstances, it may be either
// a win or a loss, and requires a heuristic.
@@ -170,24 +181,20 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
return true;
// Don't do this when not optimizing for size.
- bool OptForSize =
- MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) ||
- MF.getFunction()->hasFnAttribute(Attribute::MinSize);
-
- if (!OptForSize)
+ if (!MF.getFunction()->optForSize())
return false;
unsigned StackAlign = TFL->getStackAlignment();
int64_t Advantage = 0;
- for (auto CC : CallSeqMap) {
+ for (auto CC : CallSeqVector) {
// Call sites where no parameters are passed on the stack
// do not affect the cost, since there needs to be no
// stack adjustment.
- if (CC.second.NoStackParams)
+ if (CC.NoStackParams)
continue;
- if (!CC.second.UsePush) {
+ if (!CC.UsePush) {
// If we don't use pushes for a particular call site,
// we pay for not having a reserved call frame with an
// additional sub/add esp pair. The cost is ~3 bytes per instruction,
@@ -200,11 +207,11 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
// We'll need a add after the call.
Advantage -= 3;
// If we have to realign the stack, we'll also need a sub before
- if (CC.second.ExpectedDist % StackAlign)
+ if (CC.ExpectedDist % StackAlign)
Advantage -= 3;
// Now, for each push, we save ~3 bytes. For small constants, we actually
// save more (up to 5 bytes), but 3 should be a good approximation.
- Advantage += (CC.second.ExpectedDist / 4) * 3;
+ Advantage += (CC.ExpectedDist / 4) * 3;
}
}
@@ -212,8 +219,9 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
}
bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
- TII = MF.getSubtarget().getInstrInfo();
- TFL = MF.getSubtarget().getFrameLowering();
+ STI = &MF.getSubtarget<X86Subtarget>();
+ TII = STI->getInstrInfo();
+ TFL = STI->getFrameLowering();
MRI = &MF.getRegInfo();
if (!isLegal(MF))
@@ -223,21 +231,22 @@ bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
- ContextMap CallSeqMap;
+ ContextVector CallSeqVector;
for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
if (I->getOpcode() == FrameSetupOpcode) {
- CallContext &Context = CallSeqMap[I];
+ CallContext Context;
collectCallInfo(MF, *BB, I, Context);
+ CallSeqVector.push_back(Context);
}
- if (!isProfitable(MF, CallSeqMap))
+ if (!isProfitable(MF, CallSeqVector))
return false;
- for (auto CC : CallSeqMap)
- if (CC.second.UsePush)
- Changed |= adjustCallSequence(MF, CC.first, CC.second);
+ for (auto CC : CallSeqVector)
+ if (CC.UsePush)
+ Changed |= adjustCallSequence(MF, CC);
return Changed;
}
@@ -307,13 +316,13 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
// Check that this particular call sequence is amenable to the
// transformation.
const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
- MF.getSubtarget().getRegisterInfo());
- unsigned StackPtr = RegInfo.getStackRegister();
+ STI->getRegisterInfo());
unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
// We expect to enter this at the beginning of a call sequence
assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
MachineBasicBlock::iterator FrameSetup = I++;
+ Context.FrameSetup = FrameSetup;
// How much do we adjust the stack? This puts an upper bound on
// the number of parameters actually passed on it.
@@ -338,7 +347,8 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
if (!I->isCopy() || !I->getOperand(0).isReg())
return;
Context.SPCopy = I++;
- StackPtr = Context.SPCopy->getOperand(0).getReg();
+
+ unsigned StackPtr = Context.SPCopy->getOperand(0).getReg();
// Scan the call setup sequence for the pattern we're looking for.
// We only handle a simple case - a sequence of MOV32mi or MOV32mr
@@ -434,22 +444,22 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
}
bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
- MachineBasicBlock::iterator I,
const CallContext &Context) {
// Ok, we can in fact do the transformation for this call.
// Do not remove the FrameSetup instruction, but adjust the parameters.
// PEI will end up finalizing the handling of this.
- MachineBasicBlock::iterator FrameSetup = I;
- MachineBasicBlock &MBB = *(I->getParent());
+ MachineBasicBlock::iterator FrameSetup = Context.FrameSetup;
+ MachineBasicBlock &MBB = *(FrameSetup->getParent());
FrameSetup->getOperand(1).setImm(Context.ExpectedDist);
- DebugLoc DL = I->getDebugLoc();
+ DebugLoc DL = FrameSetup->getDebugLoc();
// Now, iterate through the vector in reverse order, and replace the movs
// with pushes. MOVmi/MOVmr doesn't have any defs, so no need to
// replace uses.
for (int Idx = (Context.ExpectedDist / 4) - 1; Idx >= 0; --Idx) {
MachineBasicBlock::iterator MOV = *Context.MovVector[Idx];
MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
+ MachineBasicBlock::iterator Push = nullptr;
if (MOV->getOpcode() == X86::MOV32mi) {
unsigned PushOpcode = X86::PUSHi32;
// If the operand is a small (8-bit) immediate, we can use a
@@ -461,21 +471,20 @@ bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
if (isInt<8>(Val))
PushOpcode = X86::PUSH32i8;
}
- BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)).addOperand(PushOp);
+ Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode))
+ .addOperand(PushOp);
} else {
unsigned int Reg = PushOp.getReg();
// If PUSHrmm is not slow on this target, try to fold the source of the
// push into the instruction.
- const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
- bool SlowPUSHrmm = ST.isAtom() || ST.isSLM();
+ bool SlowPUSHrmm = STI->isAtom() || STI->isSLM();
// Check that this is legal to fold. Right now, we're extremely
// conservative about that.
MachineInstr *DefMov = nullptr;
if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
- MachineInstr *Push =
- BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32rmm));
+ Push = BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32rmm));
unsigned NumOps = DefMov->getDesc().getNumOperands();
for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
@@ -483,12 +492,19 @@ bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
DefMov->eraseFromParent();
} else {
- BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32r))
+ Push = BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32r))
.addReg(Reg)
.getInstr();
}
}
+ // For debugging, when using SP-based CFA, we need to adjust the CFA
+ // offset after each push.
+ // TODO: This is needed only if we require precise CFA.
+ if (!TFL->hasFP(MF))
+ TFL->BuildCFI(MBB, std::next(Push), DL,
+ MCCFIInstruction::createAdjustCfaOffset(nullptr, 4));
+
MBB.erase(MOV);
}
@@ -532,13 +548,10 @@ MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
DefMI->getParent() != FrameSetup->getParent())
return nullptr;
- // Now, make sure everything else up until the ADJCALLSTACK is a sequence
- // of MOVs. To be less conservative would require duplicating a lot of the
- // logic from PeepholeOptimizer.
- // FIXME: A possibly better approach would be to teach the PeepholeOptimizer
- // to be smarter about folding into pushes.
+ // Make sure we don't have any instructions between DefMI and the
+ // push that make folding the load illegal.
for (auto I = DefMI; I != FrameSetup; ++I)
- if (I->getOpcode() != X86::MOV32rm)
+ if (I->isLoadFoldBarrier())
return nullptr;
return DefMI;
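
The heuristic in isProfitable() above trades the sub/add ESP pair needed when the call frame is not reserved (roughly 3 bytes each, plus another sub when the adjustment is not stack-aligned) against roughly 3 bytes saved per converted push. A minimal standalone model of that arithmetic, with an illustrative CallSite struct rather than the pass's CallContext, might look like this:

// Standalone model of the size heuristic described in isProfitable():
// numbers follow the comments in the pass (~3 bytes per sub/add/push).
// The CallSite struct is illustrative, not the pass's CallContext.
#include <iostream>
#include <vector>

struct CallSite {
  bool NoStackParams;   // nothing passed on the stack
  bool UsePush;         // call site can be converted to pushes
  int  ExpectedDist;    // bytes of stack-passed arguments
};

static bool isProfitable(const std::vector<CallSite> &Sites, int StackAlign) {
  long Advantage = 0;
  for (const CallSite &CS : Sites) {
    if (CS.NoStackParams)
      continue;                       // no stack adjustment either way
    if (!CS.UsePush) {
      Advantage -= 3;                 // sub esp before the call
      Advantage -= 3;                 // add esp after the call
      if (CS.ExpectedDist % StackAlign)
        Advantage -= 3;               // extra sub to realign
    } else {
      Advantage += (CS.ExpectedDist / 4) * 3; // ~3 bytes saved per push
    }
  }
  return Advantage >= 0;
}

int main() {
  std::vector<CallSite> Sites = {{false, true, 16}, {false, false, 12}};
  std::cout << (isProfitable(Sites, 16) ? "profitable" : "not profitable") << '\n';
}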
diff --git a/lib/Target/X86/X86CallingConv.h b/lib/Target/X86/X86CallingConv.h
index 0eb2494f1d63..a08160f9feba 100644
--- a/lib/Target/X86/X86CallingConv.h
+++ b/lib/Target/X86/X86CallingConv.h
@@ -15,6 +15,7 @@
#ifndef LLVM_LIB_TARGET_X86_X86CALLINGCONV_H
#define LLVM_LIB_TARGET_X86_X86CALLINGCONV_H
+#include "MCTargetDesc/X86MCTargetDesc.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/IR/CallingConv.h"
@@ -42,6 +43,64 @@ inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
return false;
}
+inline bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ // This is similar to CCAssignToReg<[EAX, EDX, ECX]>, but makes sure
+ // not to split i64 and double between a register and stack
+ static const MCPhysReg RegList[] = {X86::EAX, X86::EDX, X86::ECX};
+ static const unsigned NumRegs = sizeof(RegList)/sizeof(RegList[0]);
+
+ SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+ // If this is the first part of a double/i64/i128, or if we're already
+ // in the middle of a split, add to the pending list. If this is not
+ // the end of the split, return; otherwise, go on to process the pending
+ // list.
+ if (ArgFlags.isSplit() || !PendingMembers.empty()) {
+ PendingMembers.push_back(
+ CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+ if (!ArgFlags.isSplitEnd())
+ return true;
+ }
+
+ // If there are no pending members, we are not in the middle of a split,
+ // so do the usual inreg stuff.
+ if (PendingMembers.empty()) {
+ if (unsigned Reg = State.AllocateReg(RegList)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true;
+ }
+ return false;
+ }
+
+ assert(ArgFlags.isSplitEnd());
+
+ // We now have the entire original argument in PendingMembers, so decide
+ // whether to use registers or the stack.
+ // Per the MCU ABI:
+ // a) To use registers, we need to have enough of them free to contain
+ // the entire argument.
+ // b) We never want to use more than 2 registers for a single argument.
+
+ unsigned FirstFree = State.getFirstUnallocated(RegList);
+ bool UseRegs = PendingMembers.size() <= std::min(2U, NumRegs - FirstFree);
+
+ for (auto &It : PendingMembers) {
+ if (UseRegs)
+ It.convertToReg(State.AllocateReg(RegList[FirstFree++]));
+ else
+ It.convertToMem(State.AllocateStack(4, 4));
+ State.addLoc(It);
+ }
+
+ PendingMembers.clear();
+
+ return true;
+}
+
} // End llvm namespace
#endif
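
CC_X86_32_MCUInReg above keeps a split i64/double/i128 together: the pending pieces are assigned to EAX/EDX/ECX only when the whole value fits in the registers still free and needs at most two of them; otherwise every piece goes to the stack. A standalone sketch of that decision, with simplified types and 32-bit pieces assumed:

// Standalone model of the MCU in-register rule from CC_X86_32_MCUInReg:
// an argument split into N 32-bit pieces uses registers only if all N fit
// in the remaining EAX/EDX/ECX slots and N <= 2; otherwise it all goes to
// the stack. Names below are illustrative.
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

static const char *const RegList[] = {"EAX", "EDX", "ECX"};
static const unsigned NumRegs = sizeof(RegList) / sizeof(RegList[0]);

// FirstFree: index of the first unallocated register.
static std::vector<std::string> assignSplit(unsigned NumPieces, unsigned FirstFree) {
  std::vector<std::string> Locs;
  bool UseRegs = NumPieces <= std::min(2u, NumRegs - FirstFree);
  for (unsigned i = 0; i != NumPieces; ++i)
    Locs.push_back(UseRegs ? RegList[FirstFree + i] : "stack");
  return Locs;
}

int main() {
  for (const std::string &L : assignSplit(2, 0)) // i64 with all regs free
    std::cout << L << ' ';                       // EAX EDX
  std::cout << '\n';
  for (const std::string &L : assignSplit(2, 2)) // i64 with only ECX left
    std::cout << L << ' ';                       // stack stack
  std::cout << '\n';
}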
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index 8f88888f5ce3..54d88cbb244e 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -158,6 +158,7 @@ def RetCC_X86_64_C : CallingConv<[
// The X86-64 calling convention always returns FP values in XMM0.
CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>,
CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>,
+ CCIfType<[f128], CCAssignToReg<[XMM0, XMM1]>>,
// MMX vector types are always returned in XMM0.
CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>,
@@ -202,6 +203,16 @@ def RetCC_X86_64_AnyReg : CallingConv<[
CCCustom<"CC_X86_AnyReg_Error">
]>;
+// X86-64 HHVM return-value convention.
+def RetCC_X86_64_HHVM: CallingConv<[
+ // Promote all types to i64
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // Return: could return in any GP register except RSP and R12.
+ CCIfType<[i64], CCAssignToReg<[RBX, RBP, RDI, RSI, RDX, RCX, R8, R9,
+ RAX, R10, R11, R13, R14, R15]>>
+]>;
+
// This is the root return-value convention for the X86-32 backend.
def RetCC_X86_32 : CallingConv<[
// If FastCC, use RetCC_X86_32_Fast.
@@ -227,6 +238,9 @@ def RetCC_X86_64 : CallingConv<[
CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>,
CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>,
+ // Handle HHVM calls.
+ CCIfCC<"CallingConv::HHVM", CCDelegateTo<RetCC_X86_64_HHVM>>,
+
// Mingw64 and native Win64 use Win64 CC
CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>,
@@ -280,7 +294,7 @@ def CC_X86_64_C : CallingConv<[
CCIfType<[v64i1], CCPromoteToType<v64i8>>,
// The first 8 FP/Vector arguments are passed in XMM registers.
- CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfType<[f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
CCIfSubtarget<"hasSSE1()",
CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>,
@@ -305,7 +319,7 @@ def CC_X86_64_C : CallingConv<[
// Long doubles get stack slots whose size and alignment depends on the
// subtarget.
- CCIfType<[f80], CCAssignToStack<0, 0>>,
+ CCIfType<[f80, f128], CCAssignToStack<0, 0>>,
// Vectors get 16-byte stack slots that are 16-byte aligned.
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
@@ -319,6 +333,23 @@ def CC_X86_64_C : CallingConv<[
CCAssignToStack<64, 64>>
]>;
+// Calling convention for X86-64 HHVM.
+def CC_X86_64_HHVM : CallingConv<[
+ // Use all/any GP registers for args, except RSP.
+ CCIfType<[i64], CCAssignToReg<[RBX, R12, RBP, R15,
+ RDI, RSI, RDX, RCX, R8, R9,
+ RAX, R10, R11, R13, R14]>>
+]>;
+
+// Calling convention for helper functions in HHVM.
+def CC_X86_64_HHVM_C : CallingConv<[
+ // Pass the first argument in RBP.
+ CCIfType<[i64], CCAssignToReg<[RBP]>>,
+
+ // Otherwise it's the same as the regular C calling convention.
+ CCDelegateTo<CC_X86_64_C>
+]>;
+
// Calling convention used on Win64
def CC_X86_Win64_C : CallingConv<[
// FIXME: Handle byval stuff.
@@ -561,6 +592,23 @@ def CC_X86_32_C : CallingConv<[
CCDelegateTo<CC_X86_32_Common>
]>;
+def CC_X86_32_MCU : CallingConv<[
+ // Handles byval parameters. Note that, like FastCC, we can't rely on
+ // the delegation to CC_X86_32_Common because that happens after code that
+ // puts arguments in registers.
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ // Promote i1/i8/i16 arguments to i32.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ // If the call is not a vararg call, some arguments may be passed
+ // in integer registers.
+ CCIfNotVarArg<CCIfType<[i32], CCCustom<"CC_X86_32_MCUInReg">>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
def CC_X86_32_FastCall : CallingConv<[
// Promote i1/i8/i16 arguments to i32.
CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
@@ -708,18 +756,28 @@ def CC_Intel_OCL_BI : CallingConv<[
CCDelegateTo<CC_X86_32_C>
]>;
+def CC_X86_32_Intr : CallingConv<[
+ CCAssignToStack<4, 4>
+]>;
+
+def CC_X86_64_Intr : CallingConv<[
+ CCAssignToStack<8, 8>
+]>;
+
//===----------------------------------------------------------------------===//
// X86 Root Argument Calling Conventions
//===----------------------------------------------------------------------===//
// This is the root argument convention for the X86-32 backend.
def CC_X86_32 : CallingConv<[
+ CCIfSubtarget<"isTargetMCU()", CCDelegateTo<CC_X86_32_MCU>>,
CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>,
CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_32_VectorCall>>,
CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>,
CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>,
+ CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_32_Intr>>,
// Otherwise, drop to normal X86-32 CC
CCDelegateTo<CC_X86_32_C>
@@ -734,6 +792,9 @@ def CC_X86_64 : CallingConv<[
CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>,
CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>,
CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win64_VectorCall>>,
+ CCIfCC<"CallingConv::HHVM", CCDelegateTo<CC_X86_64_HHVM>>,
+ CCIfCC<"CallingConv::HHVM_C", CCDelegateTo<CC_X86_64_HHVM_C>>,
+ CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_64_Intr>>,
// Mingw64 and native Win64 use Win64 CC
CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
@@ -764,6 +825,12 @@ def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>;
def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15,
(sequence "XMM%u", 6, 15))>;
+// The function used by Darwin to obtain the address of a thread-local variable
+// uses rdi to pass a single parameter and rax for the return value. All other
+// GPRs are preserved.
+def CSR_64_TLS_Darwin : CalleeSavedRegs<(add CSR_64, RCX, RDX, RSI,
+ R8, R9, R10, R11)>;
+
// All GPRs - except r11
def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI,
R8, R9, R10, RSP)>;
@@ -778,6 +845,11 @@ def CSR_64_MostRegs : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10,
R11, R12, R13, R14, R15, RBP,
(sequence "XMM%u", 0, 15))>;
+def CSR_32_AllRegs : CalleeSavedRegs<(add EAX, EBX, ECX, EDX, EBP, ESI,
+ EDI, ESP)>;
+def CSR_32_AllRegs_SSE : CalleeSavedRegs<(add CSR_32_AllRegs,
+ (sequence "XMM%u", 0, 7))>;
+
def CSR_64_AllRegs : CalleeSavedRegs<(add CSR_64_MostRegs, RAX, RSP,
(sequence "XMM%u", 16, 31))>;
def CSR_64_AllRegs_AVX : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX, RSP,
@@ -804,3 +876,6 @@ def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64,
def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RDI, RSI, R14, R15,
(sequence "ZMM%u", 16, 31),
K4, K5, K6, K7)>;
+
+// Only R12 is preserved for PHP calls in HHVM.
+def CSR_64_HHVM : CalleeSavedRegs<(add R12)>;
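
The HHVM convention introduced above passes i64 arguments in nearly every GPR (everything except RSP) and, per CSR_64_HHVM, preserves only R12 across PHP calls. The snippet below just restates those register sets as plain data for illustration; it is not the generated calling-convention code.

// Illustrative summary of the HHVM convention added above: i64 arguments can
// live in any GPR except RSP, and only R12 survives a PHP call.
#include <iostream>
#include <string>
#include <vector>

int main() {
  const std::vector<std::string> HHVMArgRegs = {
      "RBX", "R12", "RBP", "R15", "RDI", "RSI", "RDX", "RCX",
      "R8",  "R9",  "RAX", "R10", "R11", "R13", "R14"};
  const std::vector<std::string> HHVMCalleeSaved = {"R12"};

  std::cout << "HHVM argument registers: ";
  for (const std::string &R : HHVMArgRegs)
    std::cout << R << ' ';
  std::cout << "\npreserved across a PHP call: ";
  for (const std::string &R : HHVMCalleeSaved)
    std::cout << R << ' ';
  std::cout << '\n';
}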
diff --git a/lib/Target/X86/X86CompilationCallback_Win64.asm b/lib/Target/X86/X86CompilationCallback_Win64.asm
deleted file mode 100644
index 69b4c71651d7..000000000000
--- a/lib/Target/X86/X86CompilationCallback_Win64.asm
+++ /dev/null
@@ -1,68 +0,0 @@
-;;===-- X86CompilationCallback_Win64.asm - Implement Win64 JIT callback ---===
-;;
-;; The LLVM Compiler Infrastructure
-;;
-;; This file is distributed under the University of Illinois Open Source
-;; License. See LICENSE.TXT for details.
-;;
-;;===----------------------------------------------------------------------===
-;;
-;; This file implements the JIT interfaces for the X86 target.
-;;
-;;===----------------------------------------------------------------------===
-
-extrn LLVMX86CompilationCallback2: PROC
-
-.code
-X86CompilationCallback proc
- push rbp
-
- ; Save RSP.
- mov rbp, rsp
-
- ; Save all int arg registers
- ; WARNING: We cannot use register spill area - we're generating stubs by hands!
- push rcx
- push rdx
- push r8
- push r9
-
- ; Align stack on 16-byte boundary.
- and rsp, -16
-
- ; Save all XMM arg registers. Also allocate reg spill area.
- sub rsp, 96
- movaps [rsp +32], xmm0
- movaps [rsp+16+32], xmm1
- movaps [rsp+32+32], xmm2
- movaps [rsp+48+32], xmm3
-
- ; JIT callee
-
- ; Pass prev frame and return address.
- mov rcx, rbp
- mov rdx, qword ptr [rbp+8]
- call LLVMX86CompilationCallback2
-
- ; Restore all XMM arg registers.
- movaps xmm3, [rsp+48+32]
- movaps xmm2, [rsp+32+32]
- movaps xmm1, [rsp+16+32]
- movaps xmm0, [rsp +32]
-
- ; Restore RSP.
- mov rsp, rbp
-
- ; Restore all int arg registers
- sub rsp, 32
- pop r9
- pop r8
- pop rdx
- pop rcx
-
- ; Restore RBP.
- pop rbp
- ret
-X86CompilationCallback endp
-
-End
diff --git a/lib/Target/X86/X86ExpandPseudo.cpp b/lib/Target/X86/X86ExpandPseudo.cpp
index 6a5a28e546f2..a09d06519376 100644
--- a/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/lib/Target/X86/X86ExpandPseudo.cpp
@@ -19,9 +19,10 @@
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
-#include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved.
+#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved.
#include "llvm/IR/GlobalValue.h"
using namespace llvm;
@@ -141,6 +142,24 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
// The EH_RETURN pseudo is really removed during the MC Lowering.
return true;
}
+ case X86::IRET: {
+ // Adjust stack to erase error code
+ int64_t StackAdj = MBBI->getOperand(0).getImm();
+ X86FL->emitSPUpdate(MBB, MBBI, StackAdj, true);
+ // Replace pseudo with machine iret
+ BuildMI(MBB, MBBI, DL,
+ TII->get(STI->is64Bit() ? X86::IRET64 : X86::IRET32));
+ MBB.erase(MBBI);
+ return true;
+ }
+ case X86::EH_RESTORE: {
+ // Restore ESP and EBP, and optionally ESI if required.
+ bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(
+ MBB.getParent()->getFunction()->getPersonalityFn()));
+ X86FL->restoreWin32EHStackPointers(MBB, MBBI, DL, /*RestoreSP=*/IsSEH);
+ MBBI->eraseFromParent();
+ return true;
+ }
}
llvm_unreachable("Previous switch has a fallthrough?");
}
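
The new X86::IRET pseudo above carries the number of error-code bytes to discard in operand 0; expansion bumps the stack pointer by that amount and then emits the concrete IRET32 or IRET64 for the current mode. A minimal standalone model of that expansion decision (the string output is purely descriptive, not real assembly):

// Minimal model of the X86::IRET pseudo expansion added above: pop the
// error-code bytes recorded in operand 0, then pick the concrete iret for
// the current mode. The PseudoIRET struct is illustrative only.
#include <cstdint>
#include <iostream>
#include <string>

struct PseudoIRET {
  int64_t StackAdj; // bytes of error code to discard before returning
};

static std::string expandIRET(const PseudoIRET &MI, bool Is64Bit) {
  std::string Out;
  if (MI.StackAdj)
    Out += "SP += " + std::to_string(MI.StackAdj) + " (drop error code); ";
  Out += Is64Bit ? "IRET64" : "IRET32";
  return Out;
}

int main() {
  std::cout << expandIRET({8}, /*Is64Bit=*/true) << '\n';
}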
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index b4319c8bb04f..de94a138d865 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -298,8 +298,8 @@ bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
return false;
// Make sure nothing is in the way
- BasicBlock::const_iterator Start = I;
- BasicBlock::const_iterator End = II;
+ BasicBlock::const_iterator Start(I);
+ BasicBlock::const_iterator End(II);
for (auto Itr = std::prev(Start); Itr != End; --Itr) {
// We only expect extractvalue instructions between the intrinsic and the
// instruction to be selected.
@@ -433,6 +433,11 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
X86AddressMode &AM,
MachineMemOperand *MMO, bool Aligned) {
+ bool HasSSE2 = Subtarget->hasSSE2();
+ bool HasSSE4A = Subtarget->hasSSE4A();
+ bool HasAVX = Subtarget->hasAVX();
+ bool IsNonTemporal = MMO && MMO->isNonTemporal();
+
// Get opcode and regclass of the output for the given store instruction.
unsigned Opc = 0;
switch (VT.getSimpleVT().SimpleTy) {
@@ -449,35 +454,59 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
// FALLTHROUGH, handling i1 as i8.
case MVT::i8: Opc = X86::MOV8mr; break;
case MVT::i16: Opc = X86::MOV16mr; break;
- case MVT::i32: Opc = X86::MOV32mr; break;
- case MVT::i64: Opc = X86::MOV64mr; break; // Must be in x86-64 mode.
+ case MVT::i32:
+ Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTImr : X86::MOV32mr;
+ break;
+ case MVT::i64:
+ // Must be in x86-64 mode.
+ Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTI_64mr : X86::MOV64mr;
+ break;
case MVT::f32:
- Opc = X86ScalarSSEf32 ?
- (Subtarget->hasAVX() ? X86::VMOVSSmr : X86::MOVSSmr) : X86::ST_Fp32m;
+ if (X86ScalarSSEf32) {
+ if (IsNonTemporal && HasSSE4A)
+ Opc = X86::MOVNTSS;
+ else
+ Opc = HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
+ } else
+ Opc = X86::ST_Fp32m;
break;
case MVT::f64:
- Opc = X86ScalarSSEf64 ?
- (Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m;
+ if (X86ScalarSSEf64) {
+ if (IsNonTemporal && HasSSE4A)
+ Opc = X86::MOVNTSD;
+ else
+ Opc = HasAVX ? X86::VMOVSDmr : X86::MOVSDmr;
+ } else
+ Opc = X86::ST_Fp64m;
break;
case MVT::v4f32:
- if (Aligned)
- Opc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
- else
- Opc = Subtarget->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr;
+ if (Aligned) {
+ if (IsNonTemporal)
+ Opc = HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr;
+ else
+ Opc = HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr;
+ } else
+ Opc = HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr;
break;
case MVT::v2f64:
- if (Aligned)
- Opc = Subtarget->hasAVX() ? X86::VMOVAPDmr : X86::MOVAPDmr;
- else
- Opc = Subtarget->hasAVX() ? X86::VMOVUPDmr : X86::MOVUPDmr;
+ if (Aligned) {
+ if (IsNonTemporal)
+ Opc = HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr;
+ else
+ Opc = HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr;
+ } else
+ Opc = HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr;
break;
case MVT::v4i32:
case MVT::v2i64:
case MVT::v8i16:
case MVT::v16i8:
- if (Aligned)
- Opc = Subtarget->hasAVX() ? X86::VMOVDQAmr : X86::MOVDQAmr;
- else
+ if (Aligned) {
+ if (IsNonTemporal)
+ Opc = HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr;
+ else
+ Opc = HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr;
+ } else
Opc = Subtarget->hasAVX() ? X86::VMOVDQUmr : X86::MOVDQUmr;
break;
}
@@ -1069,12 +1098,11 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
RetRegs.push_back(VA.getLocReg());
}
- // The x86-64 ABI for returning structs by value requires that we copy
- // the sret argument into %rax for the return. We saved the argument into
- // a virtual register in the entry block, so now we copy the value out
- // and into %rax. We also do the same with %eax for Win32.
- if (F.hasStructRetAttr() &&
- (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
+ // All x86 ABIs require that for returning structs by value we copy
+ // the sret argument into %rax/%eax (depending on ABI) for the return.
+ // We saved the argument into a virtual register in the entry block,
+ // so now we copy the value out and into %rax/%eax.
+ if (F.hasStructRetAttr()) {
unsigned Reg = X86MFInfo->getSRetReturnReg();
assert(Reg &&
"SRetReturnReg should have been set in LowerFormalArguments()!");
@@ -1431,17 +1459,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
.addMBB(TrueMBB);
}
- // Obtain the branch weight and add the TrueBB to the successor list.
- uint32_t BranchWeight = 0;
- if (FuncInfo.BPI)
- BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
- TrueMBB->getBasicBlock());
- FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
-
- // Emits an unconditional branch to the FalseBB, obtains the branch
- // weight, and adds it to the successor list.
- fastEmitBranch(FalseMBB, DbgLoc);
-
+ finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
return true;
}
} else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
@@ -1472,12 +1490,8 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc))
.addMBB(TrueMBB);
- fastEmitBranch(FalseMBB, DbgLoc);
- uint32_t BranchWeight = 0;
- if (FuncInfo.BPI)
- BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
- TrueMBB->getBasicBlock());
- FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
+
+ finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
return true;
}
}
@@ -1492,12 +1506,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
.addMBB(TrueMBB);
- fastEmitBranch(FalseMBB, DbgLoc);
- uint32_t BranchWeight = 0;
- if (FuncInfo.BPI)
- BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
- TrueMBB->getBasicBlock());
- FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
+ finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
return true;
}
@@ -1511,12 +1520,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
.addReg(OpReg).addImm(1);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1))
.addMBB(TrueMBB);
- fastEmitBranch(FalseMBB, DbgLoc);
- uint32_t BranchWeight = 0;
- if (FuncInfo.BPI)
- BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
- TrueMBB->getBasicBlock());
- FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
+ finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
return true;
}
@@ -1945,6 +1949,9 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
unsigned ResultReg;
if (Subtarget->hasAVX()) {
+ const TargetRegisterClass *FR32 = &X86::FR32RegClass;
+ const TargetRegisterClass *VR128 = &X86::VR128RegClass;
+
// If we have AVX, create 1 blendv instead of 3 logic instructions.
// Blendv was introduced with SSE 4.1, but the 2 register form implicitly
// uses XMM0 as the selection register. That may need just as many
@@ -1955,10 +1962,13 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
unsigned BlendOpcode =
(RetVT.SimpleTy == MVT::f32) ? X86::VBLENDVPSrr : X86::VBLENDVPDrr;
- unsigned CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill,
+ unsigned CmpReg = fastEmitInst_rri(CmpOpcode, FR32, CmpLHSReg, CmpLHSIsKill,
CmpRHSReg, CmpRHSIsKill, CC);
- ResultReg = fastEmitInst_rrr(BlendOpcode, RC, RHSReg, RHSIsKill,
- LHSReg, LHSIsKill, CmpReg, true);
+ unsigned VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill,
+ LHSReg, LHSIsKill, CmpReg, true);
+ ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg);
} else {
unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
CmpRHSReg, CmpRHSIsKill, CC);
@@ -2806,10 +2816,12 @@ static unsigned computeBytesPoppedByCallee(const X86Subtarget *Subtarget,
if (CC == CallingConv::Fast || CC == CallingConv::GHC ||
CC == CallingConv::HiPE)
return 0;
- if (CS && !CS->paramHasAttr(1, Attribute::StructRet))
- return 0;
- if (CS && CS->paramHasAttr(1, Attribute::InReg))
- return 0;
+
+ if (CS)
+ if (CS->arg_empty() || !CS->paramHasAttr(1, Attribute::StructRet) ||
+ CS->paramHasAttr(1, Attribute::InReg) || Subtarget->isTargetMCU())
+ return 0;
+
return 4;
}
@@ -2924,7 +2936,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);
// Get a count of how many bytes are to be pushed on the stack.
- unsigned NumBytes = CCInfo.getNextStackOffset();
+ unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
// Issue CALLSEQ_START
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
@@ -3020,8 +3032,8 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()];
unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
- MachinePointerInfo::getStack(LocMemOffset), MachineMemOperand::MOStore,
- ArgVT.getStoreSize(), Alignment);
+ MachinePointerInfo::getStack(*FuncInfo.MF, LocMemOffset),
+ MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
if (Flags.isByVal()) {
X86AddressMode SrcAM;
SrcAM.Base.Reg = ArgReg;
@@ -3252,6 +3264,30 @@ X86FastISel::fastSelectInstruction(const Instruction *I) {
updateValueMap(I, Reg);
return true;
}
+ case Instruction::BitCast: {
+ // Select SSE2/AVX bitcasts between 128/256 bit vector types.
+ if (!Subtarget->hasSSE2())
+ return false;
+
+ EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
+ EVT DstVT = TLI.getValueType(DL, I->getType());
+
+ if (!SrcVT.isSimple() || !DstVT.isSimple())
+ return false;
+
+ if (!SrcVT.is128BitVector() &&
+ !(Subtarget->hasAVX() && SrcVT.is256BitVector()))
+ return false;
+
+ unsigned Reg = getRegForValue(I->getOperand(0));
+ if (Reg == 0)
+ return false;
+
+ // No instruction is needed for conversion. Reuse the register used by
+ // the first operand.
+ updateValueMap(I, Reg);
+ return true;
+ }
}
return false;
@@ -3384,8 +3420,8 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
TII.get(Opc), ResultReg);
addDirectMem(MIB, AddrReg);
MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
- MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad,
- DL.getPointerSize(), Align);
+ MachinePointerInfo::getConstantPool(*FuncInfo.MF),
+ MachineMemOperand::MOLoad, DL.getPointerSize(), Align);
MIB->addMemOperand(*FuncInfo.MF, MMO);
return ResultReg;
}
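
X86FastEmitStore above now prefers non-temporal store opcodes when the memory operand is marked non-temporal, falling back to the usual aligned/unaligned forms otherwise. The sketch below reproduces just the v4f32 arm of that selection as a standalone function returning the opcode name; the opcode names match the diff, the function itself is illustrative.

// Standalone model of the v4f32 arm of X86FastEmitStore's opcode choice
// after the change: non-temporal aligned stores get MOVNTPS/VMOVNTPS,
// plain aligned stores MOVAPS/VMOVAPS, unaligned stores MOVUPS/VMOVUPS.
// Returns the opcode name as a string for illustration.
#include <iostream>
#include <string>

static std::string selectV4F32StoreOpc(bool Aligned, bool IsNonTemporal,
                                        bool HasAVX) {
  if (Aligned) {
    if (IsNonTemporal)
      return HasAVX ? "VMOVNTPSmr" : "MOVNTPSmr";
    return HasAVX ? "VMOVAPSmr" : "MOVAPSmr";
  }
  return HasAVX ? "VMOVUPSmr" : "MOVUPSmr";
}

int main() {
  std::cout << selectV4F32StoreOpc(true, true, false) << '\n';  // MOVNTPSmr
  std::cout << selectV4F32StoreOpc(true, false, true) << '\n';  // VMOVAPSmr
  std::cout << selectV4F32StoreOpc(false, true, true) << '\n';  // VMOVUPSmr
}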
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 5eb4faeedff4..1dd69e8a6a5f 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -9,6 +9,7 @@
//
// This file defines the pass that finds instructions that can be
// re-written as LEA instructions in order to reduce pipeline delays.
+// When optimizing for size it replaces suitable LEAs with INC or DEC.
//
//===----------------------------------------------------------------------===//
@@ -61,6 +62,11 @@ class FixupLEAPass : public MachineFunctionPass {
void processInstructionForSLM(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI);
+ /// \brief Look for LEAs that add 1 to reg or subtract 1 from reg
+ /// and convert them to INC or DEC respectively.
+ bool fixupIncDec(MachineBasicBlock::iterator &I,
+ MachineFunction::iterator MFI) const;
+
/// \brief Determine if an instruction references a machine register
/// and, if so, whether it reads or writes the register.
RegUsageState usesRegister(MachineOperand &p, MachineBasicBlock::iterator I);
@@ -89,6 +95,8 @@ public:
private:
MachineFunction *MF;
const X86InstrInfo *TII; // Machine instruction info.
+ bool OptIncDec;
+ bool OptLEA;
};
char FixupLEAPass::ID = 0;
}
@@ -150,7 +158,10 @@ FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); }
bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
MF = &Func;
const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>();
- if (!ST.LEAusesAG() && !ST.slowLEA())
+ OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize();
+ OptLEA = ST.LEAusesAG() || ST.slowLEA();
+
+ if (!OptLEA && !OptIncDec)
return false;
TII = ST.getInstrInfo();
@@ -187,7 +198,7 @@ FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) {
static inline bool getPreviousInstr(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI) {
if (I == MFI->begin()) {
- if (MFI->isPredecessor(MFI)) {
+ if (MFI->isPredecessor(&*MFI)) {
I = --MFI->end();
return true;
} else
@@ -222,6 +233,60 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I,
return nullptr;
}
+static inline bool isLEA(const int opcode) {
+ return opcode == X86::LEA16r || opcode == X86::LEA32r ||
+ opcode == X86::LEA64r || opcode == X86::LEA64_32r;
+}
+
+/// isLEASimpleIncOrDec - Does this LEA have one of these forms:
+/// lea %reg, 1(%reg)
+/// lea %reg, -1(%reg)
+static inline bool isLEASimpleIncOrDec(MachineInstr *LEA) {
+ unsigned SrcReg = LEA->getOperand(1 + X86::AddrBaseReg).getReg();
+ unsigned DstReg = LEA->getOperand(0).getReg();
+ unsigned AddrDispOp = 1 + X86::AddrDisp;
+ return SrcReg == DstReg &&
+ LEA->getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
+ LEA->getOperand(1 + X86::AddrSegmentReg).getReg() == 0 &&
+ LEA->getOperand(AddrDispOp).isImm() &&
+ (LEA->getOperand(AddrDispOp).getImm() == 1 ||
+ LEA->getOperand(AddrDispOp).getImm() == -1);
+}
+
+bool FixupLEAPass::fixupIncDec(MachineBasicBlock::iterator &I,
+ MachineFunction::iterator MFI) const {
+ MachineInstr *MI = I;
+ int Opcode = MI->getOpcode();
+ if (!isLEA(Opcode))
+ return false;
+
+ if (isLEASimpleIncOrDec(MI) && TII->isSafeToClobberEFLAGS(*MFI, I)) {
+ int NewOpcode;
+ bool isINC = MI->getOperand(4).getImm() == 1;
+ switch (Opcode) {
+ case X86::LEA16r:
+ NewOpcode = isINC ? X86::INC16r : X86::DEC16r;
+ break;
+ case X86::LEA32r:
+ case X86::LEA64_32r:
+ NewOpcode = isINC ? X86::INC32r : X86::DEC32r;
+ break;
+ case X86::LEA64r:
+ NewOpcode = isINC ? X86::INC64r : X86::DEC64r;
+ break;
+ }
+
+ MachineInstr *NewMI =
+ BuildMI(*MFI, I, MI->getDebugLoc(), TII->get(NewOpcode))
+ .addOperand(MI->getOperand(0))
+ .addOperand(MI->getOperand(1));
+ MFI->erase(I);
+ I = static_cast<MachineBasicBlock::iterator>(NewMI);
+ return true;
+ }
+ return false;
+}
+
void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI) {
// Process a load, store, or LEA instruction.
@@ -265,8 +330,7 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI) {
MachineInstr *MI = I;
const int opcode = MI->getOpcode();
- if (opcode != X86::LEA16r && opcode != X86::LEA32r && opcode != X86::LEA64r &&
- opcode != X86::LEA64_32r)
+ if (!isLEA(opcode))
return;
if (MI->getOperand(5).getReg() != 0 || !MI->getOperand(4).isImm() ||
!TII->isSafeToClobberEFLAGS(*MFI, I))
@@ -280,7 +344,8 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
return;
int addrr_opcode, addri_opcode;
switch (opcode) {
- default: llvm_unreachable("Unexpected LEA instruction");
+ default:
+ llvm_unreachable("Unexpected LEA instruction");
case X86::LEA16r:
addrr_opcode = X86::ADD16rr;
addri_opcode = X86::ADD16ri;
@@ -330,10 +395,16 @@ bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
MachineFunction::iterator MFI) {
for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) {
- if (MF.getSubtarget<X86Subtarget>().isSLM())
- processInstructionForSLM(I, MFI);
- else
- processInstruction(I, MFI);
+ if (OptIncDec)
+ if (fixupIncDec(I, MFI))
+ continue;
+
+ if (OptLEA) {
+ if (MF.getSubtarget<X86Subtarget>().isSLM())
+ processInstructionForSLM(I, MFI);
+ else
+ processInstruction(I, MFI);
+ }
}
return false;
}
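
fixupIncDec above rewrites "lea 1(%reg), %reg" and "lea -1(%reg), %reg" into INC/DEC when that is expected to be cheaper (or when optimizing for minimum size) and EFLAGS may safely be clobbered. Here is a standalone model of the isLEASimpleIncOrDec() shape check, using a simplified LEA description instead of MachineInstr:

// Standalone model of the isLEASimpleIncOrDec() check added above: the LEA
// must write its own base register, have no index or segment, and add a
// displacement of exactly +1 or -1. The LEADesc struct is illustrative.
#include <iostream>

struct LEADesc {
  unsigned DstReg;
  unsigned BaseReg;
  unsigned IndexReg;   // 0 = none
  unsigned SegmentReg; // 0 = none
  long long Disp;
};

static bool isSimpleIncOrDec(const LEADesc &LEA) {
  return LEA.DstReg == LEA.BaseReg && LEA.IndexReg == 0 &&
         LEA.SegmentReg == 0 && (LEA.Disp == 1 || LEA.Disp == -1);
}

int main() {
  LEADesc Inc  = {/*Dst=*/1, /*Base=*/1, 0, 0, 1};  // lea 1(%reg), %reg
  LEADesc Addr = {/*Dst=*/1, /*Base=*/2, 3, 0, 8};  // real address computation
  std::cout << isSimpleIncOrDec(Inc) << ' ' << isSimpleIncOrDec(Addr) << '\n';
}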
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 40b9c8a863a3..97bb8ab653a6 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -120,12 +120,10 @@ namespace {
// Return a bitmask of FP registers in block's live-in list.
static unsigned calcLiveInMask(MachineBasicBlock *MBB) {
unsigned Mask = 0;
- for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(),
- E = MBB->livein_end(); I != E; ++I) {
- unsigned Reg = *I;
- if (Reg < X86::FP0 || Reg > X86::FP6)
+ for (const auto &LI : MBB->liveins()) {
+ if (LI.PhysReg < X86::FP0 || LI.PhysReg > X86::FP6)
continue;
- Mask |= 1 << (Reg - X86::FP0);
+ Mask |= 1 << (LI.PhysReg - X86::FP0);
}
return Mask;
}
@@ -301,8 +299,9 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) {
bool FPIsUsed = false;
static_assert(X86::FP6 == X86::FP0+6, "Register enums aren't sorted right!");
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
for (unsigned i = 0; i <= 6; ++i)
- if (MF.getRegInfo().isPhysRegUsed(X86::FP0+i)) {
+ if (!MRI.reg_nodbg_empty(X86::FP0 + i)) {
FPIsUsed = true;
break;
}
@@ -321,7 +320,7 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) {
// Process the function in depth first order so that we process at least one
// of the predecessors for every reachable block in the function.
SmallPtrSet<MachineBasicBlock*, 8> Processed;
- MachineBasicBlock *Entry = MF.begin();
+ MachineBasicBlock *Entry = &MF.front();
bool Changed = false;
for (MachineBasicBlock *BB : depth_first_ext(Entry, Processed))
@@ -329,9 +328,9 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) {
// Process any unreachable blocks in arbitrary order now.
if (MF.size() != Processed.size())
- for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
- if (Processed.insert(BB).second)
- Changed |= processBasicBlock(MF, *BB);
+ for (MachineBasicBlock &BB : MF)
+ if (Processed.insert(&BB).second)
+ Changed |= processBasicBlock(MF, BB);
LiveBundles.clear();
@@ -348,13 +347,12 @@ void FPS::bundleCFG(MachineFunction &MF) {
LiveBundles.resize(Bundles->getNumBundles());
// Gather the actual live-in masks for all MBBs.
- for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
- MachineBasicBlock *MBB = I;
- const unsigned Mask = calcLiveInMask(MBB);
+ for (MachineBasicBlock &MBB : MF) {
+ const unsigned Mask = calcLiveInMask(&MBB);
if (!Mask)
continue;
// Update MBB ingoing bundle mask.
- LiveBundles[Bundles->getBundle(MBB->getNumber(), false)].Mask |= Mask;
+ LiveBundles[Bundles->getBundle(MBB.getNumber(), false)].Mask |= Mask;
}
}
@@ -546,17 +544,9 @@ namespace {
};
}
-#ifndef NDEBUG
-static bool TableIsSorted(const TableEntry *Table, unsigned NumEntries) {
- for (unsigned i = 0; i != NumEntries-1; ++i)
- if (!(Table[i] < Table[i+1])) return false;
- return true;
-}
-#endif
-
-static int Lookup(const TableEntry *Table, unsigned N, unsigned Opcode) {
- const TableEntry *I = std::lower_bound(Table, Table+N, Opcode);
- if (I != Table+N && I->from == Opcode)
+static int Lookup(ArrayRef<TableEntry> Table, unsigned Opcode) {
+ const TableEntry *I = std::lower_bound(Table.begin(), Table.end(), Opcode);
+ if (I != Table.end() && I->from == Opcode)
return I->to;
return -1;
}
@@ -567,7 +557,7 @@ static int Lookup(const TableEntry *Table, unsigned N, unsigned Opcode) {
#define ASSERT_SORTED(TABLE) \
{ static bool TABLE##Checked = false; \
if (!TABLE##Checked) { \
- assert(TableIsSorted(TABLE, array_lengthof(TABLE)) && \
+ assert(std::is_sorted(std::begin(TABLE), std::end(TABLE)) && \
"All lookup tables must be sorted for efficient access!"); \
TABLE##Checked = true; \
} \
@@ -746,7 +736,7 @@ static const TableEntry OpcodeTable[] = {
static unsigned getConcreteOpcode(unsigned Opcode) {
ASSERT_SORTED(OpcodeTable);
- int Opc = Lookup(OpcodeTable, array_lengthof(OpcodeTable), Opcode);
+ int Opc = Lookup(OpcodeTable, Opcode);
assert(Opc != -1 && "FP Stack instruction not in OpcodeTable!");
return Opc;
}
@@ -797,7 +787,7 @@ void FPS::popStackAfter(MachineBasicBlock::iterator &I) {
RegMap[Stack[--StackTop]] = ~0; // Update state
// Check to see if there is a popping version of this instruction...
- int Opcode = Lookup(PopTable, array_lengthof(PopTable), I->getOpcode());
+ int Opcode = Lookup(PopTable, I->getOpcode());
if (Opcode != -1) {
I->setDesc(TII->get(Opcode));
if (Opcode == X86::UCOM_FPPr)
@@ -1193,7 +1183,7 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
// We decide which form to use based on what is on the top of the stack, and
// which operand is killed by this instruction.
- const TableEntry *InstTable;
+ ArrayRef<TableEntry> InstTable;
bool isForward = TOS == Op0;
bool updateST0 = (TOS == Op0 && !KillsOp1) || (TOS == Op1 && !KillsOp0);
if (updateST0) {
@@ -1208,8 +1198,7 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
InstTable = ReverseSTiTable;
}
- int Opcode = Lookup(InstTable, array_lengthof(ForwardST0Table),
- MI->getOpcode());
+ int Opcode = Lookup(InstTable, MI->getOpcode());
assert(Opcode != -1 && "Unknown TwoArgFP pseudo instruction!");
// NotTOS - The register which is not on the top of stack...
@@ -1520,31 +1509,6 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
return;
}
- case X86::WIN_FTOL_32:
- case X86::WIN_FTOL_64: {
- // Push the operand into ST0.
- MachineOperand &Op = MI->getOperand(0);
- assert(Op.isUse() && Op.isReg() &&
- Op.getReg() >= X86::FP0 && Op.getReg() <= X86::FP6);
- unsigned FPReg = getFPReg(Op);
- if (Op.isKill())
- moveToTop(FPReg, Inst);
- else
- duplicateToTop(FPReg, ScratchFPReg, Inst);
-
- // Emit the call. This will pop the operand.
- BuildMI(*MBB, Inst, MI->getDebugLoc(), TII->get(X86::CALLpcrel32))
- .addExternalSymbol("_ftol2")
- .addReg(X86::ST0, RegState::ImplicitKill)
- .addReg(X86::ECX, RegState::ImplicitDefine)
- .addReg(X86::EAX, RegState::Define | RegState::Implicit)
- .addReg(X86::EDX, RegState::Define | RegState::Implicit)
- .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
- --StackTop;
-
- break;
- }
-
case X86::RETQ:
case X86::RETL:
case X86::RETIL:
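
The X86FloatingPoint changes above switch Lookup() to ArrayRef and replace the hand-rolled sortedness check with std::is_sorted. The same pattern in standalone form, on a made-up from->to table:

// Standalone version of the table-lookup pattern used above: a from->to
// table kept sorted by 'from', checked with std::is_sorted and searched
// with std::lower_bound. The table contents here are made up.
#include <algorithm>
#include <cassert>
#include <iostream>

struct TableEntry {
  unsigned from;
  unsigned to;
  bool operator<(const TableEntry &RHS) const { return from < RHS.from; }
  bool operator<(unsigned Opcode) const { return from < Opcode; }
};

static int Lookup(const TableEntry *Begin, const TableEntry *End,
                  unsigned Opcode) {
  const TableEntry *I = std::lower_bound(Begin, End, Opcode);
  if (I != End && I->from == Opcode)
    return static_cast<int>(I->to);
  return -1;
}

int main() {
  static const TableEntry Table[] = {{10, 100}, {20, 200}, {30, 300}};
  assert(std::is_sorted(std::begin(Table), std::end(Table)) &&
         "lookup tables must be sorted");
  std::cout << Lookup(std::begin(Table), std::end(Table), 20) << '\n'; // 200
  std::cout << Lookup(std::begin(Table), std::end(Table), 25) << '\n'; // -1
}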
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 3a21b57f0157..242d0333ef9a 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -18,25 +18,23 @@
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/Debug.h"
#include <cstdlib>
using namespace llvm;
-// FIXME: completely move here.
-extern cl::opt<bool> ForceStackAlign;
-
X86FrameLowering::X86FrameLowering(const X86Subtarget &STI,
unsigned StackAlignOverride)
: TargetFrameLowering(StackGrowsDown, StackAlignOverride,
@@ -80,6 +78,27 @@ X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const {
MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
}
+/// usesTheStack - This function checks if any of the users of EFLAGS
+/// copies the EFLAGS. We know that the code that lowers COPY of EFLAGS has
+/// to use the stack, and if we don't adjust the stack we clobber the first
+/// frame index.
+/// See X86InstrInfo::copyPhysReg.
+static bool usesTheStack(const MachineFunction &MF) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // Conservatively assume that inline assembly might use the stack.
+ if (MF.hasInlineAsm())
+ return true;
+
+ return any_of(MRI.reg_instructions(X86::EFLAGS),
+ [](const MachineInstr &RI) { return RI.isCopy(); });
+}
+
+static bool doesStackUseImplyFP(const MachineFunction &MF) {
+ bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+ return IsWin64Prologue && usesTheStack(MF);
+}
+
/// hasFP - Return true if the specified function should have a dedicated frame
/// pointer register. This is true if the function has variable sized allocas
/// or if frame pointer elimination is disabled.
@@ -92,8 +111,9 @@ bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
MFI->hasVarSizedObjects() ||
MFI->isFrameAddressTaken() || MFI->hasOpaqueSPAdjustment() ||
MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
- MMI.callsUnwindInit() || MMI.callsEHReturn() ||
- MFI->hasStackMap() || MFI->hasPatchPoint());
+ MMI.callsUnwindInit() || MMI.hasEHFunclets() || MMI.callsEHReturn() ||
+ MFI->hasStackMap() || MFI->hasPatchPoint() ||
+ doesStackUseImplyFP(MF));
}
static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) {
@@ -148,21 +168,14 @@ static unsigned getLEArOpcode(unsigned IsLP64) {
/// to this register without worry about clobbering it.
static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- const TargetRegisterInfo *TRI,
+ const X86RegisterInfo *TRI,
bool Is64Bit) {
const MachineFunction *MF = MBB.getParent();
const Function *F = MF->getFunction();
if (!F || MF->getMMI().callsEHReturn())
return 0;
- static const uint16_t CallerSavedRegs32Bit[] = {
- X86::EAX, X86::EDX, X86::ECX, 0
- };
-
- static const uint16_t CallerSavedRegs64Bit[] = {
- X86::RAX, X86::RDX, X86::RCX, X86::RSI, X86::RDI,
- X86::R8, X86::R9, X86::R10, X86::R11, 0
- };
+ const TargetRegisterClass &AvailableRegs = *TRI->getGPRsForTailCall(*MF);
unsigned Opc = MBBI->getOpcode();
switch (Opc) {
@@ -191,10 +204,9 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
Uses.insert(*AI);
}
- const uint16_t *CS = Is64Bit ? CallerSavedRegs64Bit : CallerSavedRegs32Bit;
- for (; *CS; ++CS)
- if (!Uses.count(*CS))
- return *CS;
+ for (auto CS : AvailableRegs)
+ if (!Uses.count(CS) && CS != X86::RIP)
+ return CS;
}
}
@@ -214,8 +226,12 @@ static bool isEAXLiveIn(MachineFunction &MF) {
return false;
}
-/// Check whether or not the terminators of \p MBB needs to read EFLAGS.
-static bool terminatorsNeedFlagsAsInput(const MachineBasicBlock &MBB) {
+/// Check if the flags need to be preserved before the terminators.
+/// This is the case if EFLAGS is live into the region composed of the
+/// terminators, or live out of that region without being defined by a
+/// terminator.
+static bool
+flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) {
for (const MachineInstr &MI : MBB.terminators()) {
bool BreakNext = false;
for (const MachineOperand &MO : MI.operands()) {
@@ -225,15 +241,27 @@ static bool terminatorsNeedFlagsAsInput(const MachineBasicBlock &MBB) {
if (Reg != X86::EFLAGS)
continue;
- // This terminator needs an eflag that is not defined
- // by a previous terminator.
+ // This terminator needs an eflags value that is not defined
+ // by a previous terminator:
+ // EFLAGS is live-in of the region composed by the terminators.
if (!MO.isDef())
return true;
+ // This terminator defines the eflags, i.e., we don't need to preserve it.
+ // However, we still need to check this specific terminator does not
+ // read a live-in value.
BreakNext = true;
}
+ // We found a definition of the eflags, no need to preserve them.
if (BreakNext)
- break;
+ return false;
}
+
+ // None of the terminators use or define the eflags.
+ // Check if they are live-out, that would imply we need to preserve them.
+ for (const MachineBasicBlock *Succ : MBB.successors())
+ if (Succ->isLiveIn(X86::EFLAGS))
+ return true;
+
return false;
}
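
flagsNeedToBePreservedBeforeTheTerminators above answers one question: do the terminators (or a successor) read an EFLAGS value produced before them? A read before any def means yes, a def means no, and if EFLAGS is untouched the answer falls back to successor live-ins. A standalone model with simplified types:

// Standalone model of flagsNeedToBePreservedBeforeTheTerminators(): scan
// the terminators in order; a read of EFLAGS before any write means the
// incoming flags are needed, a write means they are not, and if nothing
// touches EFLAGS the answer depends on successor live-ins. Types below are
// simplified stand-ins for MachineInstr/MachineBasicBlock.
#include <iostream>
#include <vector>

struct Terminator {
  bool ReadsEFLAGS;
  bool WritesEFLAGS;
};

static bool flagsNeedToBePreserved(const std::vector<Terminator> &Terminators,
                                   bool EFLAGSLiveIntoAnySuccessor) {
  for (const Terminator &T : Terminators) {
    if (T.ReadsEFLAGS)
      return true;                 // live into the terminator region
    if (T.WritesEFLAGS)
      return false;                // redefined before any later read
  }
  // No terminator uses or defines EFLAGS; preserve them only if a
  // successor expects them live.
  return EFLAGSLiveIntoAnySuccessor;
}

int main() {
  std::vector<Terminator> CondBranch = {{true, false}};  // jne reads flags
  std::vector<Terminator> PlainJump  = {{false, false}}; // jmp
  std::cout << flagsNeedToBePreserved(CondBranch, false) << ' '
            << flagsNeedToBePreserved(PlainJump, false) << '\n'; // 1 0
}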
@@ -289,6 +317,8 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
.addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub));
if (isSub)
MI->setFlag(MachineInstr::FrameSetup);
+ else
+ MI->setFlag(MachineInstr::FrameDestroy);
Offset -= ThisVal;
continue;
}
@@ -298,6 +328,8 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue);
if (isSub)
MI.setMIFlag(MachineInstr::FrameSetup);
+ else
+ MI.setMIFlag(MachineInstr::FrameDestroy);
Offset -= ThisVal;
}
@@ -312,7 +344,11 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
// is tricky.
bool UseLEA;
if (!InEpilogue) {
- UseLEA = STI.useLeaForSP();
+ // Check if inserting the prologue at the beginning
+ // of MBB would require using LEA operations.
+ // We need to use LEA operations if EFLAGS is live in, because
+ // it means an instruction will read it before it gets defined.
+ UseLEA = STI.useLeaForSP() || MBB.isLiveIn(X86::EFLAGS);
} else {
// If we can use LEA for SP but we shouldn't, check that none
// of the terminators uses the eflags. Otherwise we will insert
@@ -321,10 +357,10 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
// and is an optimization anyway.
UseLEA = canUseLEAForSPInEpilogue(*MBB.getParent());
if (UseLEA && !STI.useLeaForSP())
- UseLEA = terminatorsNeedFlagsAsInput(MBB);
+ UseLEA = flagsNeedToBePreservedBeforeTheTerminators(MBB);
// If that assert breaks, that means we do not do the right thing
// in canUseAsEpilogue.
- assert((UseLEA || !terminatorsNeedFlagsAsInput(MBB)) &&
+ assert((UseLEA || !flagsNeedToBePreservedBeforeTheTerminators(MBB)) &&
"We shouldn't have allowed this insertion point");
}
@@ -347,30 +383,6 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
return MI;
}
-/// mergeSPUpdatesUp - Merge two stack-manipulating instructions upper iterator.
-static
-void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
- unsigned StackPtr, uint64_t *NumBytes = nullptr) {
- if (MBBI == MBB.begin()) return;
-
- MachineBasicBlock::iterator PI = std::prev(MBBI);
- unsigned Opc = PI->getOpcode();
- if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
- Opc == X86::ADD32ri || Opc == X86::ADD32ri8 ||
- Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
- PI->getOperand(0).getReg() == StackPtr) {
- if (NumBytes)
- *NumBytes += PI->getOperand(2).getImm();
- MBB.erase(PI);
- } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
- Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
- PI->getOperand(0).getReg() == StackPtr) {
- if (NumBytes)
- *NumBytes -= PI->getOperand(2).getImm();
- MBB.erase(PI);
- }
-}
-
int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
bool doMergeWithPrevious) const {
@@ -436,27 +448,265 @@ X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
}
}
-/// usesTheStack - This function checks if any of the users of EFLAGS
-/// copies the EFLAGS. We know that the code that lowers COPY of EFLAGS has
-/// to use the stack, and if we don't adjust the stack we clobber the first
-/// frame index.
-/// See X86InstrInfo::copyPhysReg.
-static bool usesTheStack(const MachineFunction &MF) {
- const MachineRegisterInfo &MRI = MF.getRegInfo();
+MachineInstr *X86FrameLowering::emitStackProbe(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL,
+ bool InProlog) const {
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ if (STI.isTargetWindowsCoreCLR()) {
+ if (InProlog) {
+ return emitStackProbeInlineStub(MF, MBB, MBBI, DL, true);
+ } else {
+ return emitStackProbeInline(MF, MBB, MBBI, DL, false);
+ }
+ } else {
+ return emitStackProbeCall(MF, MBB, MBBI, DL, InProlog);
+ }
+}
- for (MachineRegisterInfo::reg_instr_iterator
- ri = MRI.reg_instr_begin(X86::EFLAGS), re = MRI.reg_instr_end();
- ri != re; ++ri)
- if (ri->isCopy())
- return true;
+void X86FrameLowering::inlineStackProbe(MachineFunction &MF,
+ MachineBasicBlock &PrologMBB) const {
+ const StringRef ChkStkStubSymbol = "__chkstk_stub";
+ MachineInstr *ChkStkStub = nullptr;
- return false;
+ for (MachineInstr &MI : PrologMBB) {
+ if (MI.isCall() && MI.getOperand(0).isSymbol() &&
+ ChkStkStubSymbol == MI.getOperand(0).getSymbolName()) {
+ ChkStkStub = &MI;
+ break;
+ }
+ }
+
+ if (ChkStkStub != nullptr) {
+ MachineBasicBlock::iterator MBBI = std::next(ChkStkStub->getIterator());
+ assert(std::prev(MBBI).operator==(ChkStkStub) &&
+ "MBBI expected after __chkstk_stub.");
+ DebugLoc DL = PrologMBB.findDebugLoc(MBBI);
+ emitStackProbeInline(MF, PrologMBB, MBBI, DL, true);
+ ChkStkStub->eraseFromParent();
+ }
}
-void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- DebugLoc DL) const {
+MachineInstr *X86FrameLowering::emitStackProbeInline(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const {
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ assert(STI.is64Bit() && "different expansion needed for 32 bit");
+ assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR");
+ const TargetInstrInfo &TII = *STI.getInstrInfo();
+ const BasicBlock *LLVM_BB = MBB.getBasicBlock();
+
+ // RAX contains the number of bytes of desired stack adjustment.
+ // The handling here assumes this value has already been updated so as to
+ // maintain stack alignment.
+ //
+ // We need to exit with RSP modified by this amount and execute suitable
+ // page touches to notify the OS that we're growing the stack responsibly.
+ // All stack probing must be done without modifying RSP.
+ //
+ // MBB:
+ // SizeReg = RAX;
+ // ZeroReg = 0
+ // CopyReg = RSP
+ // Flags, TestReg = CopyReg - SizeReg
+ // FinalReg = !Flags.Ovf ? TestReg : ZeroReg
+ // LimitReg = gs magic thread env access
+ // if FinalReg >= LimitReg goto ContinueMBB
+ // RoundBB:
+ // RoundReg = page address of FinalReg
+ // LoopMBB:
+ // LoopReg = PHI(LimitReg,ProbeReg)
+ // ProbeReg = LoopReg - PageSize
+ // [ProbeReg] = 0
+ // if (ProbeReg > RoundReg) goto LoopMBB
+ // ContinueMBB:
+ // RSP = RSP - RAX
+ // [rest of original MBB]
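+  //
+  // Illustrative C-like sketch of the same sequence (PageSize is 0x1000 and
+  // the stack limit comes from gs:[0x10], matching the constants below; the
+  // emitted loop uses JNE rather than an explicit '>' comparison):
+  //   final = (RSP < RAX) ? 0 : RSP - RAX;
+  //   if (final < limit) {               // unsigned compare
+  //     rounded = final & ~(PageSize - 1);
+  //     p = limit;
+  //     do { p -= PageSize; *(volatile char *)p = 0; } while (p != rounded);
+  //   }
+  //   RSP -= RAX;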
+
+ // Set up the new basic blocks
+ MachineBasicBlock *RoundMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+
+ MachineFunction::iterator MBBIter = std::next(MBB.getIterator());
+ MF.insert(MBBIter, RoundMBB);
+ MF.insert(MBBIter, LoopMBB);
+ MF.insert(MBBIter, ContinueMBB);
+
+ // Split MBB and move the tail portion down to ContinueMBB.
+ MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI);
+ ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end());
+ ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+
+ // Some useful constants
+ const int64_t ThreadEnvironmentStackLimit = 0x10;
+ const int64_t PageSize = 0x1000;
+ const int64_t PageMask = ~(PageSize - 1);
+
+ // Registers we need. For the normal case we use virtual
+ // registers. For the prolog expansion we use RAX, RCX and RDX.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterClass *RegClass = &X86::GR64RegClass;
+ const unsigned SizeReg = InProlog ? (unsigned)X86::RAX
+ : MRI.createVirtualRegister(RegClass),
+ ZeroReg = InProlog ? (unsigned)X86::RCX
+ : MRI.createVirtualRegister(RegClass),
+ CopyReg = InProlog ? (unsigned)X86::RDX
+ : MRI.createVirtualRegister(RegClass),
+ TestReg = InProlog ? (unsigned)X86::RDX
+ : MRI.createVirtualRegister(RegClass),
+ FinalReg = InProlog ? (unsigned)X86::RDX
+ : MRI.createVirtualRegister(RegClass),
+ RoundedReg = InProlog ? (unsigned)X86::RDX
+ : MRI.createVirtualRegister(RegClass),
+ LimitReg = InProlog ? (unsigned)X86::RCX
+ : MRI.createVirtualRegister(RegClass),
+ JoinReg = InProlog ? (unsigned)X86::RCX
+ : MRI.createVirtualRegister(RegClass),
+ ProbeReg = InProlog ? (unsigned)X86::RCX
+ : MRI.createVirtualRegister(RegClass);
+
+ // SP-relative offsets where we can save RCX and RDX.
+ int64_t RCXShadowSlot = 0;
+ int64_t RDXShadowSlot = 0;
+
+ // If inlining in the prolog, save RCX and RDX.
+ // Future optimization: don't save or restore if not live in.
+ if (InProlog) {
+ // Compute the offsets. We need to account for things already
+ // pushed onto the stack at this point: return address, frame
+ // pointer (if used), and callee saves.
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ const int64_t CalleeSaveSize = X86FI->getCalleeSavedFrameSize();
+ const bool HasFP = hasFP(MF);
+ RCXShadowSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0);
+ RDXShadowSlot = RCXShadowSlot + 8;
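+    // For example, with a frame pointer and 16 bytes of callee saves, RCX is
+    // saved at [RSP + 32] and RDX at [RSP + 40].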
+ // Emit the saves.
+ addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
+ RCXShadowSlot)
+ .addReg(X86::RCX);
+ addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
+ RDXShadowSlot)
+ .addReg(X86::RDX);
+ } else {
+ // Not in the prolog. Copy RAX to a virtual reg.
+ BuildMI(&MBB, DL, TII.get(X86::MOV64rr), SizeReg).addReg(X86::RAX);
+ }
+
+ // Add code to MBB to check for overflow and set the new target stack pointer
+ // to zero if so.
+ BuildMI(&MBB, DL, TII.get(X86::XOR64rr), ZeroReg)
+ .addReg(ZeroReg, RegState::Undef)
+ .addReg(ZeroReg, RegState::Undef);
+ BuildMI(&MBB, DL, TII.get(X86::MOV64rr), CopyReg).addReg(X86::RSP);
+ BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg)
+ .addReg(CopyReg)
+ .addReg(SizeReg);
+ BuildMI(&MBB, DL, TII.get(X86::CMOVB64rr), FinalReg)
+ .addReg(TestReg)
+ .addReg(ZeroReg);
+
+ // FinalReg now holds final stack pointer value, or zero if
+ // allocation would overflow. Compare against the current stack
+ // limit from the thread environment block. Note this limit is the
+ // lowest touched page on the stack, not the point at which the OS
+ // will cause an overflow exception, so this is just an optimization
+ // to avoid unnecessarily touching pages that are below the current
+  // SP but already committed to the stack by the OS.
+ BuildMI(&MBB, DL, TII.get(X86::MOV64rm), LimitReg)
+ .addReg(0)
+ .addImm(1)
+ .addReg(0)
+ .addImm(ThreadEnvironmentStackLimit)
+ .addReg(X86::GS);
+ BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg);
+ // Jump if the desired stack pointer is at or above the stack limit.
+ BuildMI(&MBB, DL, TII.get(X86::JAE_1)).addMBB(ContinueMBB);
+
+ // Add code to roundMBB to round the final stack pointer to a page boundary.
+ BuildMI(RoundMBB, DL, TII.get(X86::AND64ri32), RoundedReg)
+ .addReg(FinalReg)
+ .addImm(PageMask);
+ BuildMI(RoundMBB, DL, TII.get(X86::JMP_1)).addMBB(LoopMBB);
+
+  // LimitReg now holds the current stack limit and RoundedReg the page-rounded
+  // final RSP value. Add code to LoopMBB to probe downward from LimitReg one
+  // page at a time until we reach RoundedReg.
+ if (!InProlog) {
+ BuildMI(LoopMBB, DL, TII.get(X86::PHI), JoinReg)
+ .addReg(LimitReg)
+ .addMBB(RoundMBB)
+ .addReg(ProbeReg)
+ .addMBB(LoopMBB);
+ }
+
+ addRegOffset(BuildMI(LoopMBB, DL, TII.get(X86::LEA64r), ProbeReg), JoinReg,
+ false, -PageSize);
+
+ // Probe by storing a byte onto the stack.
+ BuildMI(LoopMBB, DL, TII.get(X86::MOV8mi))
+ .addReg(ProbeReg)
+ .addImm(1)
+ .addReg(0)
+ .addImm(0)
+ .addReg(0)
+ .addImm(0);
+ BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr))
+ .addReg(RoundedReg)
+ .addReg(ProbeReg);
+ BuildMI(LoopMBB, DL, TII.get(X86::JNE_1)).addMBB(LoopMBB);
+
+ MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI();
+
+ // If in prolog, restore RDX and RCX.
+ if (InProlog) {
+ addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm),
+ X86::RCX),
+ X86::RSP, false, RCXShadowSlot);
+ addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm),
+ X86::RDX),
+ X86::RSP, false, RDXShadowSlot);
+ }
+
+ // Now that the probing is done, add code to continueMBB to update
+ // the stack pointer for real.
+ BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
+ .addReg(X86::RSP)
+ .addReg(SizeReg);
+
+ // Add the control flow edges we need.
+ MBB.addSuccessor(ContinueMBB);
+ MBB.addSuccessor(RoundMBB);
+ RoundMBB->addSuccessor(LoopMBB);
+ LoopMBB->addSuccessor(ContinueMBB);
+ LoopMBB->addSuccessor(LoopMBB);
+
+ // Mark all the instructions added to the prolog as frame setup.
+ if (InProlog) {
+ for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI) {
+ BeforeMBBI->setFlag(MachineInstr::FrameSetup);
+ }
+ for (MachineInstr &MI : *RoundMBB) {
+ MI.setFlag(MachineInstr::FrameSetup);
+ }
+ for (MachineInstr &MI : *LoopMBB) {
+ MI.setFlag(MachineInstr::FrameSetup);
+ }
+ for (MachineBasicBlock::iterator CMBBI = ContinueMBB->begin();
+ CMBBI != ContinueMBBI; ++CMBBI) {
+ CMBBI->setFlag(MachineInstr::FrameSetup);
+ }
+ }
+
+ // Possible TODO: physreg liveness for InProlog case.
+
+ return ContinueMBBI;
+}
+
+MachineInstr *X86FrameLowering::emitStackProbeCall(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const {
bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;
unsigned CallOp;
@@ -478,6 +728,7 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
Symbol = "_chkstk";
MachineInstrBuilder CI;
+ MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI);
// All current stack probes take AX and SP as input, clobber flags, and
// preserve all registers. x86_64 probes leave RSP unmodified.
@@ -507,6 +758,26 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
.addReg(X86::RSP)
.addReg(X86::RAX);
}
+
+ if (InProlog) {
+ // Apply the frame setup flag to all inserted instrs.
+ for (++ExpansionMBBI; ExpansionMBBI != MBBI; ++ExpansionMBBI)
+ ExpansionMBBI->setFlag(MachineInstr::FrameSetup);
+ }
+
+ return MBBI;
+}
+
+MachineInstr *X86FrameLowering::emitStackProbeInlineStub(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const {
+
+ assert(InProlog && "ChkStkStub called outside prolog!");
+
+ BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32))
+ .addExternalSymbol("__chkstk_stub");
+
+ return MBBI;
}
static unsigned calculateSetFPREG(uint64_t SPAdjust) {
@@ -526,7 +797,7 @@ uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) con
const MachineFrameInfo *MFI = MF.getFrameInfo();
uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment.
unsigned StackAlign = getStackAlignment();
- if (ForceStackAlign) {
+ if (MF.getFunction()->hasFnAttribute("stackrealign")) {
if (MFI->hasCalls())
MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
else if (MaxAlign < SlotSize)
@@ -537,15 +808,14 @@ uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) con
void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- DebugLoc DL,
+ DebugLoc DL, unsigned Reg,
uint64_t MaxAlign) const {
uint64_t Val = -MaxAlign;
- MachineInstr *MI =
- BuildMI(MBB, MBBI, DL, TII.get(getANDriOpcode(Uses64BitFramePtr, Val)),
- StackPtr)
- .addReg(StackPtr)
- .addImm(Val)
- .setMIFlag(MachineInstr::FrameSetup);
+ unsigned AndOp = getANDriOpcode(Uses64BitFramePtr, Val);
+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg)
+ .addReg(Reg)
+ .addImm(Val)
+ .setMIFlag(MachineInstr::FrameSetup);
// The EFLAGS implicit def is dead.
MI->getOperand(3).setIsDead();
@@ -646,6 +916,13 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment.
uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate.
+ bool IsFunclet = MBB.isEHFuncletEntry();
+ EHPersonality Personality = EHPersonality::Unknown;
+ if (Fn->hasPersonalityFn())
+ Personality = classifyEHPersonality(Fn->getPersonalityFn());
+ bool FnHasClrFunclet =
+ MMI.hasEHFunclets() && Personality == EHPersonality::CoreCLR;
+ bool IsClrFunclet = IsFunclet && FnHasClrFunclet;
bool HasFP = hasFP(MF);
bool IsWin64CC = STI.isCallingConvWin64(Fn->getCallingConv());
bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
@@ -655,9 +932,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
unsigned FramePtr = TRI->getFrameRegister(MF);
const unsigned MachineFramePtr =
STI.isTarget64BitILP32()
- ? getX86SubSuperRegister(FramePtr, MVT::i64, false)
- : FramePtr;
+ ? getX86SubSuperRegister(FramePtr, 64) : FramePtr;
unsigned BasePtr = TRI->getBaseRegister();
+
+ // Debug location must be unknown since the first debug location is used
+ // to determine the end of the prologue.
DebugLoc DL;
// Add RETADDR move area to callee saved frame size.
@@ -723,6 +1002,24 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
uint64_t NumBytes = 0;
int stackGrowth = -SlotSize;
+ // Find the funclet establisher parameter
+ unsigned Establisher = X86::NoRegister;
+ if (IsClrFunclet)
+ Establisher = Uses64BitFramePtr ? X86::RCX : X86::ECX;
+ else if (IsFunclet)
+ Establisher = Uses64BitFramePtr ? X86::RDX : X86::EDX;
+
+ if (IsWin64Prologue && IsFunclet && !IsClrFunclet) {
+ // Immediately spill establisher into the home slot.
+ // The runtime cares about this.
+ // MOV64mr %rdx, 16(%rsp)
+ unsigned MOVmr = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MOVmr)), StackPtr, true, 16)
+ .addReg(Establisher)
+ .setMIFlag(MachineInstr::FrameSetup);
+ MBB.addLiveIn(Establisher);
+ }
+
if (HasFP) {
// Calculate required stack adjustment.
uint64_t FrameSize = StackSize - SlotSize;
@@ -739,7 +1036,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// Get the offset of the stack slot for the EBP register, which is
// guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
// Update the frame offset adjustment.
- MFI->setOffsetAdjustment(-NumBytes);
+ if (!IsFunclet)
+ MFI->setOffsetAdjustment(-NumBytes);
+ else
+ assert(MFI->getOffsetAdjustment() == -(int)NumBytes &&
+ "should calculate same local variable offset for funclets");
// Save EBP/RBP into the appropriate stack slot.
BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
@@ -765,35 +1066,46 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
}
- if (!IsWin64Prologue) {
+ if (!IsWin64Prologue && !IsFunclet) {
// Update EBP with the new base value.
BuildMI(MBB, MBBI, DL,
TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr),
FramePtr)
.addReg(StackPtr)
.setMIFlag(MachineInstr::FrameSetup);
- }
- if (NeedsDwarfCFI) {
- // Mark effective beginning of when frame pointer becomes valid.
- // Define the current CFA to use the EBP/RBP register.
- unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
- BuildCFI(MBB, MBBI, DL,
- MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr));
+ if (NeedsDwarfCFI) {
+ // Mark effective beginning of when frame pointer becomes valid.
+ // Define the current CFA to use the EBP/RBP register.
+ unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
+ BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaRegister(
+ nullptr, DwarfFramePtr));
+ }
}
- // Mark the FramePtr as live-in in every block.
- for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
- I->addLiveIn(MachineFramePtr);
+ // Mark the FramePtr as live-in in every block. Don't do this again for
+ // funclet prologues.
+ if (!IsFunclet) {
+ for (MachineBasicBlock &EveryMBB : MF)
+ EveryMBB.addLiveIn(MachineFramePtr);
+ }
} else {
+ assert(!IsFunclet && "funclets without FPs not yet implemented");
NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
}
+ // For EH funclets, only allocate enough space for outgoing calls. Save the
+ // NumBytes value that we would've used for the parent frame.
+ unsigned ParentFrameNumBytes = NumBytes;
+ if (IsFunclet)
+ NumBytes = getWinEHFuncletFrameSize(MF);
+
// Skip the callee-saved push instructions.
bool PushedRegs = false;
int StackOffset = 2 * stackGrowth;
while (MBBI != MBB.end() &&
+ MBBI->getFlag(MachineInstr::FrameSetup) &&
(MBBI->getOpcode() == X86::PUSH32r ||
MBBI->getOpcode() == X86::PUSH64r)) {
PushedRegs = true;
@@ -818,9 +1130,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// Realign stack after we pushed callee-saved registers (so that we'll be
// able to calculate their offsets from the frame pointer).
// Don't do this for Win64, it needs to realign the stack after the prologue.
- if (!IsWin64Prologue && TRI->needsStackRealignment(MF)) {
+ if (!IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) {
assert(HasFP && "There should be a frame pointer if stack is realigned.");
- BuildStackAlignAND(MBB, MBBI, DL, MaxAlign);
+ BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign);
}
// If there is an SUB32ri of ESP immediately before this instruction, merge
@@ -839,7 +1151,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// increments is necessary to ensure that the guard pages used by the OS
// virtual memory manager are allocated in correct sequence.
uint64_t AlignedNumBytes = NumBytes;
- if (IsWin64Prologue && TRI->needsStackRealignment(MF))
+ if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF))
AlignedNumBytes = RoundUpToAlignment(AlignedNumBytes, MaxAlign);
if (AlignedNumBytes >= StackProbeSize && UseStackProbe) {
// Check whether EAX is livein for this function.
@@ -876,26 +1188,18 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
// We'll also use 4 already allocated bytes for EAX.
BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
- .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
- .setMIFlag(MachineInstr::FrameSetup);
+ .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
+ .setMIFlag(MachineInstr::FrameSetup);
}
- // Save a pointer to the MI where we set AX.
- MachineBasicBlock::iterator SetRAX = MBBI;
- --SetRAX;
-
// Call __chkstk, __chkstk_ms, or __alloca.
- emitStackProbeCall(MF, MBB, MBBI, DL);
-
- // Apply the frame setup flag to all inserted instrs.
- for (; SetRAX != MBBI; ++SetRAX)
- SetRAX->setFlag(MachineInstr::FrameSetup);
+ emitStackProbe(MF, MBB, MBBI, DL, true);
if (isEAXAlive) {
// Restore EAX
- MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
- X86::EAX),
- StackPtr, false, NumBytes - 4);
+ MachineInstr *MI =
+ addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX),
+ StackPtr, false, NumBytes - 4);
MI->setFlag(MachineInstr::FrameSetup);
MBB.insert(MBBI, MI);
}
@@ -909,19 +1213,72 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
int SEHFrameOffset = 0;
+ unsigned SPOrEstablisher;
+ if (IsFunclet) {
+ if (IsClrFunclet) {
+ // The establisher parameter passed to a CLR funclet is actually a pointer
+ // to the (mostly empty) frame of its nearest enclosing funclet; we have
+ // to find the root function establisher frame by loading the PSPSym from
+ // the intermediate frame.
+ unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF);
+ MachinePointerInfo NoInfo;
+ MBB.addLiveIn(Establisher);
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), Establisher),
+ Establisher, false, PSPSlotOffset)
+ .addMemOperand(MF.getMachineMemOperand(
+ NoInfo, MachineMemOperand::MOLoad, SlotSize, SlotSize));
+ // Save the root establisher back into the current funclet's (mostly
+ // empty) frame, in case a sub-funclet or the GC needs it.
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr,
+ false, PSPSlotOffset)
+ .addReg(Establisher)
+ .addMemOperand(
+ MF.getMachineMemOperand(NoInfo, MachineMemOperand::MOStore |
+ MachineMemOperand::MOVolatile,
+ SlotSize, SlotSize));
+ }
+ SPOrEstablisher = Establisher;
+ } else {
+ SPOrEstablisher = StackPtr;
+ }
+
if (IsWin64Prologue && HasFP) {
- SEHFrameOffset = calculateSetFPREG(NumBytes);
+ // Set RBP to a small fixed offset from RSP. In the funclet case, we base
+ // this calculation on the incoming establisher, which holds the value of
+ // RSP from the parent frame at the end of the prologue.
+ SEHFrameOffset = calculateSetFPREG(ParentFrameNumBytes);
if (SEHFrameOffset)
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr),
- StackPtr, false, SEHFrameOffset);
+ SPOrEstablisher, false, SEHFrameOffset);
else
- BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr).addReg(StackPtr);
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr)
+ .addReg(SPOrEstablisher);
- if (NeedsWinCFI)
+ // If this is not a funclet, emit the CFI describing our frame pointer.
+ if (NeedsWinCFI && !IsFunclet) {
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
.addImm(FramePtr)
.addImm(SEHFrameOffset)
.setMIFlag(MachineInstr::FrameSetup);
+ if (isAsynchronousEHPersonality(Personality))
+ MF.getWinEHFuncInfo()->SEHSetFrameOffset = SEHFrameOffset;
+ }
+ } else if (IsFunclet && STI.is32Bit()) {
+ // Reset EBP / ESI to something good for funclets.
+ MBBI = restoreWin32EHStackPointers(MBB, MBBI, DL);
+ // If we're a catch funclet, we can be returned to via catchret. Save ESP
+ // into the registration node so that the runtime will restore it for us.
+ if (!MBB.isCleanupFuncletEntry()) {
+ assert(Personality == EHPersonality::MSVC_CXX);
+ unsigned FrameReg;
+ int FI = MF.getWinEHFuncInfo()->EHRegNodeFrameIndex;
+ int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg);
+ // ESP is the first field, so no extra displacement is needed.
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32mr)), FrameReg,
+ false, EHRegOffset)
+ .addReg(X86::ESP);
+ }
}
while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) {
@@ -932,7 +1289,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
int FI;
if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) {
if (X86::FR64RegClass.contains(Reg)) {
- int Offset = getFrameIndexOffset(MF, FI);
+ unsigned IgnoredFrameReg;
+ int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg);
Offset += SEHFrameOffset;
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
@@ -948,14 +1306,33 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue))
.setMIFlag(MachineInstr::FrameSetup);
+ if (FnHasClrFunclet && !IsFunclet) {
+ // Save the so-called Initial-SP (i.e. the value of the stack pointer
+ // immediately after the prolog) into the PSPSlot so that funclets
+ // and the GC can recover it.
+ unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF);
+ auto PSPInfo = MachinePointerInfo::getFixedStack(
+ MF, MF.getWinEHFuncInfo()->PSPSymFrameIdx);
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, false,
+ PSPSlotOffset)
+ .addReg(StackPtr)
+ .addMemOperand(MF.getMachineMemOperand(
+ PSPInfo, MachineMemOperand::MOStore | MachineMemOperand::MOVolatile,
+ SlotSize, SlotSize));
+ }
+
// Realign stack after we spilled callee-saved registers (so that we'll be
// able to calculate their offsets from the frame pointer).
// Win64 requires aligning the stack after the prologue.
if (IsWin64Prologue && TRI->needsStackRealignment(MF)) {
assert(HasFP && "There should be a frame pointer if stack is realigned.");
- BuildStackAlignAND(MBB, MBBI, DL, MaxAlign);
+ BuildStackAlignAND(MBB, MBBI, DL, SPOrEstablisher, MaxAlign);
}
+ // We already dealt with stack realignment and funclets above.
+ if (IsFunclet && STI.is32Bit())
+ return;
+
// If we need a base pointer, set it up here. It's whatever the value
// of the stack pointer is at this point. Any variable size objects
// will be allocated after this, so we can still use the base pointer
@@ -964,7 +1341,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// Update the base pointer with the current stack pointer.
unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr;
BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)
- .addReg(StackPtr)
+ .addReg(SPOrEstablisher)
.setMIFlag(MachineInstr::FrameSetup);
if (X86FI->getRestoreBasePointer()) {
// Stash value of base pointer. Saving RSP instead of EBP shortens
@@ -972,18 +1349,21 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)),
FramePtr, true, X86FI->getRestoreBasePointerOffset())
- .addReg(StackPtr)
+ .addReg(SPOrEstablisher)
.setMIFlag(MachineInstr::FrameSetup);
}
- if (X86FI->getHasSEHFramePtrSave()) {
+ if (X86FI->getHasSEHFramePtrSave() && !IsFunclet) {
// Stash the value of the frame pointer relative to the base pointer for
// Win32 EH. This supports Win32 EH, which does the inverse of the above:
// it recovers the frame pointer from the base pointer rather than the
// other way around.
unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
- addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), BasePtr, true,
- getFrameIndexOffset(MF, X86FI->getSEHFramePtrSaveIndex()))
+ unsigned UsedReg;
+ int Offset =
+ getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg);
+ assert(UsedReg == BasePtr);
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), UsedReg, true, Offset)
.addReg(FramePtr)
.setMIFlag(MachineInstr::FrameSetup);
}
@@ -1015,6 +1395,69 @@ bool X86FrameLowering::canUseLEAForSPInEpilogue(
return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() || hasFP(MF);
}
+static bool isFuncletReturnInstr(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ case X86::CATCHRET:
+ case X86::CLEANUPRET:
+ return true;
+ default:
+ return false;
+ }
+ llvm_unreachable("impossible");
+}
+
+// CLR funclets use a special "Previous Stack Pointer Symbol" slot on the
+// stack. It holds a pointer to the bottom of the root function frame. The
+// establisher frame pointer passed to a nested funclet may point to the
+// (mostly empty) frame of its parent funclet, but it will need to find
+// the frame of the root function to access locals. To facilitate this,
+// every funclet copies the pointer to the bottom of the root function
+// frame into a PSPSym slot in its own (mostly empty) stack frame. Using the
+// same offset for the PSPSym in the root function frame that's used in the
+// funclets' frames allows each funclet to dynamically accept any ancestor
+// frame as its establisher argument (the runtime doesn't guarantee the
+// immediate parent for some reason lost to history), and also allows the GC,
+// which uses the PSPSym for some bookkeeping, to find it in any funclet's
+// frame with only a single offset reported for the entire method.
+unsigned
+X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const {
+ const WinEHFuncInfo &Info = *MF.getWinEHFuncInfo();
+ // getFrameIndexReferenceFromSP has an out ref parameter for the stack
+ // pointer register; pass a dummy that we ignore
+ unsigned SPReg;
+ int Offset = getFrameIndexReferenceFromSP(MF, Info.PSPSymFrameIdx, SPReg);
+ assert(Offset >= 0);
+ return static_cast<unsigned>(Offset);
+}
+
+unsigned
+X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
+ // This is the size of the pushed CSRs.
+ unsigned CSSize =
+ MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize();
+ // This is the amount of stack a funclet needs to allocate.
+ unsigned UsedSize;
+ EHPersonality Personality =
+ classifyEHPersonality(MF.getFunction()->getPersonalityFn());
+ if (Personality == EHPersonality::CoreCLR) {
+ // CLR funclets need to hold enough space to include the PSPSym, at the
+ // same offset from the stack pointer (immediately after the prolog) as it
+    // occupies in the main function.
+ UsedSize = getPSPSlotOffsetFromSP(MF) + SlotSize;
+ } else {
+ // Other funclets just need enough stack for outgoing call arguments.
+ UsedSize = MF.getFrameInfo()->getMaxCallFrameSize();
+ }
+ // RBP is not included in the callee saved register block. After pushing RBP,
+ // everything is 16 byte aligned. Everything we allocate before an outgoing
+ // call must also be 16 byte aligned.
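+  // For example, with CSSize = 24 and UsedSize = 36, the sum rounds up to 64,
+  // so each funclet allocates 64 - 24 = 40 bytes below its pushed CSRs.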
+ unsigned FrameSizeMinusRBP =
+ RoundUpToAlignment(CSSize + UsedSize, getStackAlignment());
+ // Subtract out the size of the callee saved registers. This is how much stack
+ // each funclet will allocate.
+ return FrameSizeMinusRBP - CSSize;
+}
+
void X86FrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -1027,12 +1470,13 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
const bool Is64BitILP32 = STI.isTarget64BitILP32();
unsigned FramePtr = TRI->getFrameRegister(MF);
unsigned MachineFramePtr =
- Is64BitILP32 ? getX86SubSuperRegister(FramePtr, MVT::i64, false)
- : FramePtr;
+ Is64BitILP32 ? getX86SubSuperRegister(FramePtr, 64) : FramePtr;
bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
bool NeedsWinCFI =
IsWin64Prologue && MF.getFunction()->needsUnwindTableEntry();
+ bool IsFunclet = isFuncletReturnInstr(MBBI);
+ MachineBasicBlock *TargetMBB = nullptr;
// Get the number of bytes to allocate from the FrameInfo.
uint64_t StackSize = MFI->getStackSize();
@@ -1040,7 +1484,27 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
unsigned CSSize = X86FI->getCalleeSavedFrameSize();
uint64_t NumBytes = 0;
- if (hasFP(MF)) {
+ if (MBBI->getOpcode() == X86::CATCHRET) {
+ // SEH shouldn't use catchret.
+ assert(!isAsynchronousEHPersonality(
+ classifyEHPersonality(MF.getFunction()->getPersonalityFn())) &&
+ "SEH should not use CATCHRET");
+
+ NumBytes = getWinEHFuncletFrameSize(MF);
+ assert(hasFP(MF) && "EH funclets without FP not yet implemented");
+ TargetMBB = MBBI->getOperand(0).getMBB();
+
+ // Pop EBP.
+ BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
+ MachineFramePtr)
+ .setMIFlag(MachineInstr::FrameDestroy);
+ } else if (MBBI->getOpcode() == X86::CLEANUPRET) {
+ NumBytes = getWinEHFuncletFrameSize(MF);
+ assert(hasFP(MF) && "EH funclets without FP not yet implemented");
+ BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
+ MachineFramePtr)
+ .setMIFlag(MachineInstr::FrameDestroy);
+ } else if (hasFP(MF)) {
// Calculate required stack adjustment.
uint64_t FrameSize = StackSize - SlotSize;
NumBytes = FrameSize - CSSize;
@@ -1052,7 +1516,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
// Pop EBP.
BuildMI(MBB, MBBI, DL,
- TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr);
+ TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr)
+ .setMIFlag(MachineInstr::FrameDestroy);
} else {
NumBytes = StackSize - CSSize;
}
@@ -1063,26 +1528,50 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock::iterator PI = std::prev(MBBI);
unsigned Opc = PI->getOpcode();
- if (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::DBG_VALUE &&
- !PI->isTerminator())
+ if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
+ (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
+ Opc != X86::DBG_VALUE && !PI->isTerminator())
break;
--MBBI;
}
MachineBasicBlock::iterator FirstCSPop = MBBI;
+ if (TargetMBB) {
+ // Fill EAX/RAX with the address of the target block.
+ unsigned ReturnReg = STI.is64Bit() ? X86::RAX : X86::EAX;
+ if (STI.is64Bit()) {
+ // LEA64r TargetMBB(%rip), %rax
+ BuildMI(MBB, FirstCSPop, DL, TII.get(X86::LEA64r), ReturnReg)
+ .addReg(X86::RIP)
+ .addImm(0)
+ .addReg(0)
+ .addMBB(TargetMBB)
+ .addReg(0);
+ } else {
+ // MOV32ri $TargetMBB, %eax
+ BuildMI(MBB, FirstCSPop, DL, TII.get(X86::MOV32ri), ReturnReg)
+ .addMBB(TargetMBB);
+ }
+ // Record that we've taken the address of TargetMBB and no longer just
+ // reference it in a terminator.
+ TargetMBB->setHasAddressTaken();
+ }
+
if (MBBI != MBB.end())
DL = MBBI->getDebugLoc();
// If there is an ADD32ri or SUB32ri of ESP immediately before this
// instruction, merge the two instructions.
if (NumBytes || MFI->hasVarSizedObjects())
- mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes);
+ NumBytes += mergeSPUpdates(MBB, MBBI, true);
// If dynamic alloca is used, then reset esp to point to the last callee-saved
// slot before popping them off! Same applies for the case, when stack was
- // realigned.
- if (TRI->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) {
+ // realigned. Don't do this if this was a funclet epilogue, since the funclets
+ // will not do realignment or dynamic stack allocation.
+ if ((TRI->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) &&
+ !IsFunclet) {
if (TRI->needsStackRealignment(MF))
MBBI = FirstCSPop;
unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt);
@@ -1134,9 +1623,24 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
}
}
-int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF,
- int FI) const {
+// NOTE: this only has a subset of the full frame index logic. In
+// particular, the FI < 0 and AfterFPPop logic is handled in
+// X86RegisterInfo::eliminateFrameIndex, but not here. Possibly
+// (probably?) it should be moved into here.
+int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ // We can't calculate offset from frame pointer if the stack is realigned,
+ // so enforce usage of stack/base pointer. The base pointer is used when we
+ // have dynamic allocas in addition to dynamic realignment.
+ if (TRI->hasBasePointer(MF))
+ FrameReg = TRI->getBaseRegister();
+ else if (TRI->needsStackRealignment(MF))
+ FrameReg = TRI->getStackRegister();
+ else
+ FrameReg = TRI->getFrameRegister(MF);
+
// Offset will hold the offset from the stack pointer at function entry to the
// object.
// We need to factor in additional offsets applied during the prologue to the
@@ -1207,48 +1711,62 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF,
return Offset + FPDelta;
}
-int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const {
- // We can't calculate offset from frame pointer if the stack is realigned,
- // so enforce usage of stack/base pointer. The base pointer is used when we
- // have dynamic allocas in addition to dynamic realignment.
- if (TRI->hasBasePointer(MF))
- FrameReg = TRI->getBaseRegister();
- else if (TRI->needsStackRealignment(MF))
- FrameReg = TRI->getStackRegister();
- else
- FrameReg = TRI->getFrameRegister(MF);
- return getFrameIndexOffset(MF, FI);
-}
-
-// Simplified from getFrameIndexOffset keeping only StackPointer cases
-int X86FrameLowering::getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const {
+// Simplified from getFrameIndexReference keeping only StackPointer cases
+int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF,
+ int FI,
+ unsigned &FrameReg) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
// Does not include any dynamic realign.
const uint64_t StackSize = MFI->getStackSize();
{
#ifndef NDEBUG
- // Note: LLVM arranges the stack as:
- // Args > Saved RetPC (<--FP) > CSRs > dynamic alignment (<--BP)
- // > "Stack Slots" (<--SP)
- // We can always address StackSlots from RSP. We can usually (unless
- // needsStackRealignment) address CSRs from RSP, but sometimes need to
- // address them from RBP. FixedObjects can be placed anywhere in the stack
- // frame depending on their specific requirements (i.e. we can actually
- // refer to arguments to the function which are stored in the *callers*
- // frame). As a result, THE RESULT OF THIS CALL IS MEANINGLESS FOR CSRs
- // AND FixedObjects IFF needsStackRealignment or hasVarSizedObject.
-
- assert(!TRI->hasBasePointer(MF) && "we don't handle this case");
-
- // We don't handle tail calls, and shouldn't be seeing them
- // either.
+ // LLVM arranges the stack as follows:
+ // ...
+ // ARG2
+ // ARG1
+ // RETADDR
+ // PUSH RBP <-- RBP points here
+ // PUSH CSRs
+ // ~~~~~~~ <-- possible stack realignment (non-win64)
+ // ...
+ // STACK OBJECTS
+ // ... <-- RSP after prologue points here
+ // ~~~~~~~ <-- possible stack realignment (win64)
+ //
+ // if (hasVarSizedObjects()):
+ // ... <-- "base pointer" (ESI/RBX) points here
+ // DYNAMIC ALLOCAS
+ // ... <-- RSP points here
+ //
+ // Case 1: In the simple case of no stack realignment and no dynamic
+ // allocas, both "fixed" stack objects (arguments and CSRs) are addressable
+ // with fixed offsets from RSP.
+ //
+ // Case 2: In the case of stack realignment with no dynamic allocas, fixed
+ // stack objects are addressed with RBP and regular stack objects with RSP.
+ //
+ // Case 3: In the case of dynamic allocas and stack realignment, RSP is used
+ // to address stack arguments for outgoing calls and nothing else. The "base
+ // pointer" points to local variables, and RBP points to fixed objects.
+ //
+ // In cases 2 and 3, we can only answer for non-fixed stack objects, and the
+ // answer we give is relative to the SP after the prologue, and not the
+ // SP in the middle of the function.
+
+ assert((!MFI->isFixedObjectIndex(FI) || !TRI->needsStackRealignment(MF) ||
+ STI.isTargetWin64()) &&
+ "offset from fixed object to SP is not static");
+
+ // We don't handle tail calls, and shouldn't be seeing them either.
int TailCallReturnAddrDelta =
MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta();
assert(!(TailCallReturnAddrDelta < 0) && "we don't handle this case!");
#endif
}
+ // Fill in FrameReg output argument.
+ FrameReg = TRI->getStackRegister();
+
// This is how the math works out:
//
// %rsp grows (i.e. gets lower) left to right. Each box below is
@@ -1280,15 +1798,6 @@ int X86FrameLowering::getFrameIndexOffsetFromSP(const MachineFunction &MF, int F
return Offset + StackSize;
}
-// Simplified from getFrameIndexReference keeping only StackPointer cases
-int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF,
- int FI,
- unsigned &FrameReg) const {
- assert(!TRI->hasBasePointer(MF) && "we don't handle this case");
-
- FrameReg = TRI->getStackRegister();
- return getFrameIndexOffsetFromSP(MF, FI);
-}
bool X86FrameLowering::assignCalleeSavedSpillSlots(
MachineFunction &MF, const TargetRegisterInfo *TRI,
@@ -1358,6 +1867,11 @@ bool X86FrameLowering::spillCalleeSavedRegisters(
const TargetRegisterInfo *TRI) const {
DebugLoc DL = MBB.findDebugLoc(MI);
+ // Don't save CSRs in 32-bit EH funclets. The caller saves EBX, EBP, ESI, EDI
+ // for us, and there are no XMM CSRs on Win32.
+ if (MBB.isEHFuncletEntry() && STI.is32Bit() && STI.isOSWindows())
+ return true;
+
// Push GPRs. It increases frame size.
unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
for (unsigned i = CSI.size(); i != 0; --i) {
@@ -1399,6 +1913,22 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
if (CSI.empty())
return false;
+ if (isFuncletReturnInstr(MI) && STI.isOSWindows()) {
+ // Don't restore CSRs in 32-bit EH funclets. Matches
+ // spillCalleeSavedRegisters.
+ if (STI.is32Bit())
+ return true;
+ // Don't restore CSRs before an SEH catchret. SEH except blocks do not form
+ // funclets. emitEpilogue transforms these to normal jumps.
+ if (MI->getOpcode() == X86::CATCHRET) {
+ const Function *Func = MBB.getParent()->getFunction();
+ bool IsSEH = isAsynchronousEHPersonality(
+ classifyEHPersonality(Func->getPersonalityFn()));
+ if (IsSEH)
+ return true;
+ }
+ }
+
DebugLoc DL = MBB.findDebugLoc(MI);
// Reload XMMs from stack frame.
@@ -1420,7 +1950,8 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
!X86::GR32RegClass.contains(Reg))
continue;
- BuildMI(MBB, MI, DL, TII.get(Opc), Reg);
+ BuildMI(MBB, MI, DL, TII.get(Opc), Reg)
+ .setMIFlag(MachineInstr::FrameDestroy);
}
return true;
}
@@ -1450,8 +1981,16 @@ void X86FrameLowering::determineCalleeSaves(MachineFunction &MF,
}
// Spill the BasePtr if it's used.
- if (TRI->hasBasePointer(MF))
+ if (TRI->hasBasePointer(MF)) {
SavedRegs.set(TRI->getBaseRegister());
+
+ // Allocate a spill slot for EBP if we have a base pointer and EH funclets.
+ if (MF.getMMI().hasEHFunclets()) {
+ int FI = MFI->CreateSpillStackObject(SlotSize, SlotSize);
+ X86FI->setHasSEHFramePtrSave(true);
+ X86FI->setSEHFramePtrSaveIndex(FI);
+ }
+ }
}
static bool
@@ -1545,11 +2084,9 @@ void X86FrameLowering::adjustForSegmentedStacks(
// The MOV R10, RAX needs to be in a different block, since the RET we emit in
// allocMBB needs to be last (terminating) instruction.
- for (MachineBasicBlock::livein_iterator i = PrologueMBB.livein_begin(),
- e = PrologueMBB.livein_end();
- i != e; i++) {
- allocMBB->addLiveIn(*i);
- checkMBB->addLiveIn(*i);
+ for (const auto &LI : PrologueMBB.liveins()) {
+ allocMBB->addLiveIn(LI);
+ checkMBB->addLiveIn(LI);
}
if (IsNested)
@@ -1682,8 +2219,6 @@ void X86FrameLowering::adjustForSegmentedStacks(
.addImm(StackSize);
BuildMI(allocMBB, DL, TII.get(MOVri), Reg11)
.addImm(X86FI->getArgumentStackSize());
- MF.getRegInfo().setPhysRegUsed(Reg10);
- MF.getRegInfo().setPhysRegUsed(Reg11);
} else {
BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
.addImm(X86FI->getArgumentStackSize());
@@ -1821,11 +2356,9 @@ void X86FrameLowering::adjustForHiPEPrologue(
MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock();
MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock();
- for (MachineBasicBlock::livein_iterator I = PrologueMBB.livein_begin(),
- E = PrologueMBB.livein_end();
- I != E; I++) {
- stackCheckMBB->addLiveIn(*I);
- incStackMBB->addLiveIn(*I);
+ for (const auto &LI : PrologueMBB.liveins()) {
+ stackCheckMBB->addLiveIn(LI);
+ incStackMBB->addLiveIn(LI);
}
MF.push_front(incStackMBB);
@@ -1870,16 +2403,84 @@ void X86FrameLowering::adjustForHiPEPrologue(
.addReg(ScratchReg), PReg, false, SPLimitOffset);
BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB);
- stackCheckMBB->addSuccessor(&PrologueMBB, 99);
- stackCheckMBB->addSuccessor(incStackMBB, 1);
- incStackMBB->addSuccessor(&PrologueMBB, 99);
- incStackMBB->addSuccessor(incStackMBB, 1);
+ stackCheckMBB->addSuccessor(&PrologueMBB, {99, 100});
+ stackCheckMBB->addSuccessor(incStackMBB, {1, 100});
+ incStackMBB->addSuccessor(&PrologueMBB, {99, 100});
+ incStackMBB->addSuccessor(incStackMBB, {1, 100});
}
#ifdef XDEBUG
MF.verify();
#endif
}
+bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL, int Offset) const {
+
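+  // This turns a small post-call SP adjustment into one or two POPs into dead
+  // registers, which is shorter: e.g. "addl $4, %esp" (3 bytes) can become
+  // "popl %ecx" (1 byte), with the popped value simply discarded.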
+ if (Offset <= 0)
+ return false;
+
+ if (Offset % SlotSize)
+ return false;
+
+ int NumPops = Offset / SlotSize;
+ // This is only worth it if we have at most 2 pops.
+ if (NumPops != 1 && NumPops != 2)
+ return false;
+
+ // Handle only the trivial case where the adjustment directly follows
+ // a call. This is the most common one, anyway.
+ if (MBBI == MBB.begin())
+ return false;
+ MachineBasicBlock::iterator Prev = std::prev(MBBI);
+ if (!Prev->isCall() || !Prev->getOperand(1).isRegMask())
+ return false;
+
+ unsigned Regs[2];
+ unsigned FoundRegs = 0;
+
+ auto RegMask = Prev->getOperand(1);
+
+ auto &RegClass =
+ Is64Bit ? X86::GR64_NOREX_NOSPRegClass : X86::GR32_NOREX_NOSPRegClass;
+ // Try to find up to NumPops free registers.
+ for (auto Candidate : RegClass) {
+
+ // Poor man's liveness:
+ // Since we're immediately after a call, any register that is clobbered
+ // by the call and not defined by it can be considered dead.
+ if (!RegMask.clobbersPhysReg(Candidate))
+ continue;
+
+ bool IsDef = false;
+ for (const MachineOperand &MO : Prev->implicit_operands()) {
+ if (MO.isReg() && MO.isDef() && MO.getReg() == Candidate) {
+ IsDef = true;
+ break;
+ }
+ }
+
+ if (IsDef)
+ continue;
+
+ Regs[FoundRegs++] = Candidate;
+ if (FoundRegs == (unsigned)NumPops)
+ break;
+ }
+
+ if (FoundRegs == 0)
+ return false;
+
+ // If we found only one free register, but need two, reuse the same one twice.
+ while (FoundRegs < (unsigned)NumPops)
+ Regs[FoundRegs++] = Regs[0];
+
+ for (int i = 0; i < NumPops; ++i)
+ BuildMI(MBB, MBBI, DL,
+ TII.get(STI.is64Bit() ? X86::POP64r : X86::POP32r), Regs[i]);
+
+ return true;
+}
+
void X86FrameLowering::
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
@@ -1895,8 +2496,6 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// If the stack pointer can be changed after prologue, turn the
// adjcallstackup instruction into a 'sub ESP, <amt>' and the
// adjcallstackdown instruction into 'add ESP, <amt>'
- if (Amount == 0)
- return;
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
@@ -1904,15 +2503,68 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
unsigned StackAlign = getStackAlignment();
Amount = RoundUpToAlignment(Amount, StackAlign);
+ MachineModuleInfo &MMI = MF.getMMI();
+ const Function *Fn = MF.getFunction();
+ bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+ bool DwarfCFI = !WindowsCFI &&
+ (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry());
+
+ // If we have any exception handlers in this function, and we adjust
+ // the SP before calls, we may need to indicate this to the unwinder
+ // using GNU_ARGS_SIZE. Note that this may be necessary even when
+ // Amount == 0, because the preceding function may have set a non-0
+ // GNU_ARGS_SIZE.
+ // TODO: We don't need to reset this between subsequent functions,
+ // if it didn't change.
+ bool HasDwarfEHHandlers = !WindowsCFI &&
+ !MF.getMMI().getLandingPads().empty();
+
+ if (HasDwarfEHHandlers && !isDestroy &&
+ MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences())
+ BuildCFI(MBB, I, DL,
+ MCCFIInstruction::createGnuArgsSize(nullptr, Amount));
+
+ if (Amount == 0)
+ return;
+
// Factor out the amount that gets handled inside the sequence
// (Pushes of argument for frame setup, callee pops for frame destroy)
Amount -= InternalAmt;
+ // TODO: This is needed only if we require precise CFA.
+ // If this is a callee-pop calling convention, emit a CFA adjust for
+ // the amount the callee popped.
+ if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF))
+ BuildCFI(MBB, I, DL,
+ MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt));
+
if (Amount) {
// Add Amount to SP to destroy a frame, and subtract to setup.
int Offset = isDestroy ? Amount : -Amount;
- BuildStackAdjustment(MBB, I, DL, Offset, /*InEpilogue=*/false);
+
+ if (!(Fn->optForMinSize() &&
+ adjustStackWithPops(MBB, I, DL, Offset)))
+ BuildStackAdjustment(MBB, I, DL, Offset, /*InEpilogue=*/false);
+ }
+
+ if (DwarfCFI && !hasFP(MF)) {
+ // If we don't have FP, but need to generate unwind information,
+ // we need to set the correct CFA offset after the stack adjustment.
+ // How much we adjust the CFA offset depends on whether we're emitting
+ // CFI only for EH purposes or for debugging. EH only requires the CFA
+ // offset to be correct at each call site, while for debugging we want
+ // it to be more precise.
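+    // For example, a 16-byte argument-area setup increases the CFA offset by
+    // 16 here, and the matching frame-destroy adjustment decreases it by 16.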
+ int CFAOffset = Amount;
+ // TODO: When not using precise CFA, we also need to adjust for the
+ // InternalAmt here.
+
+ if (CFAOffset) {
+ CFAOffset = isDestroy ? -CFAOffset : CFAOffset;
+ BuildCFI(MBB, I, DL,
+ MCCFIInstruction::createAdjustCfaOffset(nullptr, CFAOffset));
+ }
}
+
return;
}
@@ -1933,12 +2585,136 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
assert(MBB.getParent() && "Block is not attached to a function!");
+  // Win64 has strict requirements in terms of epilogues and we are not
+  // taking a chance at messing with them; i.e., unless this block is
+  // already an exit block, we can't use it as an epilogue.
+ if (STI.isTargetWin64() && !MBB.succ_empty() && !MBB.isReturnBlock())
+ return false;
+
if (canUseLEAForSPInEpilogue(*MBB.getParent()))
return true;
// If we cannot use LEA to adjust SP, we may need to use ADD, which
- // clobbers the EFLAGS. Check that none of the terminators reads the
- // EFLAGS, and if one uses it, conservatively assume this is not
+  // clobbers the EFLAGS. Check that we do not need to preserve them;
+  // otherwise, conservatively assume it is not
// safe to insert the epilogue here.
- return !terminatorsNeedFlagsAsInput(MBB);
+ return !flagsNeedToBePreservedBeforeTheTerminators(MBB);
+}
+
+bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
+ // If we may need to emit frameless compact unwind information, give
+ // up as this is currently broken: PR25614.
+ return MF.getFunction()->hasFnAttribute(Attribute::NoUnwind) || hasFP(MF);
+}
+
+MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ DebugLoc DL, bool RestoreSP) const {
+ assert(STI.isTargetWindowsMSVC() && "funclets only supported in MSVC env");
+ assert(STI.isTargetWin32() && "EBP/ESI restoration only required on win32");
+ assert(STI.is32Bit() && !Uses64BitFramePtr &&
+ "restoring EBP/ESI on non-32-bit target");
+
+ MachineFunction &MF = *MBB.getParent();
+ unsigned FramePtr = TRI->getFrameRegister(MF);
+ unsigned BasePtr = TRI->getBaseRegister();
+ WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ // FIXME: Don't set FrameSetup flag in catchret case.
+
+ int FI = FuncInfo.EHRegNodeFrameIndex;
+ int EHRegSize = MFI->getObjectSize(FI);
+
+ if (RestoreSP) {
+ // MOV32rm -EHRegSize(%ebp), %esp
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), X86::ESP),
+ X86::EBP, true, -EHRegSize)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ unsigned UsedReg;
+ int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg);
+ int EndOffset = -EHRegOffset - EHRegSize;
+ FuncInfo.EHRegNodeEndOffset = EndOffset;
+
+ if (UsedReg == FramePtr) {
+ // ADD $offset, %ebp
+ unsigned ADDri = getADDriOpcode(false, EndOffset);
+ BuildMI(MBB, MBBI, DL, TII.get(ADDri), FramePtr)
+ .addReg(FramePtr)
+ .addImm(EndOffset)
+ .setMIFlag(MachineInstr::FrameSetup)
+ ->getOperand(3)
+ .setIsDead();
+ assert(EndOffset >= 0 &&
+ "end of registration object above normal EBP position!");
+ } else if (UsedReg == BasePtr) {
+ // LEA offset(%ebp), %esi
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA32r), BasePtr),
+ FramePtr, false, EndOffset)
+ .setMIFlag(MachineInstr::FrameSetup);
+ // MOV32rm SavedEBPOffset(%esi), %ebp
+ assert(X86FI->getHasSEHFramePtrSave());
+ int Offset =
+ getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg);
+ assert(UsedReg == BasePtr);
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), FramePtr),
+ UsedReg, true, Offset)
+ .setMIFlag(MachineInstr::FrameSetup);
+ } else {
+ llvm_unreachable("32-bit frames with WinEH must use FramePtr or BasePtr");
+ }
+ return MBBI;
+}
+
+unsigned X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const {
+ // RDX, the parent frame pointer, is homed into 16(%rsp) in the prologue.
+ unsigned Offset = 16;
+ // RBP is immediately pushed.
+ Offset += SlotSize;
+ // All callee-saved registers are then pushed.
+ Offset += MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize();
+ // Every funclet allocates enough stack space for the largest outgoing call.
+ Offset += getWinEHFuncletFrameSize(MF);
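+  // For example, with SlotSize == 8, 16 bytes of pushed CSRs, and a 32-byte
+  // funclet frame, the parent frame offset is 16 + 8 + 16 + 32 = 72.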
+ return Offset;
+}
+
+void X86FrameLowering::processFunctionBeforeFrameFinalized(
+ MachineFunction &MF, RegScavenger *RS) const {
+ // If this function isn't doing Win64-style C++ EH, we don't need to do
+ // anything.
+ const Function *Fn = MF.getFunction();
+ if (!STI.is64Bit() || !MF.getMMI().hasEHFunclets() ||
+ classifyEHPersonality(Fn->getPersonalityFn()) != EHPersonality::MSVC_CXX)
+ return;
+
+ // Win64 C++ EH needs to allocate the UnwindHelp object at some fixed offset
+ // relative to RSP after the prologue. Find the offset of the last fixed
+ // object, so that we can allocate a slot immediately following it. If there
+ // were no fixed objects, use offset -SlotSize, which is immediately after the
+ // return address. Fixed objects have negative frame indices.
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ int64_t MinFixedObjOffset = -SlotSize;
+ for (int I = MFI->getObjectIndexBegin(); I < 0; ++I)
+ MinFixedObjOffset = std::min(MinFixedObjOffset, MFI->getObjectOffset(I));
+
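+  // For example, if the lowest fixed-object offset is -24, UnwindHelp is
+  // created at offset -32.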
+ int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize;
+ int UnwindHelpFI =
+ MFI->CreateFixedObject(SlotSize, UnwindHelpOffset, /*Immutable=*/false);
+ MF.getWinEHFuncInfo()->UnwindHelpFrameIdx = UnwindHelpFI;
+
+ // Store -2 into UnwindHelp on function entry. We have to scan forwards past
+ // other frame setup instructions.
+ MachineBasicBlock &MBB = MF.front();
+ auto MBBI = MBB.begin();
+ while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
+ ++MBBI;
+
+ DebugLoc DL = MBB.findDebugLoc(MBBI);
+ addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mi32)),
+ UnwindHelpFI)
+ .addImm(-2);
}
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index 495cfcd1c3f7..3ab41b4a5789 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -47,11 +47,17 @@ public:
unsigned StackPtr;
- /// Emit a call to the target's stack probe function. This is required for all
+ /// Emit target stack probe code. This is required for all
/// large stack allocations on Windows. The caller is required to materialize
- /// the number of bytes to probe in RAX/EAX.
- void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc DL) const;
+  /// the number of bytes to probe in RAX/EAX. Returns the instruction just
+ /// after the expansion.
+ MachineInstr *emitStackProbe(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL,
+ bool InProlog) const;
+
+ /// Replace a StackProbe inline-stub with the actual probe code inline.
+ void inlineStackProbe(MachineFunction &MF,
+ MachineBasicBlock &PrologMBB) const override;
void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
@@ -91,11 +97,9 @@ public:
bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
bool needsFrameIndexResolution(const MachineFunction &MF) const override;
- int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
int getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const override;
- int getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const;
int getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI,
unsigned &FrameReg) const override;
@@ -103,6 +107,11 @@ public:
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const override;
+ unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override;
+
+ void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+ RegScavenger *RS) const override;
+
/// Check the instruction before/after the passed instruction. If
/// it is an ADD/SUB/LEA instruction, it is deleted and the
/// stack adjustment is returned as a positive value for ADD/LEA and
@@ -125,7 +134,9 @@ public:
/// \p MBB will be correctly handled by the target.
bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;
-private:
+ /// Returns true if the target will correctly handle shrink wrapping.
+ bool enableShrinkWrapping(const MachineFunction &MF) const override;
+
/// convertArgMovsToPushes - This method tries to convert a call sequence
/// that uses sub and mov instructions to put the argument onto the stack
/// into a series of pushes.
@@ -135,22 +146,56 @@ private:
MachineBasicBlock::iterator I,
uint64_t Amount) const;
- uint64_t calculateMaxStackAlign(const MachineFunction &MF) const;
-
/// Wraps up getting a CFI index and building a MachineInstr for it.
void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
DebugLoc DL, MCCFIInstruction CFIInst) const;
+ /// Sets up EBP and optionally ESI based on the incoming EBP value. Only
+ /// needed for 32-bit. Used in funclet prologues and at catchret destinations.
+ MachineBasicBlock::iterator
+ restoreWin32EHStackPointers(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL,
+ bool RestoreSP = false) const;
+
+private:
+ uint64_t calculateMaxStackAlign(const MachineFunction &MF) const;
+
+ /// Emit target stack probe as a call to a helper function.
+ MachineInstr *emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL, bool InProlog) const;
+
+ /// Emit target stack probe as an inline sequence.
+ MachineInstr *emitStackProbeInline(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL, bool InProlog) const;
+
+ /// Emit a stub to later inline the target stack probe.
+ MachineInstr *emitStackProbeInlineStub(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL, bool InProlog) const;
+
/// Aligns the stack pointer by ANDing it with -MaxAlign.
void BuildStackAlignAND(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, DebugLoc DL,
- uint64_t MaxAlign) const;
+ unsigned Reg, uint64_t MaxAlign) const;
+
+ /// Make small positive stack adjustments using POPs.
+ bool adjustStackWithPops(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL,
+ int Offset) const;
/// Adjusts the stack pointer using LEA, SUB, or ADD.
MachineInstrBuilder BuildStackAdjustment(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
DebugLoc DL, int64_t Offset,
bool InEpilogue) const;
+
+ unsigned getPSPSlotOffsetFromSP(const MachineFunction &MF) const;
+
+ unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const;
};
} // End llvm namespace
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index d5351d25d6ed..4414e478b99b 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -46,9 +46,8 @@ STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
//===----------------------------------------------------------------------===//
namespace {
- /// X86ISelAddressMode - This corresponds to X86AddressMode, but uses
- /// SDValue's instead of register numbers for the leaves of the matched
- /// tree.
+ /// This corresponds to X86AddressMode, but uses SDValues instead of register
+ /// numbers for the leaves of the matched tree.
struct X86ISelAddressMode {
enum {
RegBase,
@@ -87,8 +86,7 @@ namespace {
IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
}
- /// isRIPRelative - Return true if this addressing mode is already RIP
- /// relative.
+ /// Return true if this addressing mode is already RIP-relative.
bool isRIPRelative() const {
if (BaseType != RegBase) return false;
if (RegisterSDNode *RegNode =
@@ -147,21 +145,25 @@ namespace {
namespace {
//===--------------------------------------------------------------------===//
- /// ISel - X86 specific code to select X86 machine instructions for
+ /// ISel - X86-specific code to select X86 machine instructions for
/// SelectionDAG operations.
///
class X86DAGToDAGISel final : public SelectionDAGISel {
- /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+ /// Keep a pointer to the X86Subtarget around so that we can
/// make the right decision when generating code for different targets.
const X86Subtarget *Subtarget;
- /// OptForSize - If true, selector should try to optimize for code size
- /// instead of performance.
+ /// If true, selector should try to optimize for code size instead of
+ /// performance.
bool OptForSize;
+ /// If true, selector should try to optimize for minimum code size.
+ bool OptForMinSize;
+
public:
explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
- : SelectionDAGISel(tm, OptLevel), OptForSize(false) {}
+ : SelectionDAGISel(tm, OptLevel), OptForSize(false),
+ OptForMinSize(false) {}
const char *getPassName() const override {
return "X86 DAG->DAG Instruction Selection";
@@ -184,8 +186,7 @@ namespace {
return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue());
}
- // i64immSExt32 predicate - True if the 64-bit immediate fits in a 32-bit
- // sign extended field.
+ // True if the 64-bit immediate fits in a 32-bit sign-extended field.
inline bool i64immSExt32(SDNode *N) const {
uint64_t v = cast<ConstantSDNode>(N)->getZExtValue();
return (int64_t)v == (int32_t)v;
@@ -196,50 +197,50 @@ namespace {
private:
SDNode *Select(SDNode *N) override;
- SDNode *SelectGather(SDNode *N, unsigned Opc);
- SDNode *SelectAtomicLoadArith(SDNode *Node, MVT NVT);
-
- bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
- bool MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
- bool MatchWrapper(SDValue N, X86ISelAddressMode &AM);
- bool MatchAddress(SDValue N, X86ISelAddressMode &AM);
- bool MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
+ SDNode *selectGather(SDNode *N, unsigned Opc);
+ SDNode *selectAtomicLoadArith(SDNode *Node, MVT NVT);
+
+ bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
+ bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
+ bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
+ bool matchAddress(SDValue N, X86ISelAddressMode &AM);
+ bool matchAdd(SDValue N, X86ISelAddressMode &AM, unsigned Depth);
+ bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
unsigned Depth);
- bool MatchAddressBase(SDValue N, X86ISelAddressMode &AM);
- bool SelectAddr(SDNode *Parent, SDValue N, SDValue &Base,
+ bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
+ bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
- bool SelectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
+ bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
- bool SelectMOV64Imm32(SDValue N, SDValue &Imm);
- bool SelectLEAAddr(SDValue N, SDValue &Base,
+ bool selectMOV64Imm32(SDValue N, SDValue &Imm);
+ bool selectLEAAddr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
- bool SelectLEA64_32Addr(SDValue N, SDValue &Base,
+ bool selectLEA64_32Addr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
- bool SelectTLSADDRAddr(SDValue N, SDValue &Base,
+ bool selectTLSADDRAddr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
- bool SelectScalarSSELoad(SDNode *Root, SDValue N,
+ bool selectScalarSSELoad(SDNode *Root, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment,
SDValue &NodeWithChain);
- bool TryFoldLoad(SDNode *P, SDValue N,
+ bool tryFoldLoad(SDNode *P, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment);
- /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
- /// inline asm expressions.
+ /// Implement addressing mode selection for inline asm expressions.
bool SelectInlineAsmMemoryOperand(const SDValue &Op,
unsigned ConstraintID,
std::vector<SDValue> &OutOps) override;
- void EmitSpecialCodeForMain();
+ void emitSpecialCodeForMain();
inline void getAddressOperands(X86ISelAddressMode &AM, SDLoc DL,
SDValue &Base, SDValue &Scale,
@@ -252,7 +253,7 @@ namespace {
: AM.Base_Reg;
Scale = getI8Imm(AM.Scale, DL);
Index = AM.IndexReg;
- // These are 32-bit even in 64-bit mode since RIP relative offset
+ // These are 32-bit even in 64-bit mode since RIP-relative offset
// is 32-bit.
if (AM.GV)
Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
@@ -283,32 +284,105 @@ namespace {
Segment = CurDAG->getRegister(0, MVT::i32);
}
- /// getI8Imm - Return a target constant with the specified value, of type
- /// i8.
+ // Utility function to determine whether we should avoid selecting
+ // immediate forms of instructions for better code size or not.
+ // At a high level, we'd like to avoid such instructions when
+ // we have similar constants used within the same basic block
+ // that can be kept in a register.
+ //
+ bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
+ uint32_t UseCount = 0;
+
+ // Do not want to hoist if we're not optimizing for size.
+ // TODO: We'd like to remove this restriction.
+ // See the comment in X86InstrInfo.td for more info.
+ if (!OptForSize)
+ return false;
+
+ // Walk all the users of the immediate.
+ for (SDNode::use_iterator UI = N->use_begin(),
+ UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {
+
+ SDNode *User = *UI;
+
+ // This user is already selected. Count it as a legitimate use and
+ // move on.
+ if (User->isMachineOpcode()) {
+ UseCount++;
+ continue;
+ }
+
+ // We want to count stores of immediates as real uses.
+ if (User->getOpcode() == ISD::STORE &&
+ User->getOperand(1).getNode() == N) {
+ UseCount++;
+ continue;
+ }
+
+ // We don't currently match users that have > 2 operands (except
+ // for stores, which are handled above).
+ // Those instructions won't match in ISEL, for now, and would
+ // be counted incorrectly.
+ // This may change in the future as we add additional instruction
+ // types.
+ if (User->getNumOperands() != 2)
+ continue;
+
+ // Immediates that are used for offsets as part of stack
+ // manipulation should be left alone. These are typically
+ // used to indicate SP offsets for argument passing and
+ // will get pulled into stores/pushes (implicitly).
+ if (User->getOpcode() == X86ISD::ADD ||
+ User->getOpcode() == ISD::ADD ||
+ User->getOpcode() == X86ISD::SUB ||
+ User->getOpcode() == ISD::SUB) {
+
+ // Find the other operand of the add/sub.
+ SDValue OtherOp = User->getOperand(0);
+ if (OtherOp.getNode() == N)
+ OtherOp = User->getOperand(1);
+
+ // Don't count if the other operand is SP.
+ RegisterSDNode *RegNode;
+ if (OtherOp->getOpcode() == ISD::CopyFromReg &&
+ (RegNode = dyn_cast_or_null<RegisterSDNode>(
+ OtherOp->getOperand(1).getNode())))
+ if ((RegNode->getReg() == X86::ESP) ||
+ (RegNode->getReg() == X86::RSP))
+ continue;
+ }
+
+ // ... otherwise, count this and move on.
+ UseCount++;
+ }
+
+ // If we have more than one use, recommend hoisting.
+ return (UseCount > 1);
+ }
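+ // Illustrative case (hypothetical code, for exposition only): when optimizing
+ // for size, two stores of the same immediate in one block, e.g.
+ //   movl $0x12345678, (%rdi)
+ //   movl $0x12345678, (%rsi)
+ // give UseCount == 2, so the check above recommends keeping the constant in a
+ // register instead of repeating the 4-byte immediate in each instruction.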
+
+ /// Return a target constant with the specified value of type i8.
inline SDValue getI8Imm(unsigned Imm, SDLoc DL) {
return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
}
- /// getI32Imm - Return a target constant with the specified value, of type
- /// i32.
+ /// Return a target constant with the specified value, of type i32.
inline SDValue getI32Imm(unsigned Imm, SDLoc DL) {
return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
}
- /// getGlobalBaseReg - Return an SDNode that returns the value of
- /// the global base register. Output instructions required to
- /// initialize the global base register, if necessary.
- ///
+ /// Return an SDNode that returns the value of the global base register.
+ /// Output instructions required to initialize the global base register,
+ /// if necessary.
SDNode *getGlobalBaseReg();
- /// getTargetMachine - Return a reference to the TargetMachine, casted
- /// to the target-specific type.
+ /// Return a reference to the TargetMachine, casted to the target-specific
+ /// type.
const X86TargetMachine &getTargetMachine() const {
return static_cast<const X86TargetMachine &>(TM);
}
- /// getInstrInfo - Return a reference to the TargetInstrInfo, casted
- /// to the target-specific type.
+ /// Return a reference to the TargetInstrInfo, casted to the target-specific
+ /// type.
const X86InstrInfo *getInstrInfo() const {
return Subtarget->getInstrInfo();
}
@@ -386,9 +460,9 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
return true;
}
-/// MoveBelowCallOrigChain - Replace the original chain operand of the call with
+/// Replace the original chain operand of the call with
/// load's chain operand and move load below the call's chain operand.
-static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
+static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
SDValue Call, SDValue OrigChain) {
SmallVector<SDValue, 8> Ops;
SDValue Chain = OrigChain.getOperand(0);
@@ -418,7 +492,7 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
}
-/// isCalleeLoad - Return true if call address is a load and it can be
+/// Return true if call address is a load and it can be
/// moved below CALLSEQ_START and the chains leading up to the call.
/// Return the CALLSEQ_START by reference as a second output.
/// In the case of a tail call, there isn't a callseq node between the call
@@ -461,12 +535,14 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
}
void X86DAGToDAGISel::PreprocessISelDAG() {
- // OptForSize is used in pattern predicates that isel is matching.
- OptForSize = MF->getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
+ // OptFor[Min]Size are used in pattern predicates that isel is matching.
+ OptForSize = MF->getFunction()->optForSize();
+ OptForMinSize = MF->getFunction()->optForMinSize();
+ assert((!OptForMinSize || OptForSize) && "OptForMinSize implies OptForSize");
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
E = CurDAG->allnodes_end(); I != E; ) {
- SDNode *N = I++; // Preincrement iterator to avoid invalidation issues.
+ SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
if (OptLevel != CodeGenOpt::None &&
// Only does this when the target doesn't favor register indirect
@@ -500,7 +576,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
SDValue Load = N->getOperand(1);
if (!isCalleeLoad(Load, Chain, HasCallSeq))
continue;
- MoveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
+ moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
++NumLoadMoved;
continue;
}
@@ -577,9 +653,8 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
}
-/// EmitSpecialCodeForMain - Emit any code that needs to be executed only in
-/// the main function.
-void X86DAGToDAGISel::EmitSpecialCodeForMain() {
+/// Emit any code that needs to be executed only in the main function.
+void X86DAGToDAGISel::emitSpecialCodeForMain() {
if (Subtarget->isTargetCygMing()) {
TargetLowering::ArgListTy Args;
auto &DL = CurDAG->getDataLayout();
@@ -599,7 +674,7 @@ void X86DAGToDAGISel::EmitFunctionEntryCode() {
// If this is main, emit special code for main.
if (const Function *Fn = MF->getFunction())
if (Fn->hasExternalLinkage() && Fn->getName() == "main")
- EmitSpecialCodeForMain();
+ emitSpecialCodeForMain();
}
static bool isDispSafeForFrameIndex(int64_t Val) {
@@ -612,7 +687,7 @@ static bool isDispSafeForFrameIndex(int64_t Val) {
return isInt<31>(Val);
}
-bool X86DAGToDAGISel::FoldOffsetIntoAddress(uint64_t Offset,
+bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
X86ISelAddressMode &AM) {
// Cannot combine ExternalSymbol displacements with integer offsets.
if (Offset != 0 && (AM.ES || AM.MCSym))
@@ -634,7 +709,7 @@ bool X86DAGToDAGISel::FoldOffsetIntoAddress(uint64_t Offset,
}
-bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
+bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
SDValue Address = N->getOperand(1);
// load gs:0 -> GS segment register.
@@ -658,11 +733,10 @@ bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
return true;
}
-/// MatchWrapper - Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes
-/// into an addressing mode. These wrap things that will resolve down into a
-/// symbol reference. If no match is possible, this returns true, otherwise it
-/// returns false.
-bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
+/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
+/// mode. These wrap things that will resolve down into a symbol reference.
+/// If no match is possible, this returns true, otherwise it returns false.
+bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
// If the addressing mode already has a symbol as the displacement, we can
// never match another symbol.
if (AM.hasSymbolicDisplacement())
@@ -685,7 +759,7 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
X86ISelAddressMode Backup = AM;
AM.GV = G->getGlobal();
AM.SymbolFlags = G->getTargetFlags();
- if (FoldOffsetIntoAddress(G->getOffset(), AM)) {
+ if (foldOffsetIntoAddress(G->getOffset(), AM)) {
AM = Backup;
return true;
}
@@ -694,7 +768,7 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
AM.CP = CP->getConstVal();
AM.Align = CP->getAlignment();
AM.SymbolFlags = CP->getTargetFlags();
- if (FoldOffsetIntoAddress(CP->getOffset(), AM)) {
+ if (foldOffsetIntoAddress(CP->getOffset(), AM)) {
AM = Backup;
return true;
}
@@ -710,7 +784,7 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
X86ISelAddressMode Backup = AM;
AM.BlockAddr = BA->getBlockAddress();
AM.SymbolFlags = BA->getTargetFlags();
- if (FoldOffsetIntoAddress(BA->getOffset(), AM)) {
+ if (foldOffsetIntoAddress(BA->getOffset(), AM)) {
AM = Backup;
return true;
}
@@ -758,11 +832,10 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
return true;
}
-/// MatchAddress - Add the specified node to the specified addressing mode,
-/// returning true if it cannot be done. This just pattern matches for the
-/// addressing mode.
-bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) {
- if (MatchAddressRecursively(N, AM, 0))
+/// Add the specified node to the specified addressing mode, returning true if
+/// it cannot be done. This just pattern matches for the addressing mode.
+bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
+ if (matchAddressRecursively(N, AM, 0))
return true;
// Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
@@ -790,15 +863,49 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) {
return false;
}
+bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM,
+ unsigned Depth) {
+ // Add an artificial use to this node so that we can keep track of
+ // it if it gets CSE'd with a different node.
+ HandleSDNode Handle(N);
+
+ X86ISelAddressMode Backup = AM;
+ if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
+ !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
+ return false;
+ AM = Backup;
+
+ // Try again after commuting the operands.
+ if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) &&
+ !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1))
+ return false;
+ AM = Backup;
+
+ // If we couldn't fold both operands into the address at the same time,
+ // see if we can just put each operand into a register and fold at least
+ // the add.
+ if (AM.BaseType == X86ISelAddressMode::RegBase &&
+ !AM.Base_Reg.getNode() &&
+ !AM.IndexReg.getNode()) {
+ N = Handle.getValue();
+ AM.Base_Reg = N.getOperand(0);
+ AM.IndexReg = N.getOperand(1);
+ AM.Scale = 1;
+ return false;
+ }
+ N = Handle.getValue();
+ return true;
+}
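+// Illustrative sketch (hypothetical operands, not from this patch): for
+// N = (add (shl %x, 2), %y), folding both operands gives IndexReg = %x with
+// Scale = 4 and Base_Reg = %y; if neither operand folds, the fallback above
+// still absorbs the add by using Base_Reg = op0, IndexReg = op1, Scale = 1.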
+
// Insert a node into the DAG at least before the Pos node's position. This
// will reposition the node as needed, and will assign it a node ID that is <=
// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
// IDs! The selection DAG must no longer depend on their uniqueness when this
// is used.
-static void InsertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
+static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
if (N.getNode()->getNodeId() == -1 ||
N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) {
- DAG.RepositionNode(Pos.getNode(), N.getNode());
+ DAG.RepositionNode(Pos.getNode()->getIterator(), N.getNode());
N.getNode()->setNodeId(Pos.getNode()->getNodeId());
}
}
@@ -807,7 +914,7 @@ static void InsertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
// safe. This allows us to convert the shift and and into an h-register
// extract and a scaled index. Returns false if the simplification is
// performed.
-static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
+static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
uint64_t Mask,
SDValue Shift, SDValue X,
X86ISelAddressMode &AM) {
@@ -835,12 +942,12 @@ static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
// these nodes. We continually insert before 'N' in sequence as this is
// essentially a pre-flattened and pre-sorted sequence of nodes. There is no
// hierarchy left to express.
- InsertDAGNode(DAG, N, Eight);
- InsertDAGNode(DAG, N, Srl);
- InsertDAGNode(DAG, N, NewMask);
- InsertDAGNode(DAG, N, And);
- InsertDAGNode(DAG, N, ShlCount);
- InsertDAGNode(DAG, N, Shl);
+ insertDAGNode(DAG, N, Eight);
+ insertDAGNode(DAG, N, Srl);
+ insertDAGNode(DAG, N, NewMask);
+ insertDAGNode(DAG, N, And);
+ insertDAGNode(DAG, N, ShlCount);
+ insertDAGNode(DAG, N, Shl);
DAG.ReplaceAllUsesWith(N, Shl);
AM.IndexReg = And;
AM.Scale = (1 << ScaleLog);
@@ -850,7 +957,7 @@ static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
// allows us to fold the shift into this addressing mode. Returns false if the
// transform succeeded.
-static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
+static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
uint64_t Mask,
SDValue Shift, SDValue X,
X86ISelAddressMode &AM) {
@@ -880,9 +987,9 @@ static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
// these nodes. We continually insert before 'N' in sequence as this is
// essentially a pre-flattened and pre-sorted sequence of nodes. There is no
// hierarchy left to express.
- InsertDAGNode(DAG, N, NewMask);
- InsertDAGNode(DAG, N, NewAnd);
- InsertDAGNode(DAG, N, NewShift);
+ insertDAGNode(DAG, N, NewMask);
+ insertDAGNode(DAG, N, NewAnd);
+ insertDAGNode(DAG, N, NewShift);
DAG.ReplaceAllUsesWith(N, NewShift);
AM.Scale = 1 << ShiftAmt;
@@ -917,7 +1024,7 @@ static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
// Note that this function assumes the mask is provided as a mask *after* the
// value is shifted. The input chain may or may not match that, but computing
// such a mask is trivial.
-static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
+static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
uint64_t Mask,
SDValue Shift, SDValue X,
X86ISelAddressMode &AM) {
@@ -973,7 +1080,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
assert(X.getValueType() != VT);
// We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
- InsertDAGNode(DAG, N, NewX);
+ insertDAGNode(DAG, N, NewX);
X = NewX;
}
SDLoc DL(N);
@@ -987,10 +1094,10 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
// these nodes. We continually insert before 'N' in sequence as this is
// essentially a pre-flattened and pre-sorted sequence of nodes. There is no
// hierarchy left to express.
- InsertDAGNode(DAG, N, NewSRLAmt);
- InsertDAGNode(DAG, N, NewSRL);
- InsertDAGNode(DAG, N, NewSHLAmt);
- InsertDAGNode(DAG, N, NewSHL);
+ insertDAGNode(DAG, N, NewSRLAmt);
+ insertDAGNode(DAG, N, NewSRL);
+ insertDAGNode(DAG, N, NewSHLAmt);
+ insertDAGNode(DAG, N, NewSHL);
DAG.ReplaceAllUsesWith(N, NewSHL);
AM.Scale = 1 << AMShiftAmt;
@@ -998,7 +1105,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
return false;
}
-bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
+bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
unsigned Depth) {
SDLoc dl(N);
DEBUG({
@@ -1007,7 +1114,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
});
// Limit recursion.
if (Depth > 5)
- return MatchAddressBase(N, AM);
+ return matchAddressBase(N, AM);
// If this is already a %rip relative address, we can only merge immediates
// into it. Instead of handling this in every case, we handle it here.
@@ -1020,7 +1127,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
return true;
if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
- if (!FoldOffsetIntoAddress(Cst->getSExtValue(), AM))
+ if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
return false;
return true;
}
@@ -1038,19 +1145,19 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
}
case ISD::Constant: {
uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
- if (!FoldOffsetIntoAddress(Val, AM))
+ if (!foldOffsetIntoAddress(Val, AM))
return false;
break;
}
case X86ISD::Wrapper:
case X86ISD::WrapperRIP:
- if (!MatchWrapper(N, AM))
+ if (!matchWrapper(N, AM))
return false;
break;
case ISD::LOAD:
- if (!MatchLoadInAddress(cast<LoadSDNode>(N), AM))
+ if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
return false;
break;
@@ -1087,7 +1194,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
ConstantSDNode *AddVal =
cast<ConstantSDNode>(ShVal.getNode()->getOperand(1));
uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
- if (!FoldOffsetIntoAddress(Disp, AM))
+ if (!foldOffsetIntoAddress(Disp, AM))
return false;
}
@@ -1119,7 +1226,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
// Try to fold the mask and shift into the scale, and return false if we
// succeed.
- if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
+ if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
return false;
break;
}
@@ -1153,7 +1260,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
ConstantSDNode *AddVal =
cast<ConstantSDNode>(MulVal.getNode()->getOperand(1));
uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
- if (FoldOffsetIntoAddress(Disp, AM))
+ if (foldOffsetIntoAddress(Disp, AM))
Reg = N.getNode()->getOperand(0);
} else {
Reg = N.getNode()->getOperand(0);
@@ -1179,7 +1286,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
// Test if the LHS of the sub can be folded.
X86ISelAddressMode Backup = AM;
- if (MatchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) {
+ if (matchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) {
AM = Backup;
break;
}
@@ -1227,56 +1334,26 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
AM.Scale = 1;
// Insert the new nodes into the topological ordering.
- InsertDAGNode(*CurDAG, N, Zero);
- InsertDAGNode(*CurDAG, N, Neg);
+ insertDAGNode(*CurDAG, N, Zero);
+ insertDAGNode(*CurDAG, N, Neg);
return false;
}
- case ISD::ADD: {
- // Add an artificial use to this node so that we can keep track of
- // it if it gets CSE'd with a different node.
- HandleSDNode Handle(N);
-
- X86ISelAddressMode Backup = AM;
- if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
- !MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
- return false;
- AM = Backup;
-
- // Try again after commuting the operands.
- if (!MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)&&
- !MatchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1))
+ case ISD::ADD:
+ if (!matchAdd(N, AM, Depth))
return false;
- AM = Backup;
-
- // If we couldn't fold both operands into the address at the same time,
- // see if we can just put each operand into a register and fold at least
- // the add.
- if (AM.BaseType == X86ISelAddressMode::RegBase &&
- !AM.Base_Reg.getNode() &&
- !AM.IndexReg.getNode()) {
- N = Handle.getValue();
- AM.Base_Reg = N.getOperand(0);
- AM.IndexReg = N.getOperand(1);
- AM.Scale = 1;
- return false;
- }
- N = Handle.getValue();
break;
- }
case ISD::OR:
- // Handle "X | C" as "X + C" iff X is known to have C bits clear.
- if (CurDAG->isBaseWithConstantOffset(N)) {
- X86ISelAddressMode Backup = AM;
- ConstantSDNode *CN = cast<ConstantSDNode>(N.getOperand(1));
-
- // Start with the LHS as an addr mode.
- if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
- !FoldOffsetIntoAddress(CN->getSExtValue(), AM))
- return false;
- AM = Backup;
- }
+ // We want to look through a transform in InstCombine and DAGCombiner that
+ // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
+ // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
+ // An 'lea' can then be used to match the shift (multiply) and add:
+ // and $1, %esi
+ // lea (%rsi, %rdi, 8), %rax
+ if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) &&
+ !matchAdd(N, AM, Depth))
+ return false;
break;
case ISD::AND: {
@@ -1299,27 +1376,27 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
uint64_t Mask = N.getConstantOperandVal(1);
// Try to fold the mask and shift into an extract and scale.
- if (!FoldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
+ if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
return false;
// Try to fold the mask and shift directly into the scale.
- if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
+ if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
return false;
// Try to swap the mask and shift to place shifts which can be done as
// a scale on the outside of the mask.
- if (!FoldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM))
+ if (!foldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM))
return false;
break;
}
}
- return MatchAddressBase(N, AM);
+ return matchAddressBase(N, AM);
}
-/// MatchAddressBase - Helper for MatchAddress. Add the specified node to the
+/// Helper for MatchAddress. Add the specified node to the
/// specified addressing mode without any further recursion.
-bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) {
+bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
// Is the base register already occupied?
if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
// If so, check to see if the scale index register is set.
@@ -1339,7 +1416,7 @@ bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) {
return false;
}
-bool X86DAGToDAGISel::SelectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
+bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index,
SDValue &Disp, SDValue &Segment) {
@@ -1362,7 +1439,7 @@ bool X86DAGToDAGISel::SelectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
// If Base is 0, the whole address is in index and the Scale is 1
if (isa<ConstantSDNode>(Base)) {
- assert(dyn_cast<ConstantSDNode>(Base)->isNullValue() &&
+ assert(cast<ConstantSDNode>(Base)->isNullValue() &&
"Unexpected base in gather/scatter");
Scale = getI8Imm(1, DL);
Base = CurDAG->getRegister(0, MVT::i32);
@@ -1375,14 +1452,14 @@ bool X86DAGToDAGISel::SelectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
return true;
}
-/// SelectAddr - returns true if it is able pattern match an addressing mode.
+/// Returns true if it is able to pattern match an addressing mode.
/// It returns the operands which make up the maximal addressing mode it can
/// match by reference.
///
/// Parent is the parent node of the addr operand that is being matched. It
/// is always a load, store, atomic node, or null. It is only null when
/// checking memory operands for inline asm nodes.
-bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base,
+bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index,
SDValue &Disp, SDValue &Segment) {
X86ISelAddressMode AM;
@@ -1404,7 +1481,7 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base,
AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
}
- if (MatchAddress(N, AM))
+ if (matchAddress(N, AM))
return false;
MVT VT = N.getSimpleValueType();
@@ -1420,14 +1497,14 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base,
return true;
}
-/// SelectScalarSSELoad - Match a scalar SSE load. In particular, we want to
-/// match a load whose top elements are either undef or zeros. The load flavor
-/// is derived from the type of N, which is either v4f32 or v2f64.
+/// Match a scalar SSE load. In particular, we want to match a load whose top
+/// elements are either undef or zeros. The load flavor is derived from the
+/// type of N, which is either v4f32 or v2f64.
///
/// We also return:
/// PatternChainNode: this is the matched node that has a chain input and
/// output.
-bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root,
+bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index,
SDValue &Disp, SDValue &Segment,
@@ -1439,7 +1516,7 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root,
IsProfitableToFold(N.getOperand(0), N.getNode(), Root) &&
IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) {
LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
- if (!SelectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
+ if (!selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
return false;
return true;
}
@@ -1457,7 +1534,7 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root,
IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) {
// Okay, this is a zero extending load. Fold it.
LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0));
- if (!SelectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
+ if (!selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
return false;
PatternNodeWithChain = SDValue(LD, 0);
return true;
@@ -1466,7 +1543,7 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root,
}
-bool X86DAGToDAGISel::SelectMOV64Imm32(SDValue N, SDValue &Imm) {
+bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
uint64_t ImmVal = CN->getZExtValue();
if ((uint32_t)ImmVal != (uint64_t)ImmVal)
@@ -1495,10 +1572,10 @@ bool X86DAGToDAGISel::SelectMOV64Imm32(SDValue N, SDValue &Imm) {
return TM.getCodeModel() == CodeModel::Small;
}
-bool X86DAGToDAGISel::SelectLEA64_32Addr(SDValue N, SDValue &Base,
+bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index,
SDValue &Disp, SDValue &Segment) {
- if (!SelectLEAAddr(N, Base, Scale, Index, Disp, Segment))
+ if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
return false;
SDLoc DL(N);
@@ -1533,9 +1610,9 @@ bool X86DAGToDAGISel::SelectLEA64_32Addr(SDValue N, SDValue &Base,
return true;
}
-/// SelectLEAAddr - it calls SelectAddr and determines if the maximal addressing
+/// Calls SelectAddr and determines if the maximal addressing
/// mode it matches can be cost effectively emitted as an LEA instruction.
-bool X86DAGToDAGISel::SelectLEAAddr(SDValue N,
+bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment) {
@@ -1546,7 +1623,7 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDValue N,
SDValue Copy = AM.Segment;
SDValue T = CurDAG->getRegister(0, MVT::i32);
AM.Segment = T;
- if (MatchAddress(N, AM))
+ if (matchAddress(N, AM))
return false;
assert (T == AM.Segment);
AM.Segment = Copy;
@@ -1572,13 +1649,12 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDValue N,
Complexity++;
// FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
- // to a LEA. This is determined with some expermentation but is by no means
+ // to a LEA. This is determined with some experimentation but is by no means
// optimal (especially for code size consideration). LEA is nice because of
// its three-address nature. Tweak the cost function again when we can run
// convertToThreeAddress() at register allocation time.
if (AM.hasSymbolicDisplacement()) {
- // For X86-64, we should always use lea to materialize RIP relative
- // addresses.
+ // For X86-64, always use LEA to materialize RIP-relative addresses.
if (Subtarget->is64Bit())
Complexity = 4;
else
@@ -1596,8 +1672,8 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDValue N,
return true;
}
-/// SelectTLSADDRAddr - This is only run on TargetGlobalTLSAddress nodes.
-bool X86DAGToDAGISel::SelectTLSADDRAddr(SDValue N, SDValue &Base,
+/// This is only run on TargetGlobalTLSAddress nodes.
+bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index,
SDValue &Disp, SDValue &Segment) {
assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
@@ -1621,7 +1697,7 @@ bool X86DAGToDAGISel::SelectTLSADDRAddr(SDValue N, SDValue &Base,
}
-bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N,
+bool X86DAGToDAGISel::tryFoldLoad(SDNode *P, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment) {
@@ -1630,14 +1706,13 @@ bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N,
!IsLegalToFold(N, P, P, OptLevel))
return false;
- return SelectAddr(N.getNode(),
+ return selectAddr(N.getNode(),
N.getOperand(1), Base, Scale, Index, Disp, Segment);
}
-/// getGlobalBaseReg - Return an SDNode that returns the value of
-/// the global base register. Output instructions required to
-/// initialize the global base register, if necessary.
-///
+/// Return an SDNode that returns the value of the global base register.
+/// Output instructions required to initialize the global base register,
+/// if necessary.
SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
auto &DL = MF->getDataLayout();
@@ -1828,7 +1903,7 @@ static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG,
return Val;
}
-SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) {
+SDNode *X86DAGToDAGISel::selectAtomicLoadArith(SDNode *Node, MVT NVT) {
if (Node->hasAnyUseOfValue(0))
return nullptr;
@@ -1841,7 +1916,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) {
SDValue Ptr = Node->getOperand(1);
SDValue Val = Node->getOperand(2);
SDValue Base, Scale, Index, Disp, Segment;
- if (!SelectAddr(Node, Ptr, Base, Scale, Index, Disp, Segment))
+ if (!selectAddr(Node, Ptr, Base, Scale, Index, Disp, Segment))
return nullptr;
// Which index into the table.
@@ -1933,9 +2008,9 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) {
return CurDAG->getMergeValues(RetVals, dl).getNode();
}
-/// HasNoSignedComparisonUses - Test whether the given X86ISD::CMP node has
-/// any uses which require the SF or OF bits to be accurate.
-static bool HasNoSignedComparisonUses(SDNode *N) {
+/// Test whether the given X86ISD::CMP node has any uses which require the SF
+/// or OF bits to be accurate.
+static bool hasNoSignedComparisonUses(SDNode *N) {
// Examine each user of the node.
for (SDNode::use_iterator UI = N->use_begin(),
UE = N->use_end(); UI != UE; ++UI) {
@@ -1995,9 +2070,8 @@ static bool HasNoSignedComparisonUses(SDNode *N) {
return true;
}
-/// isLoadIncOrDecStore - Check whether or not the chain ending in StoreNode
-/// is suitable for doing the {load; increment or decrement; store} to modify
-/// transformation.
+/// Check whether or not the chain ending in StoreNode is suitable for doing
+/// the {load; increment or decrement; store} to modify transformation.
static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
SDValue StoredVal, SelectionDAG *CurDAG,
LoadSDNode* &LoadNode, SDValue &InputChain) {
@@ -2081,8 +2155,8 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
return true;
}
-/// getFusedLdStOpcode - Get the appropriate X86 opcode for an in memory
-/// increment or decrement. Opc should be X86ISD::DEC or X86ISD::INC.
+/// Get the appropriate X86 opcode for an in-memory increment or decrement.
+/// Opc should be X86ISD::DEC or X86ISD::INC.
static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) {
if (Opc == X86ISD::DEC) {
if (LdVT == MVT::i64) return X86::DEC64m;
@@ -2099,9 +2173,8 @@ static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) {
llvm_unreachable("unrecognized size for LdVT");
}
-/// SelectGather - Customized ISel for GATHER operations.
-///
-SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) {
+/// Customized ISel for GATHER operations.
+SDNode *X86DAGToDAGISel::selectGather(SDNode *Node, unsigned Opc) {
// Operands of Gather: VSrc, Base, VIdx, VMask, Scale
SDValue Chain = Node->getOperand(0);
SDValue VSrc = Node->getOperand(2);
@@ -2148,6 +2221,27 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
switch (Opcode) {
default: break;
+ case ISD::BRIND: {
+ if (Subtarget->isTargetNaCl())
+ // NaCl has its own pass that converts jmp %r32 to jmp %r64, so we leave
+ // the instruction alone here.
+ break;
+ if (Subtarget->isTarget64BitILP32()) {
+ // Converts a 32-bit register to a 64-bit, zero-extended version of
+ // it. This is needed because x86-64 can do many things, but jmp %r32
+ // ain't one of them.
+ const SDValue &Target = Node->getOperand(1);
+ assert(Target.getSimpleValueType() == llvm::MVT::i32);
+ SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64));
+ SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
+ Node->getOperand(0), ZextTarget);
+ ReplaceUses(SDValue(Node, 0), Brind);
+ SelectCode(ZextTarget.getNode());
+ SelectCode(Brind.getNode());
+ return nullptr;
+ }
+ break;
+ }
case ISD::INTRINSIC_W_CHAIN: {
unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
switch (IntNo) {
@@ -2190,7 +2284,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::x86_avx2_gather_q_d: Opc = X86::VPGATHERQDrm; break;
case Intrinsic::x86_avx2_gather_q_d_256: Opc = X86::VPGATHERQDYrm; break;
}
- SDNode *RetVal = SelectGather(Node, Opc);
+ SDNode *RetVal = selectGather(Node, Opc);
if (RetVal)
// We already called ReplaceUses inside SelectGather.
return nullptr;
@@ -2217,7 +2311,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
case ISD::ATOMIC_LOAD_AND:
case ISD::ATOMIC_LOAD_OR:
case ISD::ATOMIC_LOAD_ADD: {
- SDNode *RetVal = SelectAtomicLoadArith(Node, NVT);
+ SDNode *RetVal = selectAtomicLoadArith(Node, NVT);
if (RetVal)
return RetVal;
break;
@@ -2404,10 +2498,10 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
}
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
- bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
// Multiply is commutative.
if (!foldedLoad) {
- foldedLoad = TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
if (foldedLoad)
std::swap(N0, N1);
}
@@ -2549,7 +2643,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
}
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
- bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
bool signBitIsZero = CurDAG->SignBitIsZero(N0);
SDValue InFlag;
@@ -2557,7 +2651,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
// Special case for div8, just use a move with zero extension to AX to
// clear the upper 8 bits (AH).
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Move, Chain;
- if (TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+ if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
Move =
SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32,
@@ -2692,7 +2786,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
SDValue N1 = Node->getOperand(1);
if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
- HasNoSignedComparisonUses(Node))
+ hasNoSignedComparisonUses(Node))
N0 = N0.getOperand(0);
// Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
@@ -2709,7 +2803,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
// For example, convert "testl %eax, $8" to "testb %al, $8"
if ((C->getZExtValue() & ~UINT64_C(0xff)) == 0 &&
(!(C->getZExtValue() & 0x80) ||
- HasNoSignedComparisonUses(Node))) {
+ hasNoSignedComparisonUses(Node))) {
SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i8);
SDValue Reg = N0.getNode()->getOperand(0);
@@ -2743,7 +2837,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
// For example, "testl %eax, $2048" to "testb %ah, $8".
if ((C->getZExtValue() & ~UINT64_C(0xff00)) == 0 &&
(!(C->getZExtValue() & 0x8000) ||
- HasNoSignedComparisonUses(Node))) {
+ hasNoSignedComparisonUses(Node))) {
// Shift the immediate right by 8 bits.
SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8,
dl, MVT::i8);
@@ -2781,7 +2875,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
if ((C->getZExtValue() & ~UINT64_C(0xffff)) == 0 &&
N0.getValueType() != MVT::i16 &&
(!(C->getZExtValue() & 0x8000) ||
- HasNoSignedComparisonUses(Node))) {
+ hasNoSignedComparisonUses(Node))) {
SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl,
MVT::i16);
SDValue Reg = N0.getNode()->getOperand(0);
@@ -2804,7 +2898,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
if ((C->getZExtValue() & ~UINT64_C(0xffffffff)) == 0 &&
N0.getValueType() == MVT::i64 &&
(!(C->getZExtValue() & 0x80000000) ||
- HasNoSignedComparisonUses(Node))) {
+ hasNoSignedComparisonUses(Node))) {
SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl,
MVT::i32);
SDValue Reg = N0.getNode()->getOperand(0);
@@ -2854,7 +2948,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
break;
SDValue Base, Scale, Index, Disp, Segment;
- if (!SelectAddr(LoadNode, LoadNode->getBasePtr(),
+ if (!selectAddr(LoadNode, LoadNode->getBasePtr(),
Base, Scale, Index, Disp, Segment))
break;
@@ -2903,7 +2997,7 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
case InlineAsm::Constraint_v: // not offsetable ??
case InlineAsm::Constraint_m: // memory
case InlineAsm::Constraint_X:
- if (!SelectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
+ if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
return true;
break;
}
@@ -2916,9 +3010,8 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
return false;
}
-/// createX86ISelDag - This pass converts a legalized DAG into a
-/// X86-specific DAG, ready for instruction scheduling.
-///
+/// This pass converts a legalized DAG into an X86-specific DAG,
+/// ready for instruction scheduling.
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
CodeGenOpt::Level OptLevel) {
return new X86DAGToDAGISel(TM, OptLevel);
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 0f29b514146c..0927c2f4fa50 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -25,6 +25,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -67,19 +68,14 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization(
"rather than promotion."),
cl::Hidden);
-// Forward declarations.
-static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
- SDValue V2);
-
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86Subtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
- TD = TM.getDataLayout();
+ MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
// Set up the TargetLowering object.
- static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
// X86 is weird. It always uses i8 for shift amounts and setcc results.
setBooleanContents(ZeroOrOneBooleanContent);
@@ -118,13 +114,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
-
- // The _ftol2 runtime function has an unusual calling conv, which
- // is modeled by a special pseudo-instruction.
- setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
- setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
- setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
- setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
}
if (Subtarget->isTargetDarwin()) {
@@ -175,14 +164,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
if (Subtarget->is64Bit()) {
- setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
+ if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512())
+ // f32/f64 are legal, f80 is custom.
+ setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
+ else
+ setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
} else if (!Subtarget->useSoftFloat()) {
// We have an algorithm for SSE2->double, and we turn this into a
// 64-bit FILD followed by conditional FADD for other targets.
setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
// We have an algorithm for SSE2, and we turn this into a 64-bit
- // FILD for other targets.
+ // FILD or VCVTUSI2SS/SD for other targets.
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
}
@@ -206,23 +199,29 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
}
- // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
- // are Legal, f80 is custom lowered.
- setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
- setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
-
// Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
// this operation.
setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
- if (X86ScalarSSEf32) {
- setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
- // f32 and f64 cases are Legal, f80 case is not
- setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
+ if (!Subtarget->useSoftFloat()) {
+ // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
+ // are Legal, f80 is custom lowered.
+ setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
+
+ if (X86ScalarSSEf32) {
+ setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
+ // f32 and f64 cases are Legal, f80 case is not
+ setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
+ } else {
+ setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
+ }
} else {
- setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
- setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
}
// Handle FP_TO_UINT by promoting the destination to a larger signed
@@ -232,8 +231,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
if (Subtarget->is64Bit()) {
- setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
- setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
+ if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) {
+ // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
+ } else {
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
+ }
} else if (!Subtarget->useSoftFloat()) {
// Since AVX is a superset of SSE3, only check for SSE here.
if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
@@ -242,14 +247,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// the optimal thing for SSE vs. the default expansion in the legalizer.
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
else
+ // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
// With SSE3 we can use fisttpll to convert to a signed i64; without
// SSE, we're stuck with a fistpll.
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
- }
- if (isTargetFTOL()) {
- // Use the _ftol2 runtime function, which has a pseudo-instruction
- // to handle its weird calling convention.
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
}
@@ -274,8 +276,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// (low) operations are left as Legal, as there are single-result
// instructions for this in x86. Using the two-result multiply instructions
// when both high and low results are needed must be arranged by dagcombine.
- for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
- MVT VT = IntVTs[i];
+ for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::SDIV, VT, Expand);
@@ -295,6 +296,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::BR_CC , MVT::f32, Expand);
setOperationAction(ISD::BR_CC , MVT::f64, Expand);
setOperationAction(ISD::BR_CC , MVT::f80, Expand);
+ setOperationAction(ISD::BR_CC , MVT::f128, Expand);
setOperationAction(ISD::BR_CC , MVT::i8, Expand);
setOperationAction(ISD::BR_CC , MVT::i16, Expand);
setOperationAction(ISD::BR_CC , MVT::i32, Expand);
@@ -302,6 +304,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SELECT_CC , MVT::f32, Expand);
setOperationAction(ISD::SELECT_CC , MVT::f64, Expand);
setOperationAction(ISD::SELECT_CC , MVT::f80, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::f128, Expand);
setOperationAction(ISD::SELECT_CC , MVT::i8, Expand);
setOperationAction(ISD::SELECT_CC , MVT::i16, Expand);
setOperationAction(ISD::SELECT_CC , MVT::i32, Expand);
@@ -312,7 +315,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
- setOperationAction(ISD::FREM , MVT::f32 , Expand);
+
+ if (Subtarget->is32Bit() && Subtarget->isTargetKnownWindowsMSVC()) {
+ // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
+ // is. We should promote the value to 64 bits to solve this.
+ // This is what the CRT headers do - `fmodf` is an inline header
+ // function casting to f64 and calling `fmod`.
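+ // Conceptually (illustrative only), fmodf(x, y) becomes
+ // (float)fmod((double)x, (double)y).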
+ setOperationAction(ISD::FREM , MVT::f32 , Promote);
+ } else {
+ setOperationAction(ISD::FREM , MVT::f32 , Expand);
+ }
+
setOperationAction(ISD::FREM , MVT::f64 , Expand);
setOperationAction(ISD::FREM , MVT::f80 , Expand);
setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
@@ -404,15 +417,21 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SELECT , MVT::f32 , Custom);
setOperationAction(ISD::SELECT , MVT::f64 , Custom);
setOperationAction(ISD::SELECT , MVT::f80 , Custom);
+ setOperationAction(ISD::SELECT , MVT::f128 , Custom);
setOperationAction(ISD::SETCC , MVT::i8 , Custom);
setOperationAction(ISD::SETCC , MVT::i16 , Custom);
setOperationAction(ISD::SETCC , MVT::i32 , Custom);
setOperationAction(ISD::SETCC , MVT::f32 , Custom);
setOperationAction(ISD::SETCC , MVT::f64 , Custom);
setOperationAction(ISD::SETCC , MVT::f80 , Custom);
+ setOperationAction(ISD::SETCC , MVT::f128 , Custom);
+ setOperationAction(ISD::SETCCE , MVT::i8 , Custom);
+ setOperationAction(ISD::SETCCE , MVT::i16 , Custom);
+ setOperationAction(ISD::SETCCE , MVT::i32 , Custom);
if (Subtarget->is64Bit()) {
setOperationAction(ISD::SELECT , MVT::i64 , Custom);
setOperationAction(ISD::SETCC , MVT::i64 , Custom);
+ setOperationAction(ISD::SETCCE , MVT::i64 , Custom);
}
setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
// NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
@@ -456,8 +475,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
// Expand certain atomics
- for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
- MVT VT = IntVTs[i];
+ for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
@@ -473,13 +491,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
}
- if (Subtarget->is64Bit()) {
- setExceptionPointerRegister(X86::RAX);
- setExceptionSelectorRegister(X86::RDX);
- } else {
- setExceptionPointerRegister(X86::EAX);
- setExceptionSelectorRegister(X86::EDX);
- }
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
@@ -492,8 +503,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
setOperationAction(ISD::VAEND , MVT::Other, Expand);
- if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
- // TargetInfo::X86_64ABIBuiltinVaList
+ if (Subtarget->is64Bit()) {
setOperationAction(ISD::VAARG , MVT::Other, Custom);
setOperationAction(ISD::VACOPY , MVT::Other, Custom);
} else {
@@ -505,7 +515,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(*TD), Custom);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
// GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
@@ -613,8 +623,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMA, MVT::f64, Expand);
setOperationAction(ISD::FMA, MVT::f32, Expand);
- // Long double always uses X87.
+ // Long double always uses X87, except f128 in MMX.
if (!Subtarget->useSoftFloat()) {
+ if (Subtarget->is64Bit() && Subtarget->hasMMX()) {
+ addRegisterClass(MVT::f128, &X86::FR128RegClass);
+ ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
+ setOperationAction(ISD::FABS , MVT::f128, Custom);
+ setOperationAction(ISD::FNEG , MVT::f128, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
+ }
+
addRegisterClass(MVT::f80, &X86::RFP80RegClass);
setOperationAction(ISD::UNDEF, MVT::f80, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
@@ -846,15 +864,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
+ // ISD::CTTZ v2i64 - scalarization is faster.
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
+ // ISD::CTTZ_ZERO_UNDEF v2i64 - scalarization is faster.
+
// Custom lower build_vector, vector_shuffle, and extract_vector_elt.
- for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
- MVT VT = (MVT::SimpleValueType)i;
- // Do not attempt to custom lower non-power-of-2 vectors
- if (!isPowerOf2_32(VT.getVectorNumElements()))
- continue;
- // Do not attempt to custom lower non-128-bit vectors
- if (!VT.is128BitVector())
- continue;
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
@@ -892,13 +912,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
// Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
- for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
- MVT VT = (MVT::SimpleValueType)i;
-
- // Do not attempt to promote non-128-bit vectors
- if (!VT.is128BitVector())
- continue;
-
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
setOperationAction(ISD::AND, VT, Promote);
AddPromotedToType (ISD::AND, VT, MVT::v2i64);
setOperationAction(ISD::OR, VT, Promote);
@@ -1036,6 +1050,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SRA, MVT::v4i32, Custom);
}
+ if (Subtarget->hasXOP()) {
+ setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
+ setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
+ setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
+ setOperationAction(ISD::ROTL, MVT::v2i64, Custom);
+ setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
+ setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
+ setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
+ setOperationAction(ISD::ROTL, MVT::v4i64, Custom);
+ }
+
if (!Subtarget->useSoftFloat() && Subtarget->hasFp256()) {
addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
@@ -1126,7 +1151,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTPOP, MVT::v8i32, Custom);
setOperationAction(ISD::CTPOP, MVT::v4i64, Custom);
- if (Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512()) {
+ setOperationAction(ISD::CTTZ, MVT::v32i8, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v16i16, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v8i32, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v4i64, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v32i8, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i16, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
+
+ if (Subtarget->hasAnyFMA()) {
setOperationAction(ISD::FMA, MVT::v8f32, Legal);
setOperationAction(ISD::FMA, MVT::v4f64, Legal);
setOperationAction(ISD::FMA, MVT::v4f32, Legal);
@@ -1202,6 +1236,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::v8i32, Custom);
setOperationAction(ISD::MUL, MVT::v16i16, Custom);
setOperationAction(ISD::MUL, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::SMAX, MVT::v32i8, Custom);
+ setOperationAction(ISD::SMAX, MVT::v16i16, Custom);
+ setOperationAction(ISD::SMAX, MVT::v8i32, Custom);
+ setOperationAction(ISD::UMAX, MVT::v32i8, Custom);
+ setOperationAction(ISD::UMAX, MVT::v16i16, Custom);
+ setOperationAction(ISD::UMAX, MVT::v8i32, Custom);
+ setOperationAction(ISD::SMIN, MVT::v32i8, Custom);
+ setOperationAction(ISD::SMIN, MVT::v16i16, Custom);
+ setOperationAction(ISD::SMIN, MVT::v8i32, Custom);
+ setOperationAction(ISD::UMIN, MVT::v32i8, Custom);
+ setOperationAction(ISD::UMIN, MVT::v16i16, Custom);
+ setOperationAction(ISD::UMIN, MVT::v8i32, Custom);
}
// In the customized shift lowering, the legal cases in AVX2 will be
@@ -1243,15 +1290,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget->hasInt256())
setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
-
// Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
- for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
- MVT VT = (MVT::SimpleValueType)i;
-
- // Do not attempt to promote non-256-bit vectors
- if (!VT.is256BitVector())
- continue;
-
+ for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
setOperationAction(ISD::AND, VT, Promote);
AddPromotedToType (ISD::AND, VT, MVT::v4i64);
setOperationAction(ISD::OR, VT, Promote);
@@ -1293,6 +1333,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
setOperationAction(ISD::SETCC, MVT::i1, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
setOperationAction(ISD::XOR, MVT::i1, Legal);
setOperationAction(ISD::OR, MVT::i1, Legal);
setOperationAction(ISD::AND, MVT::i1, Legal);
@@ -1311,6 +1352,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FDIV, MVT::v16f32, Legal);
setOperationAction(ISD::FSQRT, MVT::v16f32, Legal);
setOperationAction(ISD::FNEG, MVT::v16f32, Custom);
+ setOperationAction(ISD::FABS, MVT::v16f32, Custom);
setOperationAction(ISD::FADD, MVT::v8f64, Legal);
setOperationAction(ISD::FSUB, MVT::v8f64, Legal);
@@ -1318,19 +1360,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FDIV, MVT::v8f64, Legal);
setOperationAction(ISD::FSQRT, MVT::v8f64, Legal);
setOperationAction(ISD::FNEG, MVT::v8f64, Custom);
+ setOperationAction(ISD::FABS, MVT::v8f64, Custom);
setOperationAction(ISD::FMA, MVT::v8f64, Legal);
setOperationAction(ISD::FMA, MVT::v16f32, Legal);
- setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
- setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
- setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
- if (Subtarget->is64Bit()) {
- setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal);
- setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal);
- setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal);
- }
setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
@@ -1348,12 +1381,62 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
+ setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
+ setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
+ setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
+    if (Subtarget->hasVLX()) {
+ setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
+ setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
+ setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
+
+ setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
+ } else {
+ setOperationAction(ISD::MLOAD, MVT::v8i32, Custom);
+ setOperationAction(ISD::MLOAD, MVT::v8f32, Custom);
+ setOperationAction(ISD::MSTORE, MVT::v8i32, Custom);
+ setOperationAction(ISD::MSTORE, MVT::v8f32, Custom);
+ }
setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom);
if (Subtarget->hasDQI()) {
- setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom);
+
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
+ if (Subtarget->hasVLX()) {
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
+ }
+ }
+ if (Subtarget->hasVLX()) {
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
}
setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
@@ -1386,7 +1469,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
setOperationAction(ISD::SETCC, MVT::v8i1, Custom);
@@ -1395,6 +1478,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
@@ -1439,9 +1523,49 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::XOR, MVT::v16i32, Legal);
if (Subtarget->hasCDI()) {
- setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
+ setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
- }
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i64, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i32, Expand);
+
+ setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v16i16, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v32i8, Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i16, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i8, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i16, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i8, Expand);
+
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom);
+
+ if (Subtarget->hasVLX()) {
+ setOperationAction(ISD::CTLZ, MVT::v4i64, Legal);
+ setOperationAction(ISD::CTLZ, MVT::v8i32, Legal);
+ setOperationAction(ISD::CTLZ, MVT::v2i64, Legal);
+ setOperationAction(ISD::CTLZ, MVT::v4i32, Legal);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand);
+
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
+ } else {
+ setOperationAction(ISD::CTLZ, MVT::v4i64, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v8i32, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand);
+ }
+ } // Subtarget->hasCDI()
+
if (Subtarget->hasDQI()) {
setOperationAction(ISD::MUL, MVT::v2i64, Legal);
setOperationAction(ISD::MUL, MVT::v4i64, Legal);
@@ -1455,7 +1579,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::OR, VT, Legal);
setOperationAction(ISD::XOR, VT, Legal);
}
- if (EltSize >= 32 && VT.getSizeInBits() <= 512) {
+ if ((VT.is128BitVector() || VT.is256BitVector()) && EltSize >= 32) {
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
}
@@ -1481,15 +1605,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
+ setOperationAction(ISD::MGATHER, VT, Legal);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
}
}
- for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
- MVT VT = (MVT::SimpleValueType)i;
-
- // Do not attempt to promote non-512-bit vectors.
- if (!VT.is512BitVector())
- continue;
-
+ for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
setOperationAction(ISD::SELECT, VT, Promote);
AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
}
@@ -1515,22 +1635,35 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
setOperationAction(ISD::VSELECT, MVT::v32i16, Legal);
setOperationAction(ISD::VSELECT, MVT::v64i8, Legal);
setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
setOperationAction(ISD::SMAX, MVT::v64i8, Legal);
setOperationAction(ISD::SMAX, MVT::v32i16, Legal);
@@ -1541,19 +1674,31 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMIN, MVT::v64i8, Legal);
setOperationAction(ISD::UMIN, MVT::v32i16, Legal);
- for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
- const MVT VT = (MVT::SimpleValueType)i;
+ setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
+ setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
+ if (Subtarget->hasVLX())
+ setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
- const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+ if (Subtarget->hasCDI()) {
+ setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i16, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v64i8, Expand);
+ }
- // Do not attempt to promote non-512-bit vectors.
- if (!VT.is512BitVector())
- continue;
+ for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
- if (EltSize < 32) {
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Legal);
- }
+ setOperationAction(ISD::AND, VT, Promote);
+ AddPromotedToType (ISD::AND, VT, MVT::v8i64);
+ setOperationAction(ISD::OR, VT, Promote);
+ AddPromotedToType (ISD::OR, VT, MVT::v8i64);
+ setOperationAction(ISD::XOR, VT, Promote);
+ AddPromotedToType (ISD::XOR, VT, MVT::v8i64);
}
}
@@ -1571,6 +1716,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v2i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i1, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i1, Custom);
setOperationAction(ISD::AND, MVT::v8i32, Legal);
setOperationAction(ISD::OR, MVT::v8i32, Legal);
@@ -1595,8 +1742,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
- if (!Subtarget->is64Bit())
+ if (!Subtarget->is64Bit()) {
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
+ }
// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
// handle type legalization for these operations here.
@@ -1604,9 +1753,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// FIXME: We really should do custom legalization for addition and
// subtraction on x86-32 once PR3203 is fixed. We really can't do much better
// than generic legalization for 64-bit multiplication-with-overflow, though.
- for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
+ for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+ if (VT == MVT::i64 && !Subtarget->is64Bit())
+ continue;
// Add/Sub/Mul with overflow operations are custom lowered.
- MVT VT = IntVTs[i];
setOperationAction(ISD::SADDO, VT, Custom);
setOperationAction(ISD::UADDO, VT, Custom);
setOperationAction(ISD::SSUBO, VT, Custom);
@@ -1615,7 +1765,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMULO, VT, Custom);
}
-
if (!Subtarget->is64Bit()) {
// These libcalls are not available in 32-bit.
setLibcallName(RTLIB::SHL_I128, nullptr);
@@ -1658,12 +1807,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
+ setTargetDAGCombine(ISD::FNEG);
setTargetDAGCombine(ISD::FMA);
+ setTargetDAGCombine(ISD::FMINNUM);
+ setTargetDAGCombine(ISD::FMAXNUM);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::MLOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::MSTORE);
+ setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
@@ -1671,24 +1824,24 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
- setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::BUILD_VECTOR);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
+ setTargetDAGCombine(ISD::MSCATTER);
+ setTargetDAGCombine(ISD::MGATHER);
computeRegisterProperties(Subtarget->getRegisterInfo());
- // On Darwin, -Os means optimize for size without hurting performance,
- // do not reduce the limit.
MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
- MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
+ MaxStoresPerMemsetOptSize = 8;
MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
- MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
+ MaxStoresPerMemcpyOptSize = 4;
MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
- MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
+ MaxStoresPerMemmoveOptSize = 4;
setPrefLoopAlignment(4); // 2^4 bytes.
- // Predictable cmov don't hurt on atom because it's in-order.
+ // A predictable cmov does not hurt on an in-order CPU.
+ // FIXME: Use a CPU attribute to trigger this, not a CPU model.
PredictableSelectIsExpensive = !Subtarget->isAtom();
EnableExtLdPromotion = true;
setPrefFunctionAlignment(4); // 2^4 bytes.
@@ -1716,40 +1869,43 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
if (!VT.isVector())
return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
- const unsigned NumElts = VT.getVectorNumElements();
- const EVT EltVT = VT.getVectorElementType();
- if (VT.is512BitVector()) {
- if (Subtarget->hasAVX512())
- if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
- EltVT == MVT::f32 || EltVT == MVT::f64)
- switch(NumElts) {
- case 8: return MVT::v8i1;
- case 16: return MVT::v16i1;
- }
- if (Subtarget->hasBWI())
- if (EltVT == MVT::i8 || EltVT == MVT::i16)
- switch(NumElts) {
- case 32: return MVT::v32i1;
- case 64: return MVT::v64i1;
- }
- }
+ if (VT.isSimple()) {
+ MVT VVT = VT.getSimpleVT();
+ const unsigned NumElts = VVT.getVectorNumElements();
+ const MVT EltVT = VVT.getVectorElementType();
+ if (VVT.is512BitVector()) {
+ if (Subtarget->hasAVX512())
+ if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
+ EltVT == MVT::f32 || EltVT == MVT::f64)
+ switch(NumElts) {
+ case 8: return MVT::v8i1;
+ case 16: return MVT::v16i1;
+ }
+ if (Subtarget->hasBWI())
+ if (EltVT == MVT::i8 || EltVT == MVT::i16)
+ switch(NumElts) {
+ case 32: return MVT::v32i1;
+ case 64: return MVT::v64i1;
+ }
+ }
- if (VT.is256BitVector() || VT.is128BitVector()) {
- if (Subtarget->hasVLX())
- if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
- EltVT == MVT::f32 || EltVT == MVT::f64)
- switch(NumElts) {
- case 2: return MVT::v2i1;
- case 4: return MVT::v4i1;
- case 8: return MVT::v8i1;
- }
- if (Subtarget->hasBWI() && Subtarget->hasVLX())
- if (EltVT == MVT::i8 || EltVT == MVT::i16)
- switch(NumElts) {
- case 8: return MVT::v8i1;
- case 16: return MVT::v16i1;
- case 32: return MVT::v32i1;
- }
+ if (VVT.is256BitVector() || VVT.is128BitVector()) {
+ if (Subtarget->hasVLX())
+ if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
+ EltVT == MVT::f32 || EltVT == MVT::f64)
+ switch(NumElts) {
+ case 2: return MVT::v2i1;
+ case 4: return MVT::v4i1;
+ case 8: return MVT::v8i1;
+ }
+ if (Subtarget->hasBWI() && Subtarget->hasVLX())
+ if (EltVT == MVT::i8 || EltVT == MVT::i16)
+ switch(NumElts) {
+ case 8: return MVT::v8i1;
+ case 16: return MVT::v16i1;
+ case 32: return MVT::v32i1;
+ }
+ }
}
return VT.changeVectorElementTypeToInteger();
@@ -1769,9 +1925,9 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
} else if (StructType *STy = dyn_cast<StructType>(Ty)) {
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ for (auto *EltTy : STy->elements()) {
unsigned EltAlign = 0;
- getMaxByValAlign(STy->getElementType(i), EltAlign);
+ getMaxByValAlign(EltTy, EltAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
if (MaxAlign == 16)
@@ -1821,10 +1977,11 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
if ((!IsMemset || ZeroMemset) &&
!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
if (Size >= 16 &&
- (Subtarget->isUnalignedMemAccessFast() ||
+ (!Subtarget->isUnalignedMem16Slow() ||
((DstAlign == 0 || DstAlign >= 16) &&
(SrcAlign == 0 || SrcAlign >= 16)))) {
if (Size >= 32) {
+ // FIXME: Check if unaligned 32-byte accesses are slow.
if (Subtarget->hasInt256())
return MVT::v8i32;
if (Subtarget->hasFp256())
@@ -1842,6 +1999,9 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
return MVT::f64;
}
}
+ // This is a compromise. If we reach here, unaligned accesses may be slow on
+ // this target. However, creating smaller, aligned accesses could be even
+ // slower and would certainly be a lot more code.
if (Subtarget->is64Bit() && Size >= 8)
return MVT::i64;
return MVT::i32;
@@ -1860,8 +2020,22 @@ X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
unsigned,
unsigned,
bool *Fast) const {
- if (Fast)
- *Fast = Subtarget->isUnalignedMemAccessFast();
+ if (Fast) {
+ switch (VT.getSizeInBits()) {
+ default:
+ // 8-byte and under are always assumed to be fast.
+ *Fast = true;
+ break;
+ case 128:
+ *Fast = !Subtarget->isUnalignedMem16Slow();
+ break;
+ case 256:
+ *Fast = !Subtarget->isUnalignedMem32Slow();
+ break;
+ // TODO: What about AVX-512 (512-bit) accesses?
+ }
+ }
+ // Misaligned accesses of any size are always allowed.
return true;
}
@@ -1964,6 +2138,32 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
return true;
}
+Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
+ if (!Subtarget->isTargetAndroid())
+ return TargetLowering::getSafeStackPointerLocation(IRB);
+
+ // Android provides a fixed TLS slot for the SafeStack pointer. See the
+ // definition of TLS_SLOT_SAFESTACK in
+ // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+ unsigned AddressSpace, Offset;
+ if (Subtarget->is64Bit()) {
+    // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:0x48.
+ Offset = 0x48;
+ if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
+ AddressSpace = 256;
+ else
+ AddressSpace = 257;
+ } else {
+ // %gs:0x24 on i386
+ Offset = 0x24;
+ AddressSpace = 256;
+ }
+
+ return ConstantExpr::getIntToPtr(
+ ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
+ Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
+}
+
bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
assert(SrcAS != DestAS && "Expected different address spaces!");
@@ -1977,11 +2177,9 @@ bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
#include "X86GenCallingConv.inc"
-bool
-X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
- MachineFunction &MF, bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- LLVMContext &Context) const {
+bool X86TargetLowering::CanLowerReturn(
+ CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
return CCInfo.CheckReturn(Outs, RetCC_X86);
@@ -2001,6 +2199,9 @@ X86TargetLowering::LowerReturn(SDValue Chain,
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ if (CallConv == CallingConv::X86_INTR && !Outs.empty())
+ report_fatal_error("X86 interrupts may not return any value");
+
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
CCInfo.AnalyzeReturn(Outs, RetCC_X86);
@@ -2025,7 +2226,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
else if (VA.getLocInfo() == CCValAssign::ZExt)
ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
else if (VA.getLocInfo() == CCValAssign::AExt) {
- if (ValVT.isVector() && ValVT.getScalarType() == MVT::i1)
+ if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
else
ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
@@ -2114,7 +2315,10 @@ X86TargetLowering::LowerReturn(SDValue Chain,
if (Flag.getNode())
RetOps.push_back(Flag);
- return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
+ X86ISD::NodeType opcode = X86ISD::RET_FLAG;
+ if (CallConv == CallingConv::X86_INTR)
+ opcode = X86ISD::IRET;
+ return DAG.getNode(opcode, dl, MVT::Other, RetOps);
}
bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
@@ -2193,7 +2397,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
EVT CopyVT = VA.getLocVT();
// If this is x86-64, and we disabled SSE, we can't return FP values
- if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
+ if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
report_fatal_error("SSE register return with SSE disabled");
}
@@ -2244,28 +2448,28 @@ enum StructReturnType {
StackStructReturn
};
static StructReturnType
-callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
+callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
if (Outs.empty())
return NotStructReturn;
const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
if (!Flags.isSRet())
return NotStructReturn;
- if (Flags.isInReg())
+ if (Flags.isInReg() || IsMCU)
return RegStructReturn;
return StackStructReturn;
}
/// Determines whether a function uses struct return semantics.
static StructReturnType
-argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
+argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
if (Ins.empty())
return NotStructReturn;
const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
if (!Flags.isSRet())
return NotStructReturn;
- if (Flags.isInReg())
+ if (Flags.isInReg() || IsMCU)
return RegStructReturn;
return StackStructReturn;
}
@@ -2285,17 +2489,34 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
MachinePointerInfo(), MachinePointerInfo());
}
-/// Return true if the calling convention is one that
-/// supports tail call optimization.
-static bool IsTailCallConvention(CallingConv::ID CC) {
+/// Return true if the calling convention is one that we can guarantee TCO for.
+static bool canGuaranteeTCO(CallingConv::ID CC) {
return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
- CC == CallingConv::HiPE);
+ CC == CallingConv::HiPE || CC == CallingConv::HHVM);
}
-/// \brief Return true if the calling convention is a C calling convention.
-static bool IsCCallConvention(CallingConv::ID CC) {
- return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
- CC == CallingConv::X86_64_SysV);
+/// Return true if we might ever do TCO for calls with this calling convention.
+static bool mayTailCallThisCC(CallingConv::ID CC) {
+ switch (CC) {
+ // C calling conventions:
+ case CallingConv::C:
+ case CallingConv::X86_64_Win64:
+ case CallingConv::X86_64_SysV:
+ // Callee pop conventions:
+ case CallingConv::X86_ThisCall:
+ case CallingConv::X86_StdCall:
+ case CallingConv::X86_VectorCall:
+ case CallingConv::X86_FastCall:
+ return true;
+ default:
+ return canGuaranteeTCO(CC);
+ }
+}
+
+/// Return true if the function is being made into a tailcall target by
+/// changing its ABI.
+static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
+ return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
}
bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
@@ -2306,19 +2527,12 @@ bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
CallSite CS(CI);
CallingConv::ID CalleeCC = CS.getCallingConv();
- if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
+ if (!mayTailCallThisCC(CalleeCC))
return false;
return true;
}
-/// Return true if the function is being made into
-/// a tailcall target by changing its ABI.
-static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
- bool GuaranteedTailCallOpt) {
- return GuaranteedTailCallOpt && IsTailCallConvention(CC);
-}
-
SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain,
CallingConv::ID CallConv,
@@ -2329,7 +2543,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain,
unsigned i) const {
// Create the nodes corresponding to a load from this parameter slot.
ISD::ArgFlagsTy Flags = Ins[i].Flags;
- bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
+ bool AlwaysUseMutable = shouldGuaranteeTCO(
CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
EVT ValVT;
@@ -2344,6 +2558,19 @@ X86TargetLowering::LowerMemArgument(SDValue Chain,
else
ValVT = VA.getValVT();
+  // Calculate the SP offset of the interrupt parameter, rearranging the slot
+  // normally taken by the return address.
+ int Offset = 0;
+ if (CallConv == CallingConv::X86_INTR) {
+ const X86Subtarget& Subtarget =
+ static_cast<const X86Subtarget&>(DAG.getSubtarget());
+ // X86 interrupts may take one or two arguments.
+    // On the stack there will be no return address as in a regular call.
+    // The offset of the last argument needs to be set to -4/-8 bytes.
+    // The offset of the first argument (of two) should be set to 0 bytes.
+ Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
+ }
+
// FIXME: For now, all byval parameter objects are marked mutable. This can be
// changed with more analysis.
// In case of tail call optimization mark all arguments mutable. Since they
@@ -2352,14 +2579,24 @@ X86TargetLowering::LowerMemArgument(SDValue Chain,
unsigned Bytes = Flags.getByValSize();
if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
+ // Adjust SP offset of interrupt parameter.
+ if (CallConv == CallingConv::X86_INTR) {
+ MFI->setObjectOffset(FI, Offset);
+ }
return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
} else {
int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
VA.getLocMemOffset(), isImmutable);
+ // Adjust SP offset of interrupt parameter.
+ if (CallConv == CallingConv::X86_INTR) {
+ MFI->setObjectOffset(FI, Offset);
+ }
+
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
- SDValue Val = DAG.getLoad(ValVT, dl, Chain, FIN,
- MachinePointerInfo::getFixedStack(FI),
- false, false, false, 0);
+ SDValue Val = DAG.getLoad(
+ ValVT, dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false,
+ false, false, 0);
return ExtendedInMem ?
DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
}
@@ -2413,15 +2650,10 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
}
-SDValue
-X86TargetLowering::LowerFormalArguments(SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl,
- SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals)
- const {
+SDValue X86TargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
@@ -2436,9 +2668,17 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
bool Is64Bit = Subtarget->is64Bit();
bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
- assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
+ assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling convention fastcc, ghc or hipe");
+ if (CallConv == CallingConv::X86_INTR) {
+ bool isLegal = Ins.size() == 1 ||
+ (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
+ (!Is64Bit && Ins[1].VT == MVT::i32)));
+ if (!isLegal)
+ report_fatal_error("X86 interrupts may take one or two arguments");
+ }
+
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
@@ -2471,6 +2711,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
RC = &X86::FR32RegClass;
else if (RegVT == MVT::f64)
RC = &X86::FR64RegClass;
+ else if (RegVT == MVT::f128)
+ RC = &X86::FR128RegClass;
else if (RegVT.is512BitVector())
RC = &X86::VR512RegClass;
else if (RegVT.is256BitVector())
@@ -2547,8 +2789,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
unsigned StackSize = CCInfo.getNextStackOffset();
// Align stack specially for tail calls.
- if (FuncIsMadeTailCallSafe(CallConv,
- MF.getTarget().Options.GuaranteedTailCallOpt))
+ if (shouldGuaranteeTCO(CallConv,
+ MF.getTarget().Options.GuaranteedTailCallOpt))
StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
// If the function takes variable number of arguments, make a frame index for
@@ -2561,13 +2803,6 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
MFI->CreateFixedObject(1, StackSize, true));
}
- MachineModuleInfo &MMI = MF.getMMI();
- const Function *WinEHParent = nullptr;
- if (MMI.hasWinEHFuncInfo(Fn))
- WinEHParent = MMI.getWinEHParent(Fn);
- bool IsWinEHOutlined = WinEHParent && WinEHParent != Fn;
- bool IsWinEHParent = WinEHParent && WinEHParent == Fn;
-
// Figure out if XMM registers are in use.
assert(!(Subtarget->useSoftFloat() &&
Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
@@ -2631,10 +2866,11 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
RSFIN, DAG.getIntPtrConstant(Offset, dl));
SDValue Store =
- DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo::getFixedStack(
- FuncInfo->getRegSaveFrameIndex(), Offset),
- false, false, 0);
+ DAG.getStore(Val.getValue(1), dl, Val, FIN,
+ MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(),
+ FuncInfo->getRegSaveFrameIndex(), Offset),
+ false, false, 0);
MemOps.push_back(Store);
Offset += 8;
}
@@ -2656,27 +2892,6 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
if (!MemOps.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
- } else if (IsWin64 && IsWinEHOutlined) {
- // Get to the caller-allocated home save location. Add 8 to account
- // for the return address.
- int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
- FuncInfo->setRegSaveFrameIndex(MFI->CreateFixedObject(
- /*Size=*/1, /*SPOffset=*/HomeOffset + 8, /*Immutable=*/false));
-
- MMI.getWinEHFuncInfo(Fn)
- .CatchHandlerParentFrameObjIdx[const_cast<Function *>(Fn)] =
- FuncInfo->getRegSaveFrameIndex();
-
- // Store the second integer parameter (rdx) into rsp+16 relative to the
- // stack pointer at the entry of the function.
- SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
- getPointerTy(DAG.getDataLayout()));
- unsigned GPR = MF.addLiveIn(X86::RDX, &X86::GR64RegClass);
- SDValue Val = DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64);
- Chain = DAG.getStore(
- Val.getValue(1), dl, Val, RSFIN,
- MachinePointerInfo::getFixedStack(FuncInfo->getRegSaveFrameIndex()),
- /*isVolatile=*/true, /*isNonTemporal=*/false, /*Alignment=*/0);
}
if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
@@ -2723,12 +2938,15 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
MF.getTarget().Options.GuaranteedTailCallOpt)) {
FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
+ } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
+ // X86 interrupts must pop the error code if present
+ FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4);
} else {
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
- if (!Is64Bit && !IsTailCallConvention(CallConv) &&
+ if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
!Subtarget->getTargetTriple().isOSMSVCRT() &&
- argsAreStructReturn(Ins) == StackStructReturn)
+ argsAreStructReturn(Ins, Subtarget->isTargetMCU()) == StackStructReturn)
FuncInfo->setBytesToPopOnReturn(4);
}
@@ -2743,21 +2961,20 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
FuncInfo->setArgumentStackSize(StackSize);
- if (IsWinEHParent) {
- if (Is64Bit) {
- int UnwindHelpFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
- SDValue StackSlot = DAG.getFrameIndex(UnwindHelpFI, MVT::i64);
- MMI.getWinEHFuncInfo(MF.getFunction()).UnwindHelpFrameIdx = UnwindHelpFI;
- SDValue Neg2 = DAG.getConstant(-2, dl, MVT::i64);
- Chain = DAG.getStore(Chain, dl, Neg2, StackSlot,
- MachinePointerInfo::getFixedStack(UnwindHelpFI),
- /*isVolatile=*/true,
- /*isNonTemporal=*/false, /*Alignment=*/0);
- } else {
- // Functions using Win32 EH are considered to have opaque SP adjustments
- // to force local variables to be addressed from the frame or base
- // pointers.
- MFI->setHasOpaqueSPAdjustment(true);
+ if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
+ EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
+ if (Personality == EHPersonality::CoreCLR) {
+ assert(Is64Bit);
+ // TODO: Add a mechanism to frame lowering that will allow us to indicate
+ // that we'd prefer this slot be allocated towards the bottom of the frame
+ // (i.e. near the stack pointer after allocating the frame). Every
+ // funclet needs a copy of this slot in its (mostly empty) frame, and the
+ // offset from the bottom of this and each funclet's frame must be the
+ // same, so the size of funclets' (mostly empty) frames is dictated by
+ // how far this slot is from the bottom (since they allocate just enough
+      // space to accommodate holding this slot at the correct offset).
+ int PSPSymFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
+ EHInfo->PSPSymFrameIdx = PSPSymFI;
}
}
@@ -2777,9 +2994,10 @@ X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
if (Flags.isByVal())
return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
- return DAG.getStore(Chain, dl, Arg, PtrOff,
- MachinePointerInfo::getStack(LocMemOffset),
- false, false, 0);
+ return DAG.getStore(
+ Chain, dl, Arg, PtrOff,
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
+ false, false, 0);
}
/// Emit a load of return address if tail call
@@ -2813,11 +3031,24 @@ static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
false);
SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
- MachinePointerInfo::getFixedStack(NewReturnAddrFI),
+ MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(), NewReturnAddrFI),
false, false, 0);
return Chain;
}
+/// Returns a vector_shuffle mask for a movs{s|d}, movd
+/// operation of the specified width.
+static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
+ SDValue V2) {
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<int, 8> Mask;
+ Mask.push_back(NumElems);
+ for (unsigned i = 1; i != NumElems; ++i)
+ Mask.push_back(i);
+ return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
SDValue
X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
@@ -2835,11 +3066,14 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
MachineFunction &MF = DAG.getMachineFunction();
bool Is64Bit = Subtarget->is64Bit();
bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
- StructReturnType SR = callIsStructReturn(Outs);
+ StructReturnType SR = callIsStructReturn(Outs, Subtarget->isTargetMCU());
bool IsSibcall = false;
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
+ if (CallConv == CallingConv::X86_INTR)
+ report_fatal_error("X86 interrupts may not be called directly");
+
if (Attr.getValueAsString() == "true")
isTailCall = false;
@@ -2878,7 +3112,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
++NumTailCalls;
}
- assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
+ assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling convention fastcc, ghc or hipe");
// Analyze operands of the call, assigning locations to each operand.
@@ -2892,13 +3126,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
// Get a count of how many bytes are to be pushed on the stack.
- unsigned NumBytes = CCInfo.getNextStackOffset();
+ unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
if (IsSibcall)
// This is a sibcall. The memory operands are available in caller's
// own caller's stack.
NumBytes = 0;
else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
- IsTailCallConvention(CallConv))
+ canGuaranteeTCO(CallConv))
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
int FPDiff = 0;
@@ -2970,7 +3204,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
break;
case CCValAssign::AExt:
if (Arg.getValueType().isVector() &&
- Arg.getValueType().getScalarType() == MVT::i1)
+ Arg.getValueType().getVectorElementType() == MVT::i1)
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
else if (RegVT.is128BitVector()) {
// Special case: passing MMX values in XMM registers.
@@ -2987,9 +3221,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Store the argument.
SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
- Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
- MachinePointerInfo::getFixedStack(FI),
- false, false, 0);
+ Chain = DAG.getStore(
+ Chain, dl, Arg, SpillSlot,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
+ false, false, 0);
Arg = SpillSlot;
break;
}
@@ -3125,10 +3360,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Flags, DAG, dl));
} else {
// Store relative to framepointer.
- MemOpChains2.push_back(
- DAG.getStore(ArgChain, dl, Arg, FIN,
- MachinePointerInfo::getFixedStack(FI),
- false, false, 0));
+ MemOpChains2.push_back(DAG.getStore(
+ ArgChain, dl, Arg, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
+ false, false, 0));
}
}
@@ -3207,7 +3442,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (ExtraLoad)
Callee = DAG.getLoad(
getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
- MachinePointerInfo::getGOT(), false, false, false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false,
+ false, 0);
}
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
unsigned char OpFlags = 0;
@@ -3261,9 +3497,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
- // If this is an invoke in a 32-bit function using an MSVC personality, assume
- // the function clobbers all registers. If an exception is thrown, the runtime
- // will not restore CSRs.
+ // If this is an invoke in a 32-bit function using a funclet-based
+ // personality, assume the function clobbers all registers. If an exception
+ // is thrown, the runtime will not restore CSRs.
// FIXME: Model this more precisely so that we can register allocate across
// the normal edge and spill and fill across the exceptional edge.
if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
@@ -3272,7 +3508,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
CallerFn->hasPersonalityFn()
? classifyEHPersonality(CallerFn->getPersonalityFn())
: EHPersonality::Unknown;
- if (isMSVCEHPersonality(Pers))
+ if (isFuncletEHPersonality(Pers))
Mask = RegInfo->getNoPreservedMask();
}
@@ -3300,7 +3536,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
DAG.getTarget().Options.GuaranteedTailCallOpt))
NumBytesForCalleeToPop = NumBytes; // Callee pops everything
- else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
+ else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
!Subtarget->getTargetTriple().isOSMSVCRT() &&
SR == StackStructReturn)
// If this is a call to a struct-return function, the callee
@@ -3358,8 +3594,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// EDI
// local1 ..
-/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
-/// for a 16 byte align requirement.
+/// Align the stack size, e.g. to 16n + 12, to satisfy a 16-byte alignment
+/// requirement.
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
SelectionDAG& DAG) const {
@@ -3380,9 +3616,8 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
return Offset;
}
-/// MatchingStackOffset - Return true if the given stack call argument is
-/// already available in the same position (relatively) of the caller's
-/// incoming argument stack.
+/// Return true if the given stack call argument is already available in the
+/// same position (relatively) of the caller's incoming argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
@@ -3435,25 +3670,19 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
}
-/// IsEligibleForTailCallOptimization - Check whether the call is eligible
-/// for tail call optimization. Targets which want to do tail call
-/// optimization should implement this function.
-bool
-X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
- CallingConv::ID CalleeCC,
- bool isVarArg,
- bool isCalleeStructRet,
- bool isCallerStructRet,
- Type *RetTy,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SelectionDAG &DAG) const {
- if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
+/// Check whether the call is eligible for tail call optimization. Targets
+/// that want to do tail call optimization should implement this function.
+bool X86TargetLowering::IsEligibleForTailCallOptimization(
+ SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+ bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+ if (!mayTailCallThisCC(CalleeCC))
return false;
// If -tailcallopt is specified, make fastcc functions tail-callable.
- const MachineFunction &MF = DAG.getMachineFunction();
+ MachineFunction &MF = DAG.getMachineFunction();
const Function *CallerF = MF.getFunction();
// If the function return type is x86_fp80 and the callee return type is not,
@@ -3474,7 +3703,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
return false;
if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
- if (IsTailCallConvention(CalleeCC) && CCMatch)
+ if (canGuaranteeTCO(CalleeCC) && CCMatch)
return true;
return false;
}
@@ -3493,19 +3722,9 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
if (isCalleeStructRet || isCallerStructRet)
return false;
- // An stdcall/thiscall caller is expected to clean up its arguments; the
- // callee isn't going to do that.
- // FIXME: this is more restrictive than needed. We could produce a tailcall
- // when the stack adjustment matches. For example, with a thiscall that takes
- // only one argument.
- if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
- CallerCC == CallingConv::X86_ThisCall))
- return false;
-
// Do not sibcall optimize vararg calls unless all arguments are passed via
// registers.
if (isVarArg && !Outs.empty()) {
-
// Optimizing for varargs on Win64 is unlikely to be safe without
// additional testing.
if (IsCalleeWin64 || IsCallerWin64)
@@ -3573,6 +3792,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
}
}
+ unsigned StackArgsSize = 0;
+
// If the callee takes no arguments then go on to check the results of the
// call.
if (!Outs.empty()) {
@@ -3587,11 +3808,9 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
CCInfo.AllocateStack(32, 8);
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
- if (CCInfo.getNextStackOffset()) {
- MachineFunction &MF = DAG.getMachineFunction();
- if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
- return false;
+ StackArgsSize = CCInfo.getNextStackOffset();
+ if (CCInfo.getNextStackOffset()) {
// Check if the arguments are already laid out in the right way as
// the caller's fixed stack objects.
MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -3642,6 +3861,21 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
}
}
+ bool CalleeWillPop =
+ X86::isCalleePop(CalleeCC, Subtarget->is64Bit(), isVarArg,
+ MF.getTarget().Options.GuaranteedTailCallOpt);
+
+ if (unsigned BytesToPop =
+ MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
+ // If we have bytes to pop, the callee must pop them.
+ bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
+ if (!CalleePopMatches)
+ return false;
+ } else if (CalleeWillPop && StackArgsSize > 0) {
+ // If we don't have bytes to pop, make sure the callee doesn't pop any.
+ return false;
+ }
+
return true;
}
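
The new callee-pop matching rule added above can be summarised on plain integers. A hedged sketch, assuming the same three inputs the hunk derives (whether the callee pops, the caller's bytes-to-pop, and the callee's stack argument size); calleePopCompatible is a hypothetical helper, not LLVM code.

#include <cassert>

static bool calleePopCompatible(bool CalleeWillPop, unsigned CallerBytesToPop,
                                unsigned StackArgsSize) {
  if (CallerBytesToPop != 0)
    // The caller's ret pops bytes, so the tail-called function must pop
    // exactly the same amount or the stack is unbalanced on return.
    return CalleeWillPop && CallerBytesToPop == StackArgsSize;
  // The caller pops nothing, so the callee must not pop anything either.
  return !(CalleeWillPop && StackArgsSize > 0);
}

int main() {
  assert(calleePopCompatible(true, 12, 12));   // matching pop amounts
  assert(!calleePopCompatible(true, 12, 8));   // mismatched pop amount
  assert(!calleePopCompatible(true, 0, 16));   // callee would pop, caller not
  assert(calleePopCompatible(false, 0, 16));   // nobody pops
}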
@@ -3688,11 +3922,13 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::VPERMILPI:
case X86ISD::VPERM2X128:
case X86ISD::VPERMI:
+ case X86ISD::VPERMV:
+ case X86ISD::VPERMV3:
return true;
}
}
-static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
+static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT,
SDValue V1, unsigned TargetMask,
SelectionDAG &DAG) {
switch(Opc) {
@@ -3707,7 +3943,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
}
}
-static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
+static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT,
SDValue V1, SDValue V2, SelectionDAG &DAG) {
switch(Opc) {
default: llvm_unreachable("Unknown x86 shuffle node");
@@ -3772,23 +4008,23 @@ bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
return false;
}
-/// isCalleePop - Determines whether the callee is required to pop its
-/// own arguments. Callee pop is necessary to support tail calls.
+/// Determines whether the callee is required to pop its own arguments.
+/// Callee pop is necessary to support tail calls.
bool X86::isCalleePop(CallingConv::ID CallingConv,
- bool is64Bit, bool IsVarArg, bool TailCallOpt) {
+ bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
+ // If GuaranteeTCO is true, we force some calls to be callee pop so that we
+ // can guarantee TCO.
+ if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
+ return true;
+
switch (CallingConv) {
default:
return false;
case CallingConv::X86_StdCall:
case CallingConv::X86_FastCall:
case CallingConv::X86_ThisCall:
+ case CallingConv::X86_VectorCall:
return !is64Bit;
- case CallingConv::Fast:
- case CallingConv::GHC:
- case CallingConv::HiPE:
- if (IsVarArg)
- return false;
- return TailCallOpt;
}
}
@@ -3807,11 +4043,26 @@ static bool isX86CCUnsigned(unsigned X86CC) {
case X86::COND_BE: return true;
case X86::COND_AE: return true;
}
- llvm_unreachable("covered switch fell through?!");
}
-/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
-/// specific condition code, returning the condition code and the LHS/RHS of the
+static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Invalid integer condition!");
+ case ISD::SETEQ: return X86::COND_E;
+ case ISD::SETGT: return X86::COND_G;
+ case ISD::SETGE: return X86::COND_GE;
+ case ISD::SETLT: return X86::COND_L;
+ case ISD::SETLE: return X86::COND_LE;
+ case ISD::SETNE: return X86::COND_NE;
+ case ISD::SETULT: return X86::COND_B;
+ case ISD::SETUGT: return X86::COND_A;
+ case ISD::SETULE: return X86::COND_BE;
+ case ISD::SETUGE: return X86::COND_AE;
+ }
+}
+
+/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
+/// condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.
static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP,
SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
@@ -3833,19 +4084,7 @@ static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP,
}
}
- switch (SetCCOpcode) {
- default: llvm_unreachable("Invalid integer condition!");
- case ISD::SETEQ: return X86::COND_E;
- case ISD::SETGT: return X86::COND_G;
- case ISD::SETGE: return X86::COND_GE;
- case ISD::SETLT: return X86::COND_L;
- case ISD::SETLE: return X86::COND_LE;
- case ISD::SETNE: return X86::COND_NE;
- case ISD::SETULT: return X86::COND_B;
- case ISD::SETUGT: return X86::COND_A;
- case ISD::SETULE: return X86::COND_BE;
- case ISD::SETUGE: return X86::COND_AE;
- }
+ return TranslateIntegerX86CC(SetCCOpcode);
}
// First determine if it is required or is profitable to flip the operands.
@@ -3898,8 +4137,8 @@ static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP,
}
}
-/// hasFPCMov - is there a floating point cmov for the specific X86 condition
-/// code. Current x86 isa includes the following FP cmov instructions:
+/// Is there a floating point cmov for the specific X86 condition code?
+/// Current x86 ISA includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
switch (X86CC) {
@@ -3917,7 +4156,7 @@ static bool hasFPCMov(unsigned X86CC) {
}
}
-/// isFPImmLegal - Returns true if the target can instruction select the
+/// Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
@@ -3970,7 +4209,7 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const {
return Subtarget->hasLZCNT();
}
-/// isUndefInRange - Return true if every element in Mask, beginning
+/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size is undef.
static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
@@ -3979,19 +4218,18 @@ static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
return true;
}
-/// isUndefOrInRange - Return true if Val is undef or if its value falls within
-/// the specified range (L, H].
+/// Return true if Val is undef or if its value falls within the
+/// specified range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
return (Val < 0) || (Val >= Low && Val < Hi);
}
-/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
-/// specified value.
+/// Val is either less than zero (undef) or equal to the specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
return (Val < 0 || Val == CmpVal);
}
-/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
+/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range [Low, Low+Size), or is undef.
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
@@ -4002,9 +4240,8 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
return true;
}
-/// isVEXTRACTIndex - Return true if the specified
-/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
-/// suitable for instruction that extract 128 or 256 bit vectors
+/// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
+/// extract that is suitable for instructions that extract 128 or 256 bit vectors.
static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
@@ -4021,7 +4258,7 @@ static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
return Result;
}
-/// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
+/// Return true if the specified INSERT_SUBVECTOR
/// operand specifies a subvector insert that is suitable for input to
/// insertion of 128 or 256-bit subvectors
static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
@@ -4057,8 +4294,8 @@ bool X86::isVEXTRACT256Index(SDNode *N) {
static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
- if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
- llvm_unreachable("Illegal extract subvector for VEXTRACT");
+ assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
+ "Illegal extract subvector for VEXTRACT");
uint64_t Index =
cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
@@ -4072,8 +4309,8 @@ static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
- if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
- llvm_unreachable("Illegal insert subvector for VINSERT");
+ assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
+ "Illegal insert subvector for VINSERT");
uint64_t Index =
cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
@@ -4085,53 +4322,71 @@ static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
return Index / NumElemsPerChunk;
}
-/// getExtractVEXTRACT128Immediate - Return the appropriate immediate
-/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
-/// and VINSERTI128 instructions.
+/// Return the appropriate immediate to extract the specified
+/// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VINSERTI128 instructions.
unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
return getExtractVEXTRACTImmediate(N, 128);
}
-/// getExtractVEXTRACT256Immediate - Return the appropriate immediate
-/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
-/// and VINSERTI64x4 instructions.
+/// Return the appropriate immediate to extract the specified
+/// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VINSERTI64x4 instructions.
unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
return getExtractVEXTRACTImmediate(N, 256);
}
-/// getInsertVINSERT128Immediate - Return the appropriate immediate
-/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
-/// and VINSERTI128 instructions.
+/// Return the appropriate immediate to insert at the specified
+/// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
return getInsertVINSERTImmediate(N, 128);
}
-/// getInsertVINSERT256Immediate - Return the appropriate immediate
-/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4
-/// and VINSERTI64x4 instructions.
+/// Return the appropriate immediate to insert at the specified
+/// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
return getInsertVINSERTImmediate(N, 256);
}
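
Both immediates above are simply the element index divided by the number of elements per 128/256-bit chunk. A small standalone illustration of that arithmetic; subvectorImmediate is a hypothetical name.

#include <cassert>

static unsigned subvectorImmediate(unsigned EltIndex, unsigned EltBits,
                                   unsigned ChunkBits) {
  unsigned ElemsPerChunk = ChunkBits / EltBits;  // elements per sub-register
  return EltIndex / ElemsPerChunk;               // which chunk holds EltIndex
}

int main() {
  assert(subvectorImmediate(4, 32, 128) == 1);  // upper half of a v8i32
  assert(subvectorImmediate(0, 64, 256) == 0);  // lower half of a v8i64
}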
-/// isZero - Returns true if Elt is a constant integer zero
-static bool isZero(SDValue V) {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
- return C && C->isNullValue();
-}
-
-/// isZeroNode - Returns true if Elt is a constant zero or a floating point
-/// constant +0.0.
+/// Returns true if Elt is a constant zero or a floating point constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
- if (isZero(Elt))
- return true;
- if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
- return CFP->getValueAPF().isPosZero();
- return false;
+ return isNullConstant(Elt) || isNullFPConstant(Elt);
}
-/// getZeroVector - Returns a vector of specified type with all zero elements.
-///
-static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
+// Build a vector of constants.
+// Use an UNDEF node if MaskElt == -1.
+// Split 64-bit constants in 32-bit mode.
+static SDValue getConstVector(ArrayRef<int> Values, MVT VT,
+ SelectionDAG &DAG,
+ SDLoc dl, bool IsMask = false) {
+
+ SmallVector<SDValue, 32> Ops;
+ bool Split = false;
+
+ MVT ConstVecVT = VT;
+ unsigned NumElts = VT.getVectorNumElements();
+ bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
+ if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
+ ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
+ Split = true;
+ }
+
+ MVT EltVT = ConstVecVT.getVectorElementType();
+ for (unsigned i = 0; i < NumElts; ++i) {
+ bool IsUndef = Values[i] < 0 && IsMask;
+ SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
+ DAG.getConstant(Values[i], dl, EltVT);
+ Ops.push_back(OpNode);
+ if (Split)
+ Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
+ DAG.getConstant(0, dl, EltVT));
+ }
+ SDValue ConstsNode = DAG.getNode(ISD::BUILD_VECTOR, dl, ConstVecVT, Ops);
+ if (Split)
+ ConstsNode = DAG.getBitcast(VT, ConstsNode);
+ return ConstsNode;
+}
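
A rough standalone picture of the "split 64-bit constants in 32-bit mode" idea: each 64-bit lane becomes two 32-bit lanes. The hunk above emits a zero/undef high word because mask values fit in 32 bits; this sketch shows the general low/high split, and splitTo32 is a hypothetical name.

#include <cstdint>
#include <vector>

static std::vector<uint32_t> splitTo32(const std::vector<uint64_t> &Vals) {
  std::vector<uint32_t> Out;
  for (uint64_t V : Vals) {
    Out.push_back(static_cast<uint32_t>(V));        // low 32 bits first
    Out.push_back(static_cast<uint32_t>(V >> 32));  // then the high 32 bits
  }
  return Out;  // caller bitcasts the wider vector back to the original type
}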
+
+/// Returns a vector of specified type with all zero elements.
+static SDValue getZeroVector(MVT VT, const X86Subtarget *Subtarget,
SelectionDAG &DAG, SDLoc dl) {
assert(VT.isVector() && "Expected a vector type");
@@ -4163,7 +4418,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
- } else if (VT.getScalarType() == MVT::i1) {
+ } else if (VT.getVectorElementType() == MVT::i1) {
assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16)
&& "Unexpected vector type");
@@ -4195,19 +4450,18 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
// Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
+ assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
// This is the index of the first element of the vectorWidth-bit chunk
- // we want.
- unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
- * ElemsPerChunk);
+ // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
+ IdxVal &= ~(ElemsPerChunk - 1);
// If the input is a buildvector just emit a smaller one.
if (Vec.getOpcode() == ISD::BUILD_VECTOR)
return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
- makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
- ElemsPerChunk));
+ makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
- SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal, dl);
+ SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
}
@@ -4245,13 +4499,13 @@ static SDValue InsertSubVector(SDValue Result, SDValue Vec,
// Insert the relevant vectorWidth bits.
unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
+ assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
// This is the index of the first element of the vectorWidth-bit chunk
- // we want.
- unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
- * ElemsPerChunk);
+ // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
+ IdxVal &= ~(ElemsPerChunk - 1);
- SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal, dl);
+ SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
}
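
The hunks above replace a divide-and-multiply with a mask because ElemsPerChunk is a power of two. The equivalence, standalone; roundDownToChunk is a hypothetical name.

#include <cassert>

static unsigned roundDownToChunk(unsigned IdxVal, unsigned ElemsPerChunk) {
  // Equivalent to (IdxVal / ElemsPerChunk) * ElemsPerChunk when
  // ElemsPerChunk is a power of two.
  return IdxVal & ~(ElemsPerChunk - 1);
}

int main() {
  assert(roundDownToChunk(5, 4) == 4);
  assert(roundDownToChunk(11, 8) == 8);
  assert(roundDownToChunk(8, 8) == 8);
}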
@@ -4279,7 +4533,7 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
Vec, ZeroIndex);
// The blend instruction, and therefore its mask, depend on the data type.
- MVT ScalarType = ResultVT.getScalarType().getSimpleVT();
+ MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT();
if (ScalarType.isFloatingPoint()) {
// Choose either vblendps (float) or vblendpd (double).
unsigned ScalarSize = ScalarType.getSizeInBits();
@@ -4316,6 +4570,81 @@ static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
}
+/// Insert i1-subvector to i1-vector.
+static SDValue Insert1BitVector(SDValue Op, SelectionDAG &DAG) {
+
+ SDLoc dl(Op);
+ SDValue Vec = Op.getOperand(0);
+ SDValue SubVec = Op.getOperand(1);
+ SDValue Idx = Op.getOperand(2);
+
+ if (!isa<ConstantSDNode>(Idx))
+ return SDValue();
+
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
+ return Op;
+
+ MVT OpVT = Op.getSimpleValueType();
+ MVT SubVecVT = SubVec.getSimpleValueType();
+ unsigned NumElems = OpVT.getVectorNumElements();
+ unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
+
+ assert(IdxVal + SubVecNumElems <= NumElems &&
+ IdxVal % SubVecVT.getSizeInBits() == 0 &&
+ "Unexpected index value in INSERT_SUBVECTOR");
+
+ // There are 3 possible cases:
+ // 1. Subvector should be inserted in the lower part (IdxVal == 0)
+ // 2. Subvector should be inserted in the upper part
+ // (IdxVal + SubVecNumElems == NumElems)
+ // 3. Subvector should be inserted in the middle (for example v2i1
+ // to v16i1, index 2)
+
+ SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
+ SDValue Undef = DAG.getUNDEF(OpVT);
+ SDValue WideSubVec =
+ DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, SubVec, ZeroIdx);
+ if (Vec.isUndef())
+ return DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec,
+ DAG.getConstant(IdxVal, dl, MVT::i8));
+
+ if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
+ unsigned ShiftLeft = NumElems - SubVecNumElems;
+ unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
+ WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec,
+ DAG.getConstant(ShiftLeft, dl, MVT::i8));
+ return ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, OpVT, WideSubVec,
+ DAG.getConstant(ShiftRight, dl, MVT::i8)) : WideSubVec;
+ }
+
+ if (IdxVal == 0) {
+ // Zero lower bits of the Vec
+ SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
+ Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
+ // Merge them together
+ return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec);
+ }
+
+ // Simple case when we put subvector in the upper part
+ if (IdxVal + SubVecNumElems == NumElems) {
+ // Zero upper bits of the Vec
+ WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec,
+ DAG.getConstant(IdxVal, dl, MVT::i8));
+ SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
+ return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec);
+ }
+ // Subvector should be inserted in the middle - use shuffle
+ SmallVector<int, 64> Mask;
+ for (unsigned i = 0; i < NumElems; ++i)
+ Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
+ i : i + NumElems);
+ return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
+}
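
Viewed as a bitmask, the i1-subvector insertion above is shift-and-OR arithmetic. A scalar model of the intended result, not of the DAG node sequence; insertBits is hypothetical.

#include <cassert>
#include <cstdint>

static uint64_t insertBits(uint64_t Vec, uint64_t Sub, unsigned SubBits,
                           unsigned Idx) {
  // Clear the destination field, then OR in the shifted subvector bits.
  uint64_t FieldMask = ((SubBits == 64 ? 0 : (1ull << SubBits)) - 1) << Idx;
  return (Vec & ~FieldMask) | ((Sub << Idx) & FieldMask);
}

int main() {
  // Insert a 2-bit value 0b11 into a 16-bit mask at position 2.
  assert(insertBits(0b1000000000000001, 0b11, 2, 2) == 0b1000000000001101);
}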
+
/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
/// instructions. This is used because creating CONCAT_VECTOR nodes of
/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
@@ -4334,18 +4663,22 @@ static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
}
-/// getOnesVector - Returns a vector of specified type with all bits set.
+/// Returns a vector of specified type with all bits set.
/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
/// Then bitcast to their original type, ensuring they get CSE'd.
-static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
- SDLoc dl) {
+static SDValue getOnesVector(EVT VT, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG, SDLoc dl) {
assert(VT.isVector() && "Expected a vector type");
SDValue Cst = DAG.getConstant(~0U, dl, MVT::i32);
SDValue Vec;
- if (VT.is256BitVector()) {
- if (HasInt256) { // AVX2
+ if (VT.is512BitVector()) {
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
+ Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
+ } else if (VT.is256BitVector()) {
+ if (Subtarget->hasInt256()) { // AVX2
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
} else { // AVX
@@ -4360,19 +4693,7 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
return DAG.getBitcast(VT, Vec);
}
-/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
-/// operation of specified width.
-static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
- SDValue V2) {
- unsigned NumElems = VT.getVectorNumElements();
- SmallVector<int, 8> Mask;
- Mask.push_back(NumElems);
- for (unsigned i = 1; i != NumElems; ++i)
- Mask.push_back(i);
- return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
-}
-
-/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
+/// Returns a vector_shuffle node for an unpackl operation.
static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
@@ -4384,7 +4705,7 @@ static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}
-/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
+/// Returns a vector_shuffle node for an unpackh operation.
static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
@@ -4396,10 +4717,10 @@ static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}
-/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
-/// vector of zero or undef vector. This produces a shuffle where the low
-/// element of V2 is swizzled into the zero/undef vector, landing at element
-/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
+/// Return a vector_shuffle of the specified vector of zero or undef vector.
+/// This produces a shuffle where the low element of V2 is swizzled into the
+/// zero/undef vector, landing at element Idx.
+/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
bool IsZero,
const X86Subtarget *Subtarget,
@@ -4415,10 +4736,10 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
}
-/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
-/// target specific opcode. Returns true if the Mask could be calculated. Sets
-/// IsUnary to true if only uses one source. Note that this will set IsUnary for
-/// shuffles which use a single input multiple times, and in those cases it will
+/// Calculates the shuffle mask corresponding to the target-specific opcode.
+/// Returns true if the Mask could be calculated. Sets IsUnary to true if only
+/// uses one source. Note that this will set IsUnary for shuffles which use a
+/// single input multiple times, and in those cases it will
/// adjust the mask to only have indices within that single input.
/// FIXME: Add support for Decode*Mask functions that return SM_SentinelZero.
static bool getTargetShuffleMask(SDNode *N, MVT VT,
@@ -4482,7 +4803,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
// If we have a build-vector, then things are easy.
- EVT VT = MaskNode.getValueType();
+ MVT VT = MaskNode.getSimpleValueType();
assert(VT.isVector() &&
"Can't produce a non-vector with a build_vector!");
if (!VT.isInteger())
@@ -4572,6 +4893,119 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
case X86ISD::MOVLPS:
// Not yet implemented
return false;
+ case X86ISD::VPERMV: {
+ IsUnary = true;
+ SDValue MaskNode = N->getOperand(0);
+ while (MaskNode->getOpcode() == ISD::BITCAST)
+ MaskNode = MaskNode->getOperand(0);
+
+ unsigned MaskLoBits = Log2_64(VT.getVectorNumElements());
+ SmallVector<uint64_t, 32> RawMask;
+ if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
+ // If we have a build-vector, then things are easy.
+ assert(MaskNode.getSimpleValueType().isInteger() &&
+ MaskNode.getSimpleValueType().getVectorNumElements() ==
+ VT.getVectorNumElements());
+
+ for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) {
+ SDValue Op = MaskNode->getOperand(i);
+ if (Op->getOpcode() == ISD::UNDEF)
+ RawMask.push_back((uint64_t)SM_SentinelUndef);
+ else if (isa<ConstantSDNode>(Op)) {
+ APInt MaskElement = cast<ConstantSDNode>(Op)->getAPIntValue();
+ RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue());
+ } else
+ return false;
+ }
+ DecodeVPERMVMask(RawMask, Mask);
+ break;
+ }
+ if (MaskNode->getOpcode() == X86ISD::VBROADCAST) {
+ unsigned NumEltsInMask = MaskNode->getNumOperands();
+ MaskNode = MaskNode->getOperand(0);
+ if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode)) {
+ APInt MaskEltValue = CN->getAPIntValue();
+ for (unsigned i = 0; i < NumEltsInMask; ++i)
+ RawMask.push_back(MaskEltValue.getLoBits(MaskLoBits).getZExtValue());
+ DecodeVPERMVMask(RawMask, Mask);
+ break;
+ }
+ // It may be a scalar load
+ }
+
+ auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
+ if (!MaskLoad)
+ return false;
+
+ SDValue Ptr = MaskLoad->getBasePtr();
+ if (Ptr->getOpcode() == X86ISD::Wrapper ||
+ Ptr->getOpcode() == X86ISD::WrapperRIP)
+ Ptr = Ptr->getOperand(0);
+
+ auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
+ if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
+ return false;
+
+ if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
+ DecodeVPERMVMask(C, VT, Mask);
+ if (Mask.empty())
+ return false;
+ break;
+ }
+ return false;
+ }
+ case X86ISD::VPERMV3: {
+ IsUnary = false;
+ SDValue MaskNode = N->getOperand(1);
+ while (MaskNode->getOpcode() == ISD::BITCAST)
+ MaskNode = MaskNode->getOperand(1);
+
+ if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
+ // If we have a build-vector, then things are easy.
+ assert(MaskNode.getSimpleValueType().isInteger() &&
+ MaskNode.getSimpleValueType().getVectorNumElements() ==
+ VT.getVectorNumElements());
+
+ SmallVector<uint64_t, 32> RawMask;
+ unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()*2);
+
+ for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) {
+ SDValue Op = MaskNode->getOperand(i);
+ if (Op->getOpcode() == ISD::UNDEF)
+ RawMask.push_back((uint64_t)SM_SentinelUndef);
+ else {
+ auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
+ if (!CN)
+ return false;
+ APInt MaskElement = CN->getAPIntValue();
+ RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue());
+ }
+ }
+ DecodeVPERMV3Mask(RawMask, Mask);
+ break;
+ }
+
+ auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
+ if (!MaskLoad)
+ return false;
+
+ SDValue Ptr = MaskLoad->getBasePtr();
+ if (Ptr->getOpcode() == X86ISD::Wrapper ||
+ Ptr->getOpcode() == X86ISD::WrapperRIP)
+ Ptr = Ptr->getOperand(0);
+
+ auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
+ if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
+ return false;
+
+ if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
+ DecodeVPERMV3Mask(C, VT, Mask);
+ if (Mask.empty())
+ return false;
+ break;
+ }
+ return false;
+ }
default: llvm_unreachable("unknown target shuffle node");
}
@@ -4586,7 +5020,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
return true;
}
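
A scalar model of what the VPERMV decoding above recovers: each mask element selects a source lane using only its low log2(NumElts) bits. Illustrative only; applyVPermV is hypothetical, and DecodeVPERMVMask in LLVM works on the node's constants rather than runtime values.

#include <cassert>
#include <vector>

static std::vector<int> applyVPermV(const std::vector<int> &Src,
                                    const std::vector<unsigned> &Mask) {
  unsigned NumElts = Src.size();            // assumed to be a power of two
  std::vector<int> Result(NumElts);
  for (unsigned i = 0; i < NumElts; ++i)
    Result[i] = Src[Mask[i] & (NumElts - 1)];  // low bits index the source
  return Result;
}

int main() {
  std::vector<int> Src = {10, 20, 30, 40};
  std::vector<unsigned> Mask = {3, 3, 0, 5};   // 5 & 3 == 1
  std::vector<int> R = applyVPermV(Src, Mask);
  assert(R[0] == 40 && R[3] == 20);
}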
-/// getShuffleScalarElt - Returns the scalar element that will make up the ith
+/// Returns the scalar element that will make up the ith
/// element of the result of the vector shuffle.
static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
unsigned Depth) {
@@ -4650,8 +5084,7 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
return SDValue();
}
-/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
-///
+/// Custom lower build_vector of v16i8.
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
@@ -4721,8 +5154,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
return DAG.getBitcast(MVT::v16i8, V);
}
-/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
-///
+/// Custom lower build_vector of v8i16.
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
@@ -4753,7 +5185,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
return V;
}
-/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
+/// Custom lower build_vector of v4i32 or v4f32.
static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget *Subtarget,
const TargetLowering &TLI) {
@@ -4924,7 +5356,7 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
return SDValue();
if ((Offset % RequiredAlign) & 3)
return SDValue();
- int64_t StartOffset = Offset & ~(RequiredAlign-1);
+ int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
if (StartOffset) {
SDLoc DL(Ptr);
Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
@@ -5157,8 +5589,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
// TODO: If multiple splats are generated to load the same constant,
// it may be detrimental to overall size. There needs to be a way to detect
// that condition to know if this is truly a size win.
- const Function *F = DAG.getMachineFunction().getFunction();
- bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize);
+ bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
// Handle broadcasting a single constant scalar from the constant pool
// into a vector.
@@ -5188,9 +5619,10 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
SDValue CP =
DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
- Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
- MachinePointerInfo::getConstantPool(),
- false, false, false, Alignment);
+ Ld = DAG.getLoad(
+ CVT, dl, DAG.getEntryNode(), CP,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
+ false, false, Alignment);
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
}
@@ -5329,7 +5761,7 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
return NV;
}
-static SDValue ConvertI1VectorToInterger(SDValue Op, SelectionDAG &DAG) {
+static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
Op.getScalarValueSizeInBits() == 1 &&
"Can not convert non-constant vector");
@@ -5366,7 +5798,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
}
if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
- SDValue Imm = ConvertI1VectorToInterger(Op, DAG);
+ SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
if (Imm.getValueSizeInBits() == VT.getSizeInBits())
return DAG.getBitcast(VT, Imm);
SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
@@ -5600,7 +6032,7 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
/// node.
static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
const X86Subtarget *Subtarget, SelectionDAG &DAG) {
- EVT VT = BV->getValueType(0);
+ MVT VT = BV->getSimpleValueType(0);
if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
(!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
return SDValue();
@@ -5662,12 +6094,12 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
// Update InVec0 and InVec1.
if (InVec0.getOpcode() == ISD::UNDEF) {
InVec0 = Op0.getOperand(0);
- if (InVec0.getValueType() != VT)
+ if (InVec0.getSimpleValueType() != VT)
return SDValue();
}
if (InVec1.getOpcode() == ISD::UNDEF) {
InVec1 = Op1.getOperand(0);
- if (InVec1.getValueType() != VT)
+ if (InVec1.getSimpleValueType() != VT)
return SDValue();
}
@@ -5703,7 +6135,7 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- EVT VT = BV->getValueType(0);
+ MVT VT = BV->getSimpleValueType(0);
unsigned NumElts = VT.getVectorNumElements();
unsigned NumUndefsLO = 0;
unsigned NumUndefsHI = 0;
@@ -5845,7 +6277,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
unsigned NumElems = Op.getNumOperands();
// Generate vectors for predicate vectors.
- if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
+ if (VT.getVectorElementType() == MVT::i1 && Subtarget->hasAVX512())
return LowerBUILD_VECTORvXi1(Op, DAG);
// Vectors containing all zeros can be matched by pxor and xorps later
@@ -5866,7 +6298,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return Op;
if (!VT.is512BitVector())
- return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
+ return getOnesVector(VT, Subtarget, DAG, dl);
}
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
@@ -5881,7 +6313,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
unsigned NumZero = 0;
unsigned NumNonZero = 0;
- unsigned NonZeros = 0;
+ uint64_t NonZeros = 0;
bool IsAllConstants = true;
SmallSet<SDValue, 8> Values;
for (unsigned i = 0; i < NumElems; ++i) {
@@ -5895,7 +6327,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (X86::isZeroNode(Elt))
NumZero++;
else {
- NonZeros |= (1 << i);
+ assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
+ NonZeros |= ((uint64_t)1 << i);
NumNonZero++;
}
}
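
The widening of NonZeros to uint64_t above matters once a build_vector can have more than 32 elements: a 32-bit `1 << i` is undefined for i >= 32, while a 64-bit shift covers all current element counts. A small sketch; markNonZero is a hypothetical name.

#include <cassert>
#include <cstdint>

static uint64_t markNonZero(uint64_t NonZeros, unsigned i) {
  assert(i < 64 && "bitmap only tracks up to 64 elements");
  return NonZeros | (uint64_t(1) << i);  // 64-bit shift stays defined
}

int main() {
  uint64_t NZ = 0;
  NZ = markNonZero(NZ, 0);
  NZ = markNonZero(NZ, 40);              // would overflow a 32-bit mask
  assert(NZ == ((uint64_t(1) << 40) | 1));
}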
@@ -5919,7 +6352,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
// Handle SSE only.
assert(VT == MVT::v2i64 && "Expected an SSE value type!");
- EVT VecVT = MVT::v4i32;
+ MVT VecVT = MVT::v4i32;
// Truncate the value (which may itself be a constant) to i32, and
// convert it to a vector with movd (S2V+shuffle to zero extend).
@@ -6051,7 +6484,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// One half is zero or undef.
unsigned Idx = countTrailingZeros(NonZeros);
SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
- Op.getOperand(Idx));
+ Op.getOperand(Idx));
return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
}
return SDValue();
@@ -6059,13 +6492,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// If element VT is < 32 bits, convert it to inserts into a zero vector.
if (EVTBits == 8 && NumElems == 16)
- if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
- Subtarget, *this))
+ if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
+ DAG, Subtarget, *this))
return V;
if (EVTBits == 16 && NumElems == 8)
- if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
- Subtarget, *this))
+ if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
+ DAG, Subtarget, *this))
return V;
// If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
@@ -6077,7 +6510,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
SmallVector<SDValue, 8> V(NumElems);
if (NumElems == 4 && NumZero > 0) {
for (unsigned i = 0; i < 4; ++i) {
- bool isZero = !(NonZeros & (1 << i));
+ bool isZero = !(NonZeros & (1ULL << i));
if (isZero)
V[i] = getZeroVector(VT, Subtarget, DAG, dl);
else
@@ -6177,7 +6610,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
-// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
+// 256-bit AVX can use the vinsertf128 instruction
// to create 256-bit vectors from two other 128-bit ones.
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
@@ -6193,8 +6626,8 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
if (Op.getNumOperands() == 4) {
- MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
- ResVT.getVectorNumElements()/2);
+ MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
+ ResVT.getVectorNumElements()/2);
SDValue V3 = Op.getOperand(2);
SDValue V4 = Op.getOperand(3);
return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
@@ -6213,8 +6646,27 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
assert(isPowerOf2_32(NumOfOperands) &&
"Unexpected number of operands in CONCAT_VECTORS");
+ SDValue Undef = DAG.getUNDEF(ResVT);
if (NumOfOperands > 2) {
- MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
+ // Specialize the cases when all, or all but one, of the operands are undef.
+ unsigned NumOfDefinedOps = 0;
+ unsigned OpIdx = 0;
+ for (unsigned i = 0; i < NumOfOperands; i++)
+ if (!Op.getOperand(i).isUndef()) {
+ NumOfDefinedOps++;
+ OpIdx = i;
+ }
+ if (NumOfDefinedOps == 0)
+ return Undef;
+ if (NumOfDefinedOps == 1) {
+ unsigned SubVecNumElts =
+ Op.getOperand(OpIdx).getValueType().getVectorNumElements();
+ SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
+ Op.getOperand(OpIdx), IdxVal);
+ }
+
+ MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
ResVT.getVectorNumElements()/2);
SmallVector<SDValue, 2> Ops;
for (unsigned i = 0; i < NumOfOperands/2; i++)
@@ -6227,31 +6679,38 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
}
+ // 2 operands
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
+ unsigned NumElems = ResVT.getVectorNumElements();
+ assert(V1.getValueType() == V2.getValueType() &&
+ V1.getValueType().getVectorNumElements() == NumElems/2 &&
+ "Unexpected operands in CONCAT_VECTORS");
+
+ if (ResVT.getSizeInBits() >= 16)
+ return Op; // The operation is legal with KUNPCK
+
bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
-
+ SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
if (IsZeroV1 && IsZeroV2)
- return getZeroVector(ResVT, Subtarget, DAG, dl);
+ return ZeroVec;
SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
- SDValue Undef = DAG.getUNDEF(ResVT);
- unsigned NumElems = ResVT.getVectorNumElements();
- SDValue ShiftBits = DAG.getConstant(NumElems/2, dl, MVT::i8);
+ if (V2.isUndef())
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
+ if (IsZeroV2)
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
+
+ SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
+ if (V1.isUndef())
+ V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
- V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, ZeroIdx);
- V2 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V2, ShiftBits);
if (IsZeroV1)
- return V2;
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
- // Zero the upper bits of V1
- V1 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V1, ShiftBits);
- V1 = DAG.getNode(X86ISD::VSRLI, dl, ResVT, V1, ShiftBits);
- if (IsZeroV2)
- return V1;
- return DAG.getNode(ISD::OR, dl, ResVT, V1, V2);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
}
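
With i1 vectors viewed as k-register bitmasks, the two-operand concatenation above places V1 in the low half and V2 in the high half. A scalar sketch of that end result; concatMasks is hypothetical, and the DAG code reaches it through INSERT_SUBVECTOR/KUNPCK rather than integer shifts.

#include <cassert>
#include <cstdint>

static uint16_t concatMasks(uint8_t Lo, uint8_t Hi) {
  // Low operand occupies bits [0,8), high operand occupies bits [8,16).
  return static_cast<uint16_t>(Lo) | (static_cast<uint16_t>(Hi) << 8);
}

int main() {
  assert(concatMasks(0x0F, 0x81) == 0x810F);
}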
static SDValue LowerCONCAT_VECTORS(SDValue Op,
@@ -6272,7 +6731,6 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op,
return LowerAVXCONCAT_VECTORS(Op, DAG);
}
-
//===----------------------------------------------------------------------===//
// Vector shuffle lowering
//
@@ -6422,6 +6880,127 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
return DAG.getConstant(Imm, DL, MVT::i8);
}
+/// \brief Compute whether each element of a shuffle is zeroable.
+///
+/// A "zeroable" vector shuffle element is one which can be lowered to zero.
+/// Either it is an undef element in the shuffle mask, the element of the input
+/// referenced is undef, or the element of the input referenced is known to be
+/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
+/// as many lanes with this technique as possible to simplify the remaining
+/// shuffle.
+static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
+ SDValue V1, SDValue V2) {
+ SmallBitVector Zeroable(Mask.size(), false);
+
+ while (V1.getOpcode() == ISD::BITCAST)
+ V1 = V1->getOperand(0);
+ while (V2.getOpcode() == ISD::BITCAST)
+ V2 = V2->getOperand(0);
+
+ bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ int M = Mask[i];
+ // Handle the easy cases.
+ if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
+ Zeroable[i] = true;
+ continue;
+ }
+
+ // If this is an index into a build_vector node (which has the same number
+ // of elements), dig out the input value and use it.
+ SDValue V = M < Size ? V1 : V2;
+ if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
+ continue;
+
+ SDValue Input = V.getOperand(M % Size);
+ // The UNDEF opcode check really should be dead code here, but not quite
+ // worth asserting on (it isn't invalid, just unexpected).
+ if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
+ Zeroable[i] = true;
+ }
+
+ return Zeroable;
+}
+
+// X86 has dedicated unpack instructions that can handle specific blend
+// operations: UNPCKH and UNPCKL.
+static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ SelectionDAG &DAG) {
+ int NumElts = VT.getVectorNumElements();
+ int NumEltsInLane = 128 / VT.getScalarSizeInBits();
+ SmallVector<int, 8> Unpckl;
+ SmallVector<int, 8> Unpckh;
+
+ for (int i = 0; i < NumElts; ++i) {
+ unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
+ int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2);
+ int HiPos = LoPos + NumEltsInLane / 2;
+ Unpckl.push_back(LoPos);
+ Unpckh.push_back(HiPos);
+ }
+
+ if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
+ return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
+ return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
+
+ // Commute and try again.
+ ShuffleVectorSDNode::commuteMask(Unpckl);
+ if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
+ return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
+
+ ShuffleVectorSDNode::commuteMask(Unpckh);
+ if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
+ return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
+
+ return SDValue();
+}
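
The reference masks the helper above builds interleave the low (UNPCKL) or high (UNPCKH) half of each 128-bit lane of the two inputs. A standalone copy of the same index formula, checked for v8i16; buildUnpackMasks is a hypothetical name.

#include <cassert>
#include <vector>

static void buildUnpackMasks(int NumElts, int NumEltsInLane,
                             std::vector<int> &Unpckl,
                             std::vector<int> &Unpckh) {
  for (int i = 0; i < NumElts; ++i) {
    int LaneStart = (i / NumEltsInLane) * NumEltsInLane;
    // Alternate between the first and second input (indices >= NumElts).
    int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2);
    Unpckl.push_back(LoPos);
    Unpckh.push_back(LoPos + NumEltsInLane / 2);
  }
}

int main() {
  std::vector<int> Lo, Hi;
  buildUnpackMasks(8, 8, Lo, Hi);               // v8i16
  assert(Lo == (std::vector<int>{0, 8, 1, 9, 2, 10, 3, 11}));
  assert(Hi == (std::vector<int>{4, 12, 5, 13, 6, 14, 7, 15}));
}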
+
+/// \brief Try to emit a bitmask instruction for a shuffle.
+///
+/// This handles cases where we can model a blend exactly as a bitmask due to
+/// one of the inputs being zeroable.
+static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ MVT EltVT = VT.getVectorElementType();
+ int NumEltBits = EltVT.getSizeInBits();
+ MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
+ SDValue Zero = DAG.getConstant(0, DL, IntEltVT);
+ SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
+ IntEltVT);
+ if (EltVT.isFloatingPoint()) {
+ Zero = DAG.getBitcast(EltVT, Zero);
+ AllOnes = DAG.getBitcast(EltVT, AllOnes);
+ }
+ SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ SDValue V;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ if (Zeroable[i])
+ continue;
+ if (Mask[i] % Size != i)
+ return SDValue(); // Not a blend.
+ if (!V)
+ V = Mask[i] < Size ? V1 : V2;
+ else if (V != (Mask[i] < Size ? V1 : V2))
+ return SDValue(); // Can only let one input through the mask.
+
+ VMaskOps[i] = AllOnes;
+ }
+ if (!V)
+ return SDValue(); // No non-zeroable elements!
+
+ SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
+ V = DAG.getNode(VT.isFloatingPoint()
+ ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
+ DL, VT, V, VMask);
+ return V;
+}
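
On scalars, the bitmask lowering above turns a blend with a zeroable input into a single AND with a per-lane constant: kept lanes are ANDed with all-ones, zeroed lanes with zero. A sketch under that reading; blendAsBitmask is hypothetical.

#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<uint32_t> blendAsBitmask(const std::vector<uint32_t> &V,
                                            const std::vector<bool> &Zeroable) {
  std::vector<uint32_t> Out(V.size());
  for (size_t i = 0; i < V.size(); ++i)
    Out[i] = V[i] & (Zeroable[i] ? 0u : ~0u);  // per-lane AND mask
  return Out;
}

int main() {
  std::vector<uint32_t> V = {1, 2, 3, 4};
  std::vector<uint32_t> R = blendAsBitmask(V, {false, true, false, true});
  assert(R[0] == 1 && R[1] == 0 && R[2] == 3 && R[3] == 0);
}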
+
/// \brief Try to emit a blend instruction for a shuffle using bit math.
///
/// This is used as a fallback approach when first class blend instructions are
@@ -6431,7 +7010,7 @@ static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(VT.isInteger() && "Only supports integer vector types!");
- MVT EltVT = VT.getScalarType();
+ MVT EltVT = VT.getVectorElementType();
int NumEltBits = EltVT.getSizeInBits();
SDValue Zero = DAG.getConstant(0, DL, EltVT);
SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
@@ -6458,22 +7037,62 @@ static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
/// This doesn't do any checks for the availability of instructions for blending
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
/// be matched in the backend with the type given. What it does check for is
-/// that the shuffle mask is in fact a blend.
+/// that the shuffle mask is a blend, or convertible into a blend with zero.
static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
+ SDValue V2, ArrayRef<int> Original,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
+ bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+ SmallVector<int, 8> Mask(Original.begin(), Original.end());
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ bool ForceV1Zero = false, ForceV2Zero = false;
+
+ // Attempt to generate the binary blend mask. If an input is zero then
+ // we can use any lane.
+ // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
unsigned BlendMask = 0;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
- if (Mask[i] >= Size) {
- if (Mask[i] != i + Size)
- return SDValue(); // Shuffled V2 input!
+ int M = Mask[i];
+ if (M < 0)
+ continue;
+ if (M == i)
+ continue;
+ if (M == i + Size) {
BlendMask |= 1u << i;
continue;
}
- if (Mask[i] >= 0 && Mask[i] != i)
- return SDValue(); // Shuffled V1 input!
+ if (Zeroable[i]) {
+ if (V1IsZero) {
+ ForceV1Zero = true;
+ Mask[i] = i;
+ continue;
+ }
+ if (V2IsZero) {
+ ForceV2Zero = true;
+ BlendMask |= 1u << i;
+ Mask[i] = i + Size;
+ continue;
+ }
+ }
+ return SDValue(); // Shuffled input!
}
+
+ // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
+ if (ForceV1Zero)
+ V1 = getZeroVector(VT, Subtarget, DAG, DL);
+ if (ForceV2Zero)
+ V2 = getZeroVector(VT, Subtarget, DAG, DL);
+
+ auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) {
+ unsigned ScaledMask = 0;
+ for (int i = 0; i != Size; ++i)
+ if (BlendMask & (1u << i))
+ for (int j = 0; j != Scale; ++j)
+ ScaledMask |= 1u << (i * Scale + j);
+ return ScaledMask;
+ };
+
switch (VT.SimpleTy) {
case MVT::v2f64:
case MVT::v4f32:
@@ -6493,12 +7112,7 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
if (Subtarget->hasAVX2()) {
// Scale the blend by the number of 32-bit dwords per element.
int Scale = VT.getScalarSizeInBits() / 32;
- BlendMask = 0;
- for (int i = 0, Size = Mask.size(); i < Size; ++i)
- if (Mask[i] >= Size)
- for (int j = 0; j < Scale; ++j)
- BlendMask |= 1u << (i * Scale + j);
-
+ BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
V1 = DAG.getBitcast(BlendVT, V1);
V2 = DAG.getBitcast(BlendVT, V2);
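
The ScaleBlendMask lambda introduced above widens a per-element blend immediate so that each original element contributes Scale consecutive bits. A standalone copy of the same loop with a worked v4i64-to-v8i32 example.

#include <cassert>

static unsigned scaleBlendMask(unsigned BlendMask, int Size, int Scale) {
  unsigned ScaledMask = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1u << i))
      for (int j = 0; j != Scale; ++j)
        ScaledMask |= 1u << (i * Scale + j);  // replicate each selected bit
  return ScaledMask;
}

int main() {
  // v4i64 mask 0b0101 becomes the v8i32 mask 0b00110011 with Scale = 2.
  assert(scaleBlendMask(0b0101, 4, 2) == 0b00110011);
}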
@@ -6511,12 +7125,7 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
// For integer shuffles we need to expand the mask and cast the inputs to
// v8i16s prior to blending.
int Scale = 8 / VT.getVectorNumElements();
- BlendMask = 0;
- for (int i = 0, Size = Mask.size(); i < Size; ++i)
- if (Mask[i] >= Size)
- for (int j = 0; j < Scale; ++j)
- BlendMask |= 1u << (i * Scale + j);
-
+ BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
V1 = DAG.getBitcast(MVT::v8i16, V1);
V2 = DAG.getBitcast(MVT::v8i16, V2);
return DAG.getBitcast(VT,
@@ -6541,9 +7150,13 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
// FALLTHROUGH
case MVT::v16i8:
case MVT::v32i8: {
- assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) &&
+ assert((VT.is128BitVector() || Subtarget->hasAVX2()) &&
"256-bit byte-blends require AVX2 support!");
+ // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
+ if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG))
+ return Masked;
+
// Scale the blend by the number of bytes per element.
int Scale = VT.getScalarSizeInBits() / 8;
@@ -6760,11 +7373,11 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
Hi = DAG.getBitcast(AlignVT, Hi);
return DAG.getBitcast(
- VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo,
+ VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Lo, Hi,
DAG.getConstant(Rotation * Scale, DL, MVT::i8)));
}
- assert(VT.getSizeInBits() == 128 &&
+ assert(VT.is128BitVector() &&
"Rotate-based lowering only supports 128-bit lowering!");
assert(Mask.size() <= 16 &&
"Can shuffle at most 16 bytes in a 128-bit vector!");
@@ -6785,92 +7398,6 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
}
-/// \brief Compute whether each element of a shuffle is zeroable.
-///
-/// A "zeroable" vector shuffle element is one which can be lowered to zero.
-/// Either it is an undef element in the shuffle mask, the element of the input
-/// referenced is undef, or the element of the input referenced is known to be
-/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
-/// as many lanes with this technique as possible to simplify the remaining
-/// shuffle.
-static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
- SDValue V1, SDValue V2) {
- SmallBitVector Zeroable(Mask.size(), false);
-
- while (V1.getOpcode() == ISD::BITCAST)
- V1 = V1->getOperand(0);
- while (V2.getOpcode() == ISD::BITCAST)
- V2 = V2->getOperand(0);
-
- bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
- bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
-
- for (int i = 0, Size = Mask.size(); i < Size; ++i) {
- int M = Mask[i];
- // Handle the easy cases.
- if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
- Zeroable[i] = true;
- continue;
- }
-
- // If this is an index into a build_vector node (which has the same number
- // of elements), dig out the input value and use it.
- SDValue V = M < Size ? V1 : V2;
- if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
- continue;
-
- SDValue Input = V.getOperand(M % Size);
- // The UNDEF opcode check really should be dead code here, but not quite
- // worth asserting on (it isn't invalid, just unexpected).
- if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
- Zeroable[i] = true;
- }
-
- return Zeroable;
-}
-
-/// \brief Try to emit a bitmask instruction for a shuffle.
-///
-/// This handles cases where we can model a blend exactly as a bitmask due to
-/// one of the inputs being zeroable.
-static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- SelectionDAG &DAG) {
- MVT EltVT = VT.getScalarType();
- int NumEltBits = EltVT.getSizeInBits();
- MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
- SDValue Zero = DAG.getConstant(0, DL, IntEltVT);
- SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
- IntEltVT);
- if (EltVT.isFloatingPoint()) {
- Zero = DAG.getBitcast(EltVT, Zero);
- AllOnes = DAG.getBitcast(EltVT, AllOnes);
- }
- SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
- SDValue V;
- for (int i = 0, Size = Mask.size(); i < Size; ++i) {
- if (Zeroable[i])
- continue;
- if (Mask[i] % Size != i)
- return SDValue(); // Not a blend.
- if (!V)
- V = Mask[i] < Size ? V1 : V2;
- else if (V != (Mask[i] < Size ? V1 : V2))
- return SDValue(); // Can only let one input through the mask.
-
- VMaskOps[i] = AllOnes;
- }
- if (!V)
- return SDValue(); // No non-zeroable elements!
-
- SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
- V = DAG.getNode(VT.isFloatingPoint()
- ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
- DL, VT, V, VMask);
- return V;
-}
-
/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
@@ -6982,7 +7509,7 @@ static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1,
// Determine the extraction length from the part of the
// lower half that isn't zeroable.
int Len = HalfSize;
- for (; Len >= 0; --Len)
+ for (; Len > 0; --Len)
if (!Zeroable[Len - 1])
break;
assert(Len > 0 && "Zeroable shuffle mask");
@@ -6997,8 +7524,9 @@ static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1,
SDValue &V = (M < Size ? V1 : V2);
M = M % Size;
- // All mask elements must be in the lower half.
- if (M > HalfSize)
+ // The extracted elements must start at a valid index and all mask
+ // elements must be in the lower half.
+ if (i > M || M >= HalfSize)
return SDValue();
if (Idx < 0 || (Src == V && Idx == (M - i))) {
@@ -7095,64 +7623,104 @@ static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1,
///
/// Given a specific number of elements, element bit width, and extension
/// stride, produce either a zero or any extension based on the available
-/// features of the subtarget.
+/// features of the subtarget. The extended elements are consecutive and
+/// can start from an offset element index in the input; to avoid excess
+/// shuffling the offset must either be in the bottom lane or at the start
+/// of a higher lane. All extended elements must be from the same lane.
static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
- SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
+ SDLoc DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) {
assert(Scale > 1 && "Need a scale to extend.");
- int NumElements = VT.getVectorNumElements();
int EltBits = VT.getScalarSizeInBits();
+ int NumElements = VT.getVectorNumElements();
+ int NumEltsPerLane = 128 / EltBits;
+ int OffsetLane = Offset / NumEltsPerLane;
assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
"Only 8, 16, and 32 bit elements can be extended.");
assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
+ assert(0 <= Offset && "Extension offset must be non-negative.");
+ assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
+ "Extension offset must be in the first lane or start an upper lane.");
+
+ // Check that an index is in same lane as the base offset.
+ auto SafeOffset = [&](int Idx) {
+ return OffsetLane == (Idx / NumEltsPerLane);
+ };
+
+ // Shift along an input so that the offset base moves to the first element.
+ auto ShuffleOffset = [&](SDValue V) {
+ if (!Offset)
+ return V;
+
+ SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
+ for (int i = 0; i * Scale < NumElements; ++i) {
+ int SrcIdx = i + Offset;
+ ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
+ }
+ return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
+ };
// Found a valid zext mask! Try various lowering strategies based on the
// input type and available ISA extensions.
if (Subtarget->hasSSE41()) {
+ // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
+ // PUNPCK will catch this in a later shuffle match.
+ if (Offset && Scale == 2 && VT.is128BitVector())
+ return SDValue();
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
- return DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
+ InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, ShuffleOffset(InputV));
+ return DAG.getBitcast(VT, InputV);
}
+ assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
+
// For any extends we can cheat for larger element sizes and use shuffle
// instructions that can fold with a load and/or copy.
if (AnyExt && EltBits == 32) {
- int PSHUFDMask[4] = {0, -1, 1, -1};
+ int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
+ -1};
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
DAG.getBitcast(MVT::v4i32, InputV),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
}
if (AnyExt && EltBits == 16 && Scale > 2) {
- int PSHUFDMask[4] = {0, -1, 0, -1};
+ int PSHUFDMask[4] = {Offset / 2, -1,
+ SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
DAG.getBitcast(MVT::v4i32, InputV),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
- int PSHUFHWMask[4] = {1, -1, -1, -1};
+ int PSHUFWMask[4] = {1, -1, -1, -1};
+ unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
return DAG.getBitcast(
- VT, DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
+ VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
DAG.getBitcast(MVT::v8i16, InputV),
- getV4X86ShuffleImm8ForMask(PSHUFHWMask, DL, DAG)));
+ getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
}
// The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
// to 64-bits.
if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) {
assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
- assert(VT.getSizeInBits() == 128 && "Unexpected vector width!");
+ assert(VT.is128BitVector() && "Unexpected vector width!");
+ int LoIdx = Offset * EltBits;
SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
DAG.getConstant(EltBits, DL, MVT::i8),
- DAG.getConstant(0, DL, MVT::i8)));
- if (isUndefInRange(Mask, NumElements/2, NumElements/2))
+ DAG.getConstant(LoIdx, DL, MVT::i8)));
+
+ if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
+ !SafeOffset(Offset + 1))
return DAG.getNode(ISD::BITCAST, DL, VT, Lo);
- SDValue Hi =
- DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
- DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
- DAG.getConstant(EltBits, DL, MVT::i8),
- DAG.getConstant(EltBits, DL, MVT::i8)));
+ int HiIdx = (Offset + 1) * EltBits;
+ SDValue Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+ DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+ DAG.getConstant(EltBits, DL, MVT::i8),
+ DAG.getConstant(HiIdx, DL, MVT::i8)));
return DAG.getNode(ISD::BITCAST, DL, VT,
DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
}
@@ -7163,9 +7731,11 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
assert(NumElements == 16 && "Unexpected byte vector width!");
SDValue PSHUFBMask[16];
- for (int i = 0; i < 16; ++i)
- PSHUFBMask[i] =
- DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, DL, MVT::i8);
+ for (int i = 0; i < 16; ++i) {
+ int Idx = Offset + (i / Scale);
+ PSHUFBMask[i] = DAG.getConstant(
+ (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
+ }
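+    // For example (illustrative): Scale == 4 and Offset == 4 build the byte
+    // mask <4, 0x80, 0x80, 0x80, 5, 0x80, ..., 7, 0x80, 0x80, 0x80>, placing
+    // input bytes 4-7 in the low byte of each dword and zeroing the rest.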
InputV = DAG.getBitcast(MVT::v16i8, InputV);
return DAG.getBitcast(VT,
DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
@@ -7173,13 +7743,30 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
MVT::v16i8, PSHUFBMask)));
}
+ // If we are extending from an offset, ensure we start on a boundary that
+ // we can unpack from.
+ int AlignToUnpack = Offset % (NumElements / Scale);
+ if (AlignToUnpack) {
+ SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
+ for (int i = AlignToUnpack; i < NumElements; ++i)
+ ShMask[i - AlignToUnpack] = i;
+ InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
+ Offset -= AlignToUnpack;
+ }
+
// Otherwise emit a sequence of unpacks.
do {
+ unsigned UnpackLoHi = X86ISD::UNPCKL;
+ if (Offset >= (NumElements / 2)) {
+ UnpackLoHi = X86ISD::UNPCKH;
+ Offset -= (NumElements / 2);
+ }
+
MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
: getZeroVector(InputVT, Subtarget, DAG, DL);
InputV = DAG.getBitcast(InputVT, InputV);
- InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
+ InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
Scale /= 2;
EltBits *= 2;
NumElements /= 2;
@@ -7205,7 +7792,9 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
int Bits = VT.getSizeInBits();
+ int NumLanes = Bits / 128;
int NumElements = VT.getVectorNumElements();
+ int NumEltsPerLane = NumElements / NumLanes;
assert(VT.getScalarSizeInBits() <= 32 &&
"Exceeds 32-bit integer zero extension limit");
assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
@@ -7215,8 +7804,11 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
auto Lower = [&](int Scale) -> SDValue {
SDValue InputV;
bool AnyExt = true;
+ int Offset = 0;
+ int Matches = 0;
for (int i = 0; i < NumElements; ++i) {
- if (Mask[i] == -1)
+ int M = Mask[i];
+ if (M == -1)
continue; // Valid anywhere but doesn't tell us anything.
if (i % Scale != 0) {
// Each of the extended elements need to be zeroable.
@@ -7230,14 +7822,29 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
// Each of the base elements needs to be consecutive indices into the
// same input vector.
- SDValue V = Mask[i] < NumElements ? V1 : V2;
- if (!InputV)
+ SDValue V = M < NumElements ? V1 : V2;
+ M = M % NumElements;
+ if (!InputV) {
InputV = V;
- else if (InputV != V)
+ Offset = M - (i / Scale);
+ } else if (InputV != V)
return SDValue(); // Flip-flopping inputs.
- if (Mask[i] % NumElements != i / Scale)
+ // Offset must start in the lowest 128-bit lane or at the start of an
+ // upper lane.
+ // FIXME: Is it ever worth allowing a negative base offset?
+ if (!((0 <= Offset && Offset < NumEltsPerLane) ||
+ (Offset % NumEltsPerLane) == 0))
+ return SDValue();
+
+ // If we are offsetting, all referenced entries must come from the same
+ // lane.
+ if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
+ return SDValue();
+
+ if ((M % NumElements) != (Offset + (i / Scale)))
return SDValue(); // Non-consecutive strided elements.
+ Matches++;
}
// If we fail to find an input, we have a zero-shuffle which should always
@@ -7246,8 +7853,13 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
if (!InputV)
return SDValue();
+  // If we are offsetting, don't extend if we only match a single input; we
+  // can always do better by using a basic PSHUF or PUNPCK.
+ if (Offset != 0 && Matches < 2)
+ return SDValue();
+
return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
- DL, VT, Scale, AnyExt, InputV, Mask, Subtarget, DAG);
+ DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
};
// The widest scale possible for extending is to a 64-bit integer.
@@ -7355,8 +7967,9 @@ static SDValue lowerVectorShuffleAsElementInsertion(
// all the smarts here sunk into that routine. However, the current
// lowering of BUILD_VECTOR makes that nearly impossible until the old
// vector shuffle lowering is dead.
- if (SDValue V2S = getScalarValueForVectorElement(
- V2, Mask[V2Index] - Mask.size(), DAG)) {
+ SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
+ DAG);
+ if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
// We need to zext the scalar if it is smaller than an i32.
V2S = DAG.getBitcast(EltVT, V2S);
if (EltVT == MVT::i8 || EltVT == MVT::i16) {
@@ -7431,11 +8044,65 @@ static SDValue lowerVectorShuffleAsElementInsertion(
return V2;
}
+/// \brief Try to lower broadcast of a single - truncated - integer element,
+/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
+///
+/// This assumes we have AVX2.
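+/// For example (illustrative): broadcasting byte index 1 of a v16i8 shuffle
+/// whose source was built as a v4i32 BUILD_VECTOR shifts the 32-bit scalar
+/// right by 8 bits, truncates it to i8 and broadcasts the result.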
+static SDValue lowerVectorShuffleAsTruncBroadcast(SDLoc DL, MVT VT, SDValue V0,
+ int BroadcastIdx,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ assert(Subtarget->hasAVX2() &&
+ "We can only lower integer broadcasts with AVX2!");
+
+ EVT EltVT = VT.getVectorElementType();
+ EVT V0VT = V0.getValueType();
+
+ assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
+ assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
+
+ EVT V0EltVT = V0VT.getVectorElementType();
+ if (!V0EltVT.isInteger())
+ return SDValue();
+
+ const unsigned EltSize = EltVT.getSizeInBits();
+ const unsigned V0EltSize = V0EltVT.getSizeInBits();
+
+ // This is only a truncation if the original element type is larger.
+ if (V0EltSize <= EltSize)
+ return SDValue();
+
+ assert(((V0EltSize % EltSize) == 0) &&
+ "Scalar type sizes must all be powers of 2 on x86!");
+
+ const unsigned V0Opc = V0.getOpcode();
+ const unsigned Scale = V0EltSize / EltSize;
+ const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
+
+ if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
+ V0Opc != ISD::BUILD_VECTOR)
+ return SDValue();
+
+ SDValue Scalar = V0.getOperand(V0BroadcastIdx);
+
+ // If we're extracting non-least-significant bits, shift so we can truncate.
+ // Hopefully, we can fold away the trunc/srl/load into the broadcast.
+ // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
+ // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
+ if (const int OffsetIdx = BroadcastIdx % Scale)
+ Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
+ DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
+
+ return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
+ DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
+}
+
/// \brief Try to lower broadcast of a single element.
///
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
+/// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
ArrayRef<int> Mask,
const X86Subtarget *Subtarget,
@@ -7476,7 +8143,7 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
int BeginIdx = (int)ConstantIdx->getZExtValue();
int EndIdx =
- BeginIdx + (int)VInner.getValueType().getVectorNumElements();
+ BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
BroadcastIdx -= BeginIdx;
V = VInner;
@@ -7491,6 +8158,15 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
// Check if this is a broadcast of a scalar. We special case lowering
// for scalars so that we can more effectively fold with loads.
+ // First, look through bitcast: if the original value has a larger element
+ // type than the shuffle, the broadcast element is in essence truncated.
+ // Make that explicit to ease folding.
+ if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
+ if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
+ DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
+ return TruncBroadcast;
+
+ // Also check the simpler case, where we can directly reuse the scalar.
if (V.getOpcode() == ISD::BUILD_VECTOR ||
(V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
V = V.getOperand(BroadcastIdx);
@@ -7499,6 +8175,20 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
// Only AVX2 has register broadcasts.
if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
return SDValue();
+ } else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) {
+ // If we are broadcasting a load that is only used by the shuffle
+ // then we can reduce the vector load to the broadcasted scalar load.
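+    // For example (illustrative): broadcasting element 2 of a v4f32 load
+    // becomes a broadcast of a scalar f32 load from BaseAddr + 8.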
+ LoadSDNode *Ld = cast<LoadSDNode>(V);
+ SDValue BaseAddr = Ld->getOperand(1);
+ EVT AddrVT = BaseAddr.getValueType();
+ EVT SVT = VT.getScalarType();
+ unsigned Offset = BroadcastIdx * SVT.getStoreSize();
+ SDValue NewAddr = DAG.getNode(
+ ISD::ADD, DL, AddrVT, BaseAddr,
+ DAG.getConstant(Offset, DL, AddrVT));
+ V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
+ DAG.getMachineFunction().getMachineMemOperand(
+ Ld->getMemOperand(), Offset, SVT.getStoreSize()));
} else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
// We can't broadcast from a vector register without AVX2, and we can only
// broadcast from the zero-element of a vector register.
@@ -7595,9 +8285,10 @@ static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
/// because for floating point vectors we have a generalized SHUFPS lowering
/// strategy that handles everything that doesn't *exactly* match an unpack,
/// making this clever lowering unnecessary.
-static SDValue lowerVectorShuffleAsUnpack(SDLoc DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- SelectionDAG &DAG) {
+static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
assert(!VT.isFloatingPoint() &&
"This routine only supports integer vectors.");
assert(!isSingleInputShuffleMask(Mask) &&
@@ -7774,10 +8465,9 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 2}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {1, 3}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
+ return V;
unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
@@ -7869,10 +8559,9 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 2}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {1, 3}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
+ return V;
// Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
@@ -8077,14 +8766,9 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
}
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V2, V1);
- if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V2, V1);
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
+ return V;
// Otherwise fall back to a SHUFPS lowering strategy.
return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
@@ -8161,14 +8845,9 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V2, V1);
- if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V2, V1);
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
+ return V;
// Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
@@ -8184,8 +8863,8 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
Mask, DAG);
// Try to lower by permuting the inputs into an unpack instruction.
- if (SDValue Unpack =
- lowerVectorShuffleAsUnpack(DL, MVT::v4i32, V1, V2, Mask, DAG))
+ if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1,
+ V2, Mask, DAG))
return Unpack;
// We implement this with SHUFPS because it can blend from two vectors.
@@ -8218,7 +8897,7 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
SDLoc DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
const X86Subtarget *Subtarget, SelectionDAG &DAG) {
- assert(VT.getScalarType() == MVT::i16 && "Bad input type!");
+ assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
@@ -8286,16 +8965,18 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
assert(AToAInputs.size() + BToAInputs.size() == 4 &&
"Must call this with either 3:1 or 1:3 inputs (summing to 4).");
+ bool ThreeAInputs = AToAInputs.size() == 3;
+
// Compute the index of dword with only one word among the three inputs in
// a half by taking the sum of the half with three inputs and subtracting
// the sum of the actual three inputs. The difference is the remaining
// slot.
int ADWord, BDWord;
- int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord;
- int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord;
- int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset;
- ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs;
- int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0];
+ int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
+ int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
+ int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
+ ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
+ int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
int TripleNonInputIdx =
TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
@@ -8364,8 +9045,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
} else {
assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
- int APinnedIdx =
- AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
+ int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
}
}
@@ -8751,10 +9431,9 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return Shift;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V1, Mask, {0, 0, 1, 1, 2, 2, 3, 3}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V1);
- if (isShuffleEquivalent(V1, V1, Mask, {4, 4, 5, 5, 6, 6, 7, 7}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V1);
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
+ return V;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
@@ -8798,10 +9477,9 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 2, 10, 3, 11}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {4, 12, 5, 13, 6, 14, 7, 15}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
+ return V;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
@@ -8812,8 +9490,8 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
return BitBlend;
- if (SDValue Unpack =
- lowerVectorShuffleAsUnpack(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
+ V2, Mask, DAG))
return Unpack;
// If we can't directly blend but can use PSHUFB, that will be better as it
@@ -9037,17 +9715,14 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return V;
}
+ if (SDValue Masked =
+ lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, DAG))
+ return Masked;
+
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, {// Low half.
- 0, 16, 1, 17, 2, 18, 3, 19,
- // High half.
- 4, 20, 5, 21, 6, 22, 7, 23}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {// Low half.
- 8, 24, 9, 25, 10, 26, 11, 27,
- // High half.
- 12, 28, 13, 29, 14, 30, 15, 31}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V1, V2);
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
+ return V;
// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
// with PSHUFB. It is important to do this before we attempt to generate any
@@ -9086,8 +9761,8 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// FIXME: It might be worth trying to detect if the unpack-feeding
// shuffles will both be pshufb, in which case we shouldn't bother with
// this.
- if (SDValue Unpack =
- lowerVectorShuffleAsUnpack(DL, MVT::v16i8, V1, V2, Mask, DAG))
+ if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
+ DL, MVT::v16i8, V1, V2, Mask, DAG))
return Unpack;
}
@@ -9296,7 +9971,7 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
int NumElements = VT.getVectorNumElements();
int SplitNumElements = NumElements / 2;
- MVT ScalarVT = VT.getScalarType();
+ MVT ScalarVT = VT.getVectorElementType();
MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
// Rather than splitting build-vectors, just build two narrower build
@@ -9308,7 +9983,7 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
MVT OrigVT = V.getSimpleValueType();
int OrigNumElements = OrigVT.getVectorNumElements();
int OrigSplitNumElements = OrigNumElements / 2;
- MVT OrigScalarVT = OrigVT.getScalarType();
+ MVT OrigScalarVT = OrigVT.getVectorElementType();
MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
SDValue LoV, HiV;
@@ -9478,7 +10153,7 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
// FIXME: This should probably be generalized for 512-bit vectors as well.
- assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!");
+ assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
int LaneSize = Mask.size() / 2;
// If there are only inputs from one 128-bit lane, splitting will in fact be
@@ -9682,6 +10357,108 @@ static SDValue lowerVectorShuffleByMerging128BitLanes(
return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
}
+/// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
+/// This allows for fast cases such as subvector extraction/insertion
+/// or shuffling smaller vector types which can lower more efficiently.
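+/// For example (illustrative): the v8f32 mask <0, 2, 9, 11, u, u, u, u> only
+/// reads the lower halves of both inputs, so it can be performed as a v4f32
+/// shuffle whose result is inserted into the lower half of an undef vector.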
+static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ assert(VT.getSizeInBits() == 256 && "Expected 256-bit vector");
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfNumElts = NumElts / 2;
+ MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
+
+ bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
+ bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
+ if (!UndefLower && !UndefUpper)
+ return SDValue();
+
+ // Upper half is undef and lower half is whole upper subvector.
+ // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+ if (UndefUpper &&
+ isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+ DAG.getIntPtrConstant(HalfNumElts, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ // Lower half is undef and upper half is whole lower subvector.
+ // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+ if (UndefLower &&
+ isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+ DAG.getIntPtrConstant(HalfNumElts, DL));
+ }
+
+ // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
+ if (UndefLower && Subtarget->hasAVX2() &&
+ (VT == MVT::v4f64 || VT == MVT::v4i64))
+ return SDValue();
+
+ // If the shuffle only uses the lower halves of the input operands,
+ // then extract them and perform the 'half' shuffle at half width.
+ // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
+ int HalfIdx1 = -1, HalfIdx2 = -1;
+ SmallVector<int, 8> HalfMask;
+ unsigned Offset = UndefLower ? HalfNumElts : 0;
+ for (unsigned i = 0; i != HalfNumElts; ++i) {
+ int M = Mask[i + Offset];
+ if (M < 0) {
+ HalfMask.push_back(M);
+ continue;
+ }
+
+ // Determine which of the 4 half vectors this element is from.
+ // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
+ int HalfIdx = M / HalfNumElts;
+
+ // Only shuffle using the lower halves of the inputs.
+ // TODO: Investigate usefulness of shuffling with upper halves.
+ if (HalfIdx != 0 && HalfIdx != 2)
+ return SDValue();
+
+ // Determine the element index into its half vector source.
+ int HalfElt = M % HalfNumElts;
+
+ // We can shuffle with up to 2 half vectors, set the new 'half'
+ // shuffle mask accordingly.
+ if (-1 == HalfIdx1 || HalfIdx1 == HalfIdx) {
+ HalfMask.push_back(HalfElt);
+ HalfIdx1 = HalfIdx;
+ continue;
+ }
+ if (-1 == HalfIdx2 || HalfIdx2 == HalfIdx) {
+ HalfMask.push_back(HalfElt + HalfNumElts);
+ HalfIdx2 = HalfIdx;
+ continue;
+ }
+
+ // Too many half vectors referenced.
+ return SDValue();
+ }
+ assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
+
+ auto GetHalfVector = [&](int HalfIdx) {
+ if (HalfIdx < 0)
+ return DAG.getUNDEF(HalfVT);
+ SDValue V = (HalfIdx < 2 ? V1 : V2);
+ HalfIdx = (HalfIdx % 2) * HalfNumElts;
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
+ DAG.getIntPtrConstant(HalfIdx, DL));
+ };
+
+ SDValue Half1 = GetHalfVector(HalfIdx1);
+ SDValue Half2 = GetHalfVector(HalfIdx2);
+ SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
+ DAG.getIntPtrConstant(Offset, DL));
+}
+
/// \brief Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
///
@@ -9776,16 +10553,10 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DAG);
}
- // X86 has dedicated unpack instructions that can handle specific blend
- // operations: UNPCKH and UNPCKL.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V2, V1);
- if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1);
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
+ return V;
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
Subtarget, DAG))
@@ -9876,14 +10647,9 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return Shift;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V2, V1);
- if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V2, V1);
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
+ return V;
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle. However, if we have AVX2 and either inputs are already in place,
@@ -9941,14 +10707,9 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V2, V1);
- if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V2, V1);
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
+ return V;
// Otherwise, fall back to a SHUFPS sequence. Here it is important that we
// have already handled any direct blends. We also need to squash the
@@ -9974,9 +10735,7 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
if (Subtarget->hasAVX2())
return DAG.getNode(
X86ISD::VPERMV, DL, MVT::v8f32,
- DAG.getBitcast(MVT::v8f32, DAG.getNode(ISD::BUILD_VECTOR, DL,
- MVT::v8i32, VPermMask)),
- V1);
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
// Otherwise, fall back.
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
@@ -10041,14 +10800,9 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V2, V1);
- if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V2, V1);
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
+ return V;
}
// Try to use shift instructions.
@@ -10115,18 +10869,9 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask,
- {// First 128-bit lane:
- 0, 16, 1, 17, 2, 18, 3, 19,
- // Second 128-bit lane:
- 8, 24, 9, 25, 10, 26, 11, 27}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask,
- {// First 128-bit lane:
- 4, 20, 5, 21, 6, 22, 7, 23,
- // Second 128-bit lane:
- 12, 28, 13, 29, 14, 30, 15, 31}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
+ return V;
// Try to use shift instructions.
if (SDValue Shift =
@@ -10215,22 +10960,9 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- // Note that these are repeated 128-bit lane unpacks, not unpacks across all
- // 256-bit lanes.
- if (isShuffleEquivalent(
- V1, V2, Mask,
- {// First 128-bit lane:
- 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
- // Second 128-bit lane:
- 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
- if (isShuffleEquivalent(
- V1, V2, Mask,
- {// First 128-bit lane:
- 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
- // Second 128-bit lane:
- 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
+ return V;
// Try to use shift instructions.
if (SDValue Shift =
@@ -10296,12 +11028,17 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DL, VT, V1, V2, Mask, Subtarget, DAG))
return Insertion;
- // There is a really nice hard cut-over between AVX1 and AVX2 that means we can
- // check for those subtargets here and avoid much of the subtarget querying in
- // the per-vector-type lowering routines. With AVX1 we have essentially *zero*
- // ability to manipulate a 256-bit vector with integer types. Since we'll use
- // floating point types there eventually, just immediately cast everything to
- // a float and operate entirely in that domain.
+ // Handle special cases where the lower or upper half is UNDEF.
+ if (SDValue V =
+ lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
+ // There is a really nice hard cut-over between AVX1 and AVX2 that means we
+ // can check for those subtargets here and avoid much of the subtarget
+ // querying in the per-vector-type lowering routines. With AVX1 we have
+ // essentially *zero* ability to manipulate a 256-bit vector with integer
+ // types. Since we'll use floating point types there eventually, just
+ // immediately cast everything to a float and operate entirely in that domain.
if (VT.isInteger() && !Subtarget->hasAVX2()) {
int ElementBits = VT.getScalarSizeInBits();
if (ElementBits < 32)
@@ -10334,6 +11071,57 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
}
}
+/// \brief Try to lower a vector shuffle as a shuffle of 128-bit blocks.
+static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT,
+ ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ SelectionDAG &DAG) {
+ assert(VT.getScalarSizeInBits() == 64 &&
+ "Unexpected element type size for 128bit shuffle.");
+
+  // Handling a 256-bit vector would require VLX, and a function such as
+  // lowerV2X128VectorShuffle() is most probably a better solution for it.
+ assert(VT.is512BitVector() && "Unexpected vector size for 128bit shuffle.");
+
+ SmallVector<int, 4> WidenedMask;
+ if (!canWidenShuffleElements(Mask, WidenedMask))
+ return SDValue();
+
+ // Form a 128-bit permutation.
+ // Convert the 64-bit shuffle mask selection values into 128-bit selection
+ // bits defined by a vshuf64x2 instruction's immediate control byte.
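+  // For example (illustrative): a widened mask of <0, 1, 6, 7> selects the
+  // two low 128-bit blocks of V1 followed by the two high blocks of V2 and
+  // encodes as PermMask == 0xE4.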
+ unsigned PermMask = 0, Imm = 0;
+ unsigned ControlBitsNum = WidenedMask.size() / 2;
+
+ for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
+ if (WidenedMask[i] == SM_SentinelZero)
+ return SDValue();
+
+ // Use first element in place of undef mask.
+ Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
+ PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum);
+ }
+
+ return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
+ DAG.getConstant(PermMask, DL, MVT::i8));
+}
+
+static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
+
+ assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV");
+
+ MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
+ MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
+
+ SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
+ if (isSingleInputShuffleMask(Mask))
+ return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
+
+ return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
+}
+
/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
const X86Subtarget *Subtarget,
@@ -10345,21 +11133,21 @@ static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
- // X86 has dedicated unpack instructions that can handle specific blend
- // operations: UNPCKH and UNPCKL.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
+ if (SDValue Shuf128 =
+ lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
+ return Shuf128;
- // FIXME: Implement direct support for this type!
- return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
+ if (SDValue Unpck =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
+ return Unpck;
+
+ return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
@@ -10367,22 +11155,11 @@ static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask,
- {// First 128-bit lane.
- 0, 16, 1, 17, 4, 20, 5, 21,
- // Second 128-bit lane.
- 8, 24, 9, 25, 12, 28, 13, 29}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask,
- {// First 128-bit lane.
- 2, 18, 3, 19, 6, 22, 7, 23,
- // Second 128-bit lane.
- 10, 26, 11, 27, 14, 30, 15, 31}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
+ if (SDValue Unpck =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
+ return Unpck;
- // FIXME: Implement direct support for this type!
- return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
+ return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
@@ -10396,21 +11173,21 @@ static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
- // X86 has dedicated unpack instructions that can handle specific blend
- // operations: UNPCKH and UNPCKL.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
+ if (SDValue Shuf128 =
+ lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
+ return Shuf128;
- // FIXME: Implement direct support for this type!
- return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
+ if (SDValue Unpck =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
+ return Unpck;
+
+ return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
@@ -10418,22 +11195,11 @@ static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask,
- {// First 128-bit lane.
- 0, 16, 1, 17, 4, 20, 5, 21,
- // Second 128-bit lane.
- 8, 24, 9, 25, 12, 28, 13, 29}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask,
- {// First 128-bit lane.
- 2, 18, 3, 19, 6, 22, 7, 23,
- // Second 128-bit lane.
- 10, 26, 11, 27, 14, 30, 15, 31}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
+ if (SDValue Unpck =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
+ return Unpck;
- // FIXME: Implement direct support for this type!
- return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
+ return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
@@ -10448,8 +11214,7 @@ static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
- // FIXME: Implement direct support for this type!
- return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG);
+ return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
@@ -10517,6 +11282,60 @@ static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
}
+// Lower vXi1 vector shuffles.
+// There is no dedicated instruction on AVX-512 that shuffles the masks.
+// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
+// vector, shuffle it and then truncate it back.
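+// For example (illustrative): a v16i1 shuffle is sign-extended to v16i32,
+// shuffled with the normal 512-bit lowering and truncated back to v16i1.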
+static SDValue lower1BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ MVT VT, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Subtarget->hasAVX512() &&
+ "Cannot lower 512-bit vectors w/o basic ISA!");
+ MVT ExtVT;
+ switch (VT.SimpleTy) {
+ default:
+ llvm_unreachable("Expected a vector of i1 elements");
+ case MVT::v2i1:
+ ExtVT = MVT::v2i64;
+ break;
+ case MVT::v4i1:
+ ExtVT = MVT::v4i32;
+ break;
+ case MVT::v8i1:
+ ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
+ break;
+ case MVT::v16i1:
+ ExtVT = MVT::v16i32;
+ break;
+ case MVT::v32i1:
+ ExtVT = MVT::v32i16;
+ break;
+ case MVT::v64i1:
+ ExtVT = MVT::v64i8;
+ break;
+ }
+
+ if (ISD::isBuildVectorAllZeros(V1.getNode()))
+ V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
+ else if (ISD::isBuildVectorAllOnes(V1.getNode()))
+ V1 = getOnesVector(ExtVT, Subtarget, DAG, DL);
+ else
+ V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
+
+ if (V2.isUndef())
+ V2 = DAG.getUNDEF(ExtVT);
+ else if (ISD::isBuildVectorAllZeros(V2.getNode()))
+ V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
+ else if (ISD::isBuildVectorAllOnes(V2.getNode()))
+ V2 = getOnesVector(ExtVT, Subtarget, DAG, DL);
+ else
+ V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT,
+ DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask));
+}
/// \brief Top-level lowering for x86 vector shuffles.
///
/// This handles decomposition, canonicalization, and lowering of all x86
@@ -10533,8 +11352,10 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
MVT VT = Op.getSimpleValueType();
int NumElements = VT.getVectorNumElements();
SDLoc dl(Op);
+ bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
- assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
+ assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
+ "Can't lower MMX shuffles");
bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
@@ -10572,7 +11393,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
// elements wider than 64 bits, but it might be interesting to form i128
// integers to handle flipping the low and high halves of AVX 256-bit vectors.
SmallVector<int, 16> WidenedMask;
- if (VT.getScalarSizeInBits() < 64 &&
+ if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
canWidenShuffleElements(Mask, WidenedMask)) {
MVT NewEltVT = VT.isFloatingPoint()
? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
@@ -10640,17 +11461,17 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
}
// For each vector width, delegate to a specialized lowering routine.
- if (VT.getSizeInBits() == 128)
+ if (VT.is128BitVector())
return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
- if (VT.getSizeInBits() == 256)
+ if (VT.is256BitVector())
return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
- // Force AVX-512 vectors to be scalarized for now.
- // FIXME: Implement AVX-512 support!
- if (VT.getSizeInBits() == 512)
+ if (VT.is512BitVector())
return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
+ if (Is1BitVector)
+ return lower1BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
llvm_unreachable("Unimplemented!");
}
@@ -10661,11 +11482,16 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
unsigned &MaskValue) {
MaskValue = 0;
unsigned NumElems = BuildVector->getNumOperands();
+
// There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
+ // We don't handle the >2 lanes case right now.
unsigned NumLanes = (NumElems - 1) / 8 + 1;
+ if (NumLanes > 2)
+ return false;
+
unsigned NumElemsInLane = NumElems / NumLanes;
- // Blend for v16i16 should be symetric for the both lanes.
+  // Blend for v16i16 should be symmetric for both lanes.
for (unsigned i = 0; i < NumElemsInLane; ++i) {
SDValue EltCond = BuildVector->getOperand(i);
SDValue SndLaneEltCond =
@@ -10673,20 +11499,25 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
int Lane1Cond = -1, Lane2Cond = -1;
if (isa<ConstantSDNode>(EltCond))
- Lane1Cond = !isZero(EltCond);
+ Lane1Cond = !isNullConstant(EltCond);
if (isa<ConstantSDNode>(SndLaneEltCond))
- Lane2Cond = !isZero(SndLaneEltCond);
+ Lane2Cond = !isNullConstant(SndLaneEltCond);
+ unsigned LaneMask = 0;
if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
// Lane1Cond != 0, means we want the first argument.
// Lane1Cond == 0, means we want the second argument.
// The encoding of this argument is 0 for the first argument, 1
// for the second. Therefore, invert the condition.
- MaskValue |= !Lane1Cond << i;
+ LaneMask = !Lane1Cond << i;
else if (Lane1Cond < 0)
- MaskValue |= !Lane2Cond << i;
+ LaneMask = !Lane2Cond << i;
else
return false;
+
+ MaskValue |= LaneMask;
+ if (NumLanes == 2)
+ MaskValue |= LaneMask << NumElemsInLane;
}
return true;
}
@@ -10711,7 +11542,8 @@ static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
SDValue CondElt = CondBV->getOperand(i);
Mask.push_back(
- isa<ConstantSDNode>(CondElt) ? i + (isZero(CondElt) ? Size : 0) : -1);
+ isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
+ : -1);
}
return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
}
@@ -10776,9 +11608,8 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
}
if (VT.getSizeInBits() == 16) {
- unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
// If Idx is 0, it's cheaper to do a move instead of a pextrw.
- if (Idx == 0)
+ if (isNullConstant(Op.getOperand(1)))
return DAG.getNode(
ISD::TRUNCATE, dl, MVT::i16,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
@@ -10801,8 +11632,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
return SDValue();
SDNode *User = *Op.getNode()->use_begin();
if ((User->getOpcode() != ISD::STORE ||
- (isa<ConstantSDNode>(Op.getOperand(1)) &&
- cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
+ isNullConstant(Op.getOperand(1))) &&
(User->getOpcode() != ISD::BITCAST ||
User->getValueType(0) != MVT::i32))
return SDValue();
@@ -10900,10 +11730,11 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
MVT EltVT = VecVT.getVectorElementType();
unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
+ assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
- //if (IdxVal >= NumElems/2)
- // IdxVal -= NumElems/2;
- IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
+ // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
+ // this can be done with a mask.
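+  // For example (illustrative): extracting element 9 of a v16i16 leaves
+  // IdxVal == 1 once the upper 128-bit chunk has been selected.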
+ IdxVal &= ElemsPerChunk - 1;
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
DAG.getConstant(IdxVal, dl, MVT::i32));
}
@@ -10918,8 +11749,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// TODO: handle v16i8.
if (VT.getSizeInBits() == 16) {
SDValue Vec = Op.getOperand(0);
- unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- if (Idx == 0)
+ if (isNullConstant(Op.getOperand(1)))
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec),
@@ -10951,8 +11781,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
// FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
// to match extract_elt for f64.
- unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- if (Idx == 0)
+ if (isNullConstant(Op.getOperand(1)))
return Op;
// UNPCKHPD the element to the lowest double word, then movsd.
@@ -11039,7 +11868,9 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
// Insert the element into the desired chunk.
unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
- unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128;
+ assert(isPowerOf2_32(NumEltsIn128));
+  // Since NumEltsIn128 is a power of 2 we can use a mask instead of modulo.
+ unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
DAG.getConstant(IdxIn128, dl, MVT::i32));
@@ -11078,8 +11909,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
// Bits [3:0] of the constant are the zero mask. The DAG Combiner may
// combine either bitwise AND or insert of float 0.0 to set these bits.
- const Function *F = DAG.getMachineFunction().getFunction();
- bool MinSize = F->hasFnAttribute(Attribute::MinSize);
+ bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
// If this is an insertion of 32-bits into the low 32-bits of
// a vector, we prefer to generate a blend with immediate rather
@@ -11199,14 +12029,25 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
// --> load32 addr
if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
- OpVT.is256BitVector() && SubVecVT.is128BitVector() &&
- !Subtarget->isUnalignedMem32Slow()) {
- SDValue SubVec2 = Vec.getOperand(1);
- if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) {
- if (Idx2->getZExtValue() == 0) {
- SDValue Ops[] = { SubVec2, SubVec };
- if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
- return Ld;
+ OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
+ auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
+ if (Idx2 && Idx2->getZExtValue() == 0) {
+ SDValue SubVec2 = Vec.getOperand(1);
+ // If needed, look through a bitcast to get to the load.
+ if (SubVec2.getNode() && SubVec2.getOpcode() == ISD::BITCAST)
+ SubVec2 = SubVec2.getOperand(0);
+
+ if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) {
+ bool Fast;
+ unsigned Alignment = FirstLd->getAlignment();
+ unsigned AS = FirstLd->getAddressSpace();
+ const X86TargetLowering *TLI = Subtarget->getTargetLowering();
+ if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
+ OpVT, AS, Alignment, &Fast) && Fast) {
+ SDValue Ops[] = { SubVec2, SubVec };
+ if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
+ return Ld;
+ }
}
}
}
@@ -11218,37 +12059,9 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
- if (OpVT.getVectorElementType() == MVT::i1) {
- if (IdxVal == 0 && Vec.getOpcode() == ISD::UNDEF) // the operation is legal
- return Op;
- SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
- SDValue Undef = DAG.getUNDEF(OpVT);
- unsigned NumElems = OpVT.getVectorNumElements();
- SDValue ShiftBits = DAG.getConstant(NumElems/2, dl, MVT::i8);
-
- if (IdxVal == OpVT.getVectorNumElements() / 2) {
- // Zero upper bits of the Vec
- Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
- Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
-
- SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
- SubVec, ZeroIdx);
- Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits);
- return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2);
- }
- if (IdxVal == 0) {
- SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
- SubVec, ZeroIdx);
- // Zero upper bits of the Vec2
- Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits);
- Vec2 = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec2, ShiftBits);
- // Zero lower bits of the Vec
- Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
- Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
- // Merge them together
- return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2);
- }
- }
+ if (OpVT.getVectorElementType() == MVT::i1)
+ return Insert1BitVector(Op, DAG);
+
return SDValue();
}
@@ -11363,7 +12176,8 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
// load.
if (isGlobalStubReference(OpFlag))
Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
- MachinePointerInfo::getGOT(), false, false, false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()),
+ false, false, false, 0);
return Result;
}
@@ -11430,7 +12244,8 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
// load.
if (isGlobalStubReference(OpFlags))
Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
- MachinePointerInfo::getGOT(), false, false, false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()),
+ false, false, false, 0);
// If there was a non-zero offset that we didn't fold, create an explicit
// addition for it.
@@ -11587,7 +12402,8 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
}
Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
- MachinePointerInfo::getGOT(), false, false, false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()),
+ false, false, false, 0);
}
// The address of the thread local variable is the add of the thread
@@ -11599,10 +12415,18 @@ SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+
+ // Cygwin uses emutls.
+ // FIXME: It may be EmulatedTLS-generic also for X86-Android.
+ if (Subtarget->isTargetWindowsCygwin())
+ return LowerToTLSEmulatedModel(GA, DAG);
+
const GlobalValue *GV = GA->getGlobal();
auto PtrVT = getPointerTy(DAG.getDataLayout());
if (Subtarget->isTargetELF()) {
+ if (DAG.getTarget().Options.EmulatedTLS)
+ return LowerToTLSEmulatedModel(GA, DAG);
TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
switch (model) {
case TLSModel::GeneralDynamic:
@@ -11830,10 +12654,10 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
auto PtrVT = getPointerTy(MF.getDataLayout());
int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
- SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
- StackSlot,
- MachinePointerInfo::getFixedStack(SSFI),
- false, false, 0);
+ SDValue Chain = DAG.getStore(
+ DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false,
+ false, 0);
return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
}
@@ -11855,10 +12679,9 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
MachineMemOperand *MMO;
if (FI) {
int SSFI = FI->getIndex();
- MMO =
- DAG.getMachineFunction()
- .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
- MachineMemOperand::MOLoad, ByteSize, ByteSize);
+ MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
+ MachineMemOperand::MOLoad, ByteSize, ByteSize);
} else {
MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
StackSlot = StackSlot.getOperand(1);
@@ -11884,16 +12707,16 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
SDValue Ops[] = {
Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
};
- MachineMemOperand *MMO =
- DAG.getMachineFunction()
- .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
- MachineMemOperand::MOStore, SSFISize, SSFISize);
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
+ MachineMemOperand::MOStore, SSFISize, SSFISize);
Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
Ops, Op.getValueType(), MMO);
- Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
- MachinePointerInfo::getFixedStack(SSFI),
- false, false, false, 0);
+ Result = DAG.getLoad(
+ Op.getValueType(), DL, Chain, StackSlot,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
+ false, false, false, 0);
}
return Result;
@@ -11937,16 +12760,19 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
// Load the 64-bit value into an XMM register.
SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
Op.getOperand(0));
- SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
- MachinePointerInfo::getConstantPool(),
- false, false, false, 16);
+ SDValue CLod0 =
+ DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ false, false, false, 16);
SDValue Unpck1 =
getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
- SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
- MachinePointerInfo::getConstantPool(),
- false, false, false, 16);
+ SDValue CLod1 =
+ DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ false, false, false, 16);
SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
+ // TODO: Are there any fast-math-flags to propagate here?
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
SDValue Result;
@@ -11996,10 +12822,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
// Subtract the bias.
+ // TODO: Are there any fast-math-flags to propagate here?
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
// Handle final rounding.
- EVT DestVT = Op.getValueType();
+ MVT DestVT = Op.getSimpleValueType();
if (DestVT.bitsLT(MVT::f64))
return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
@@ -12025,14 +12852,23 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
// float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
// return (float4) lo + fhi;
+ // We shouldn't use it when unsafe-fp-math is enabled though: we might later
+ // reassociate the two FADDs, and if we do that, the algorithm fails
+ // spectacularly (PR24512).
+ // FIXME: If we ever have some kind of Machine FMF, this should be marked
+ // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
+ // there's also the MachineCombiner reassociations happening on Machine IR.
+ if (DAG.getTarget().Options.UnsafeFPMath)
+ return SDValue();
+
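As a scalar illustration of the bit trick above (a sketch, not part of the patch; names are illustrative), one lane of the packed algorithm can be modelled like this — which also shows why reassociating the adds breaks it:

    #include <cstdint>
    #include <cstring>

    static float BitsToFloat(uint32_t Bits) {
      float F;
      std::memcpy(&F, &Bits, sizeof F);
      return F;
    }

    // Build 2^23 + (V & 0xffff) and 2^39 + (V >> 16) * 2^16 by stuffing the
    // halves into float mantissas, then cancel the biases. Every step except
    // the final add is exact.
    float UintToFloat(uint32_t V) {
      float Lo  = BitsToFloat(0x4B000000u | (V & 0xFFFFu)); // 0x1.0p23f | low half
      float Hi  = BitsToFloat(0x53000000u | (V >> 16));     // 0x1.0p39f | high half
      float FHi = Hi - (549755813888.0f + 8388608.0f);      // subtract 2^39 + 2^23
      return Lo + FHi; // reassociating the two adds loses the exactness (PR24512)
    }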
SDLoc DL(Op);
SDValue V = Op->getOperand(0);
- EVT VecIntVT = V.getValueType();
+ MVT VecIntVT = V.getSimpleValueType();
bool Is128 = VecIntVT == MVT::v4i32;
- EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
+ MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
// If we convert to something else than the supported type, e.g., to v4f64,
// abort early.
- if (VecFloatVT != Op->getValueType(0))
+ if (VecFloatVT != Op->getSimpleValueType(0))
return SDValue();
unsigned NumElts = VecIntVT.getVectorNumElements();
@@ -12070,7 +12906,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
SDValue Low, High;
if (Subtarget.hasSSE41()) {
- EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
+ MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
// uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
@@ -12108,6 +12944,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
// float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
+ // TODO: Are there any fast-math-flags to propagate here?
SDValue FHigh =
DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
// return (float4) lo + fhi;
@@ -12137,11 +12974,10 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
case MVT::v16i8:
case MVT::v16i16:
- if (Subtarget->hasAVX512())
- return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
- DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
+ assert(Subtarget->hasAVX512());
+ return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
+ DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
}
- llvm_unreachable(nullptr);
}
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
@@ -12150,7 +12986,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SDLoc dl(Op);
auto PtrVT = getPointerTy(DAG.getDataLayout());
- if (Op.getValueType().isVector())
+ if (Op.getSimpleValueType().isVector())
return lowerUINT_TO_FP_vec(Op, DAG);
// Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
@@ -12161,6 +12997,14 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
MVT SrcVT = N0.getSimpleValueType();
MVT DstVT = Op.getSimpleValueType();
+
+ if (Subtarget->hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
+ (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget->is64Bit()))) {
+ // Conversions from unsigned i32 to f32/f64 are legal,
+ // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
+ return Op;
+ }
+
if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
return LowerUINT_TO_FP_i64(Op, DAG);
if (SrcVT == MVT::i32 && X86ScalarSSEf64)
@@ -12193,10 +13037,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
// we must be careful to do the computation in x87 extended precision, not
// in SSE. (The generic code can't know it's OK to do this, or how to.)
int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
- MachineMemOperand *MMO =
- DAG.getMachineFunction()
- .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
- MachineMemOperand::MOLoad, 8, 8);
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
+ MachineMemOperand::MOLoad, 8, 8);
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
@@ -12223,24 +13066,52 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
// Load the value out, extending it from f32 to f80.
// FIXME: Avoid the extend by constructing the right constant pool?
- SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
- FudgePtr, MachinePointerInfo::getConstantPool(),
- MVT::f32, false, false, false, 4);
+ SDValue Fudge = DAG.getExtLoad(
+ ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
+ false, false, false, 4);
// Extend everything to 80 bits to force it to be done on x87.
+ // TODO: Are there any fast-math-flags to propagate here?
SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
DAG.getIntPtrConstant(0, dl));
}
+// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
+// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
+// just return an <SDValue(), SDValue()> pair.
+// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
+// to i16, i32 or i64, and we lower it to a legal sequence.
+// If lowered to the final integer result we return a <result, SDValue()> pair.
+// Otherwise we lower it to a sequence ending with a FIST, return a
+// <FIST, StackSlot> pair, and the caller is responsible for loading
+// the final integer result from StackSlot.
std::pair<SDValue,SDValue>
-X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
- bool IsSigned, bool IsReplace) const {
+X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
+ bool IsSigned, bool IsReplace) const {
SDLoc DL(Op);
EVT DstTy = Op.getValueType();
+ EVT TheVT = Op.getOperand(0).getValueType();
auto PtrVT = getPointerTy(DAG.getDataLayout());
- if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
+ if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
+ // f16 must be promoted before using the lowering in this routine.
+ // fp128 does not use this lowering.
+ return std::make_pair(SDValue(), SDValue());
+ }
+
+ // If using FIST to compute an unsigned i64, we'll need some fixup
+ // to handle values above the maximum signed i64. A FIST is always
+ // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
+ bool UnsignedFixup = !IsSigned &&
+ DstTy == MVT::i64 &&
+ (!Subtarget->is64Bit() ||
+ !isScalarFPTypeInSSEReg(TheVT));
+
+ if (!IsSigned && DstTy != MVT::i64 && !Subtarget->hasAVX512()) {
+ // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
+ // The low 32 bits of the fist result will have the correct uint32 result.
assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
DstTy = MVT::i64;
}
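A scalar sketch of why the FIST trick above works (illustrative only, not part of the patch): every uint32 value is non-negative and fits in an int64, so the signed 64-bit conversion never overflows and its low 32 bits are the unsigned result.

    #include <cstdint>

    uint32_t FpToUint32ViaSint64(double V) {
      int64_t Wide = static_cast<int64_t>(V); // what the 64-bit FIST computes
      return static_cast<uint32_t>(Wide);     // keep the low 32 bits
    }
    // e.g. FpToUint32ViaSint64(3000000000.0) == 3000000000u, a value that would
    // overflow a signed 32-bit conversion.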
@@ -12258,42 +13129,87 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
return std::make_pair(SDValue(), SDValue());
- // We lower FP->int64 either into FISTP64 followed by a load from a temporary
- // stack slot, or into the FTOL runtime function.
+ // We lower FP->int64 into FISTP64 followed by a load from a temporary
+ // stack slot.
MachineFunction &MF = DAG.getMachineFunction();
unsigned MemSize = DstTy.getSizeInBits()/8;
int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
unsigned Opc;
- if (!IsSigned && isIntegerTypeFTOL(DstTy))
- Opc = X86ISD::WIN_FTOL;
- else
- switch (DstTy.getSimpleVT().SimpleTy) {
- default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
- case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
- case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
- case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
- }
+ switch (DstTy.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
+ case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
+ case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
+ case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
+ }
SDValue Chain = DAG.getEntryNode();
SDValue Value = Op.getOperand(0);
- EVT TheVT = Op.getOperand(0).getValueType();
+ SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
+
+ if (UnsignedFixup) {
+ //
+ // Conversion to unsigned i64 is implemented with a select,
+ // depending on whether the source value fits in the range
+ // of a signed i64. Let Thresh be the FP equivalent of
+ // 0x8000000000000000ULL.
+ //
+ // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
+ // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
+ // Fist-to-mem64 FistSrc
+ // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
+ // to XOR'ing the high 32 bits with Adjust.
+ //
+ // Being a power of 2, Thresh is exactly representable in all FP formats.
+ // For X87 we'd like to use the smallest FP type for this constant, but
+ // for DAG type consistency we have to match the FP operand type.
+
+ APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000));
+ LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
+ bool LosesInfo = false;
+ if (TheVT == MVT::f64)
+ // The rounding mode is irrelevant as the conversion should be exact.
+ Status = Thresh.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven,
+ &LosesInfo);
+ else if (TheVT == MVT::f80)
+ Status = Thresh.convert(APFloat::x87DoubleExtended,
+ APFloat::rmNearestTiesToEven, &LosesInfo);
+
+ assert(Status == APFloat::opOK && !LosesInfo &&
+ "FP conversion should have been exact");
+
+ SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
+
+ SDValue Cmp = DAG.getSetCC(DL,
+ getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(), TheVT),
+ Value, ThreshVal, ISD::SETLT);
+ Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
+ DAG.getConstant(0, DL, MVT::i32),
+ DAG.getConstant(0x80000000, DL, MVT::i32));
+ SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
+ Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(), TheVT),
+ Value, ThreshVal, ISD::SETLT);
+ Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
+ }
+
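The fixup above corresponds to this scalar model (a sketch, assuming Value is in the unsigned i64 range; the nested cast stands in for the signed FIST):

    #include <cstdint>

    uint64_t FpToUint64(double Value) {
      const double Thresh = 9223372036854775808.0;           // 2^63, exact in FP
      uint32_t Adjust = (Value < Thresh) ? 0u : 0x80000000u;
      double FistSrc  = (Value < Thresh) ? Value : Value - Thresh;
      uint64_t Fist   = static_cast<uint64_t>(static_cast<int64_t>(FistSrc));
      // Adding back 0 or 2^63 is the same as XORing the high 32 bits with Adjust.
      return Fist ^ (static_cast<uint64_t>(Adjust) << 32);
    }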
// FIXME This causes a redundant load/store if the SSE-class value is already
// in memory, such as if it is on the callstack.
if (isScalarFPTypeInSSEReg(TheVT)) {
assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
Chain = DAG.getStore(Chain, DL, Value, StackSlot,
- MachinePointerInfo::getFixedStack(SSFI),
- false, false, 0);
+ MachinePointerInfo::getFixedStack(MF, SSFI), false,
+ false, 0);
SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
SDValue Ops[] = {
Chain, StackSlot, DAG.getValueType(TheVT)
};
MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
- MachineMemOperand::MOLoad, MemSize, MemSize);
+ MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
+ MachineMemOperand::MOLoad, MemSize, MemSize);
Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
Chain = Value.getValue(1);
SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
@@ -12301,28 +13217,52 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
}
MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
- MachineMemOperand::MOStore, MemSize, MemSize);
+ MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
+ MachineMemOperand::MOStore, MemSize, MemSize);
+
+ if (UnsignedFixup) {
+
+ // Insert the FIST, load its result as two i32's,
+ // and XOR the high i32 with Adjust.
+
+ SDValue FistOps[] = { Chain, Value, StackSlot };
+ SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
+ FistOps, DstTy, MMO);
+
+ SDValue Low32 = DAG.getLoad(MVT::i32, DL, FIST, StackSlot,
+ MachinePointerInfo(),
+ false, false, false, 0);
+ SDValue HighAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackSlot,
+ DAG.getConstant(4, DL, PtrVT));
- if (Opc != X86ISD::WIN_FTOL) {
+ SDValue High32 = DAG.getLoad(MVT::i32, DL, FIST, HighAddr,
+ MachinePointerInfo(),
+ false, false, false, 0);
+ High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
+
+ if (Subtarget->is64Bit()) {
+ // Join High32 and Low32 into a 64-bit result.
+ // (High32 << 32) | Low32
+ Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
+ High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
+ High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
+ DAG.getConstant(32, DL, MVT::i8));
+ SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
+ return std::make_pair(Result, SDValue());
+ }
+
+ SDValue ResultOps[] = { Low32, High32 };
+
+ SDValue pair = IsReplace
+ ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
+ : DAG.getMergeValues(ResultOps, DL);
+ return std::make_pair(pair, SDValue());
+ } else {
// Build the FP_TO_INT*_IN_MEM
SDValue Ops[] = { Chain, Value, StackSlot };
SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
Ops, DstTy, MMO);
return std::make_pair(FIST, StackSlot);
- } else {
- SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
- DAG.getVTList(MVT::Other, MVT::Glue),
- Chain, Value);
- SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
- MVT::i32, ftol.getValue(1));
- SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
- MVT::i32, eax.getValue(2));
- SDValue Ops[] = { eax, edx };
- SDValue pair = IsReplace
- ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
- : DAG.getMergeValues(Ops, DL);
- return std::make_pair(pair, SDValue());
}
}
@@ -12333,7 +13273,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
MVT InVT = In.getSimpleValueType();
SDLoc dl(Op);
- if (VT.is512BitVector() || InVT.getScalarType() == MVT::i1)
+ if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
// Optimize vectors in AVX mode:
@@ -12426,6 +13366,62 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
return SDValue();
}
+static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue In = Op.getOperand(0);
+ MVT InVT = In.getSimpleValueType();
+
+  assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
+
+ // Shift LSB to MSB and use VPMOVB2M - SKX.
+ unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
+ if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 &&
+ Subtarget->hasBWI()) || // legal, will go to VPMOVB2M, VPMOVW2M
+ ((InVT.is256BitVector() || InVT.is128BitVector()) &&
+ InVT.getScalarSizeInBits() <= 16 && Subtarget->hasBWI() &&
+ Subtarget->hasVLX())) { // legal, will go to VPMOVB2M, VPMOVW2M
+    // Shifting packed bytes is not supported natively; bitcast to dwords.
+ MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
+ SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
+ DAG.getBitcast(ExtVT, In),
+ DAG.getConstant(ShiftInx, DL, ExtVT));
+ ShiftNode = DAG.getBitcast(InVT, ShiftNode);
+ return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
+ }
+ if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 &&
+ Subtarget->hasDQI()) || // legal, will go to VPMOVD2M, VPMOVQ2M
+ ((InVT.is256BitVector() || InVT.is128BitVector()) &&
+ InVT.getScalarSizeInBits() >= 32 && Subtarget->hasDQI() &&
+ Subtarget->hasVLX())) { // legal, will go to VPMOVD2M, VPMOVQ2M
+
+ SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
+ DAG.getConstant(ShiftInx, DL, InVT));
+ return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
+ }
+
+ // Shift LSB to MSB, extend if necessary and use TESTM.
+ unsigned NumElts = InVT.getVectorNumElements();
+ if (InVT.getSizeInBits() < 512 &&
+ (InVT.getScalarType() == MVT::i8 || InVT.getScalarType() == MVT::i16 ||
+ !Subtarget->hasVLX())) {
+    assert((NumElts == 8 || NumElts == 16) && "Unexpected vector type.");
+
+    // TESTD/Q should be used (if BWI is supported we use CVT2MASK above),
+    // so the vector should be extended to packed dwords/qwords.
+ MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
+ In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
+ InVT = ExtVT;
+ ShiftInx = InVT.getScalarSizeInBits() - 1;
+ }
+
+ SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
+ DAG.getConstant(ShiftInx, DL, InVT));
+ return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
+}
+
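Per lane, the shift-LSB-to-MSB idea reduces to the following scalar model (illustrative only): the i1 truncation result is the low bit, and moving it into the sign bit lets sign-testing mask instructions (VPMOVB2M/W2M/D2M/Q2M, or a TESTM of the value against itself) read it.

    #include <cstdint>

    bool TruncLaneToI1(uint32_t Lane) {
      uint32_t Shifted = Lane << 31;        // move bit 0 into the sign bit
      return (Shifted & 0x80000000u) != 0;  // sign-bit test == (Lane & 1)
    }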
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
@@ -12443,42 +13439,17 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Invalid TRUNCATE operation");
- // move vector to mask - truncate solution for SKX
- if (VT.getVectorElementType() == MVT::i1) {
- if (InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 &&
- Subtarget->hasBWI())
- return Op; // legal, will go to VPMOVB2M, VPMOVW2M
- if ((InVT.is256BitVector() || InVT.is128BitVector())
- && InVT.getScalarSizeInBits() <= 16 &&
- Subtarget->hasBWI() && Subtarget->hasVLX())
- return Op; // legal, will go to VPMOVB2M, VPMOVW2M
- if (InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 &&
- Subtarget->hasDQI())
- return Op; // legal, will go to VPMOVD2M, VPMOVQ2M
- if ((InVT.is256BitVector() || InVT.is128BitVector())
- && InVT.getScalarSizeInBits() >= 32 &&
- Subtarget->hasDQI() && Subtarget->hasVLX())
- return Op; // legal, will go to VPMOVB2M, VPMOVQ2M
- }
- if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
- if (VT.getVectorElementType().getSizeInBits() >=8)
- return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
-
- assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
- unsigned NumElts = InVT.getVectorNumElements();
- assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
- if (InVT.getSizeInBits() < 512) {
- MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
- In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
- InVT = ExtVT;
- }
-
- SDValue OneV =
- DAG.getConstant(APInt::getSignBit(InVT.getScalarSizeInBits()), DL, InVT);
- SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
- return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
- }
+ if (VT.getVectorElementType() == MVT::i1)
+ return LowerTruncateVecI1(Op, DAG, Subtarget);
+ // vpmovqb/w/d, vpmovdb/w, vpmovwb
+ if (Subtarget->hasAVX512()) {
+ // word to byte only under BWI
+ if (InVT == MVT::v16i16 && !Subtarget->hasBWI()) // v16i16 -> v16i8
+ return DAG.getNode(X86ISD::VTRUNC, DL, VT,
+ DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
+ return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
+ }
if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
// On AVX2, v4i64 -> v4i32 becomes VPERMD.
if (Subtarget->hasInt256()) {
@@ -12583,7 +13554,8 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
/*IsSigned=*/ true, /*IsReplace=*/ false);
SDValue FIST = Vals.first, StackSlot = Vals.second;
// If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
- if (!FIST.getNode()) return Op;
+ if (!FIST.getNode())
+ return Op;
if (StackSlot.getNode())
// Load the result.
@@ -12600,7 +13572,9 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
/*IsSigned=*/ false, /*IsReplace=*/ false);
SDValue FIST = Vals.first, StackSlot = Vals.second;
- assert(FIST.getNode() && "Unexpected failure");
+ // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
+ if (!FIST.getNode())
+ return Op;
if (StackSlot.getNode())
// Load the result.
@@ -12643,6 +13617,8 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
+ bool IsF128 = (VT == MVT::f128);
+
// FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
// decide if we should generate a 16-byte constant mask when we only need 4 or
// 8 bytes for the scalar case.
@@ -12650,11 +13626,16 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
MVT LogicVT;
MVT EltVT;
unsigned NumElts;
-
+
if (VT.isVector()) {
LogicVT = VT;
EltVT = VT.getVectorElementType();
NumElts = VT.getVectorNumElements();
+ } else if (IsF128) {
+ // SSE instructions are used for optimized f128 logical operations.
+ LogicVT = MVT::f128;
+ EltVT = VT;
+ NumElts = 1;
} else {
// There are no scalar bitwise logical SSE/AVX instructions, so we
// generate a 16-byte vector constant and logic op even for the scalar case.
@@ -12675,9 +13656,10 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
- SDValue Mask = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
- MachinePointerInfo::getConstantPool(),
- false, false, false, Alignment);
+ SDValue Mask =
+ DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ false, false, false, Alignment);
SDValue Op0 = Op.getOperand(0);
bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
@@ -12685,7 +13667,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
- if (VT.isVector())
+ if (VT.isVector() || IsF128)
return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
// For the scalar case extend to a 128-bit vector, perform the logic op,
@@ -12704,6 +13686,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
MVT SrcVT = Op1.getSimpleValueType();
+ bool IsF128 = (VT == MVT::f128);
// If second operand is smaller, extend it first.
if (SrcVT.bitsLT(VT)) {
@@ -12718,13 +13701,16 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
// At this point the operands and the result should have the same
// type, and that won't be f80 since that is not custom lowered.
+ assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) &&
+ "Unexpected type in LowerFCOPYSIGN");
const fltSemantics &Sem =
- VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle;
+ VT == MVT::f64 ? APFloat::IEEEdouble :
+ (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle);
const unsigned SizeInBits = VT.getSizeInBits();
SmallVector<Constant *, 4> CV(
- VT == MVT::f64 ? 2 : 4,
+ VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4),
ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
// First, clear all bits but the sign bit from the second operand (sign).
@@ -12737,11 +13723,13 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
// Perform all logic operations as 16-byte vectors because there are no
// scalar FP logic instructions in SSE. This allows load folding of the
// constants into the logic instructions.
- MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
- SDValue Mask1 = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
- MachinePointerInfo::getConstantPool(),
- false, false, false, 16);
- Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1);
+ MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32);
+ SDValue Mask1 =
+ DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ false, false, false, 16);
+ if (!IsF128)
+ Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1);
SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1);
// Next, clear the sign bit from the first operand (magnitude).
@@ -12750,8 +13738,9 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
APFloat APF = Op0CN->getValueAPF();
// If the magnitude is a positive zero, the sign bit alone is enough.
if (APF.isPosZero())
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit,
- DAG.getIntPtrConstant(0, dl));
+ return IsF128 ? SignBit :
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit,
+ DAG.getIntPtrConstant(0, dl));
APF.clearSign();
CV[0] = ConstantFP::get(*Context, APF);
} else {
@@ -12761,18 +13750,21 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
}
C = ConstantVector::get(CV);
CPIdx = DAG.getConstantPool(C, PtrVT, 16);
- SDValue Val = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
- MachinePointerInfo::getConstantPool(),
- false, false, false, 16);
+ SDValue Val =
+ DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ false, false, false, 16);
// If the magnitude operand wasn't a constant, we need to AND out the sign.
if (!isa<ConstantFPSDNode>(Op0)) {
- Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0);
+ if (!IsF128)
+ Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0);
Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val);
}
// OR the magnitude value with the sign bit.
Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val,
- DAG.getIntPtrConstant(0, dl));
+ return IsF128 ? Val :
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val,
+ DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
@@ -12859,7 +13851,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
return SDValue();
}
- EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+ MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
// Cast all vectors into TestVT for PTEST.
for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
@@ -12999,14 +13991,14 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
if (ConstantSDNode *C =
dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
// An add of one will be selected as an INC.
- if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) {
+ if (C->isOne() && !Subtarget->slowIncDec()) {
Opcode = X86ISD::INC;
NumOperands = 1;
break;
}
// An add of negative one (subtract of one) will be selected as a DEC.
- if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) {
+ if (C->isAllOnesValue() && !Subtarget->slowIncDec()) {
Opcode = X86ISD::DEC;
NumOperands = 1;
break;
@@ -13135,13 +14127,11 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
/// equivalent.
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
SDLoc dl, SelectionDAG &DAG) const {
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
- if (C->getAPIntValue() == 0)
- return EmitTest(Op0, X86CC, dl, DAG);
+ if (isNullConstant(Op1))
+ return EmitTest(Op0, X86CC, dl, DAG);
- if (Op0.getValueType() == MVT::i1)
- llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
- }
+ assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
+ "Unexpected comparison operation for MVT::i1 operands");
if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
@@ -13150,8 +14140,7 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
// if we're optimizing for size, however, as that'll allow better folding
// of memory operations.
if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
- !DAG.getMachineFunction().getFunction()->hasFnAttribute(
- Attribute::MinSize) &&
+ !DAG.getMachineFunction().getFunction()->optForMinSize() &&
!Subtarget->isAtom()) {
unsigned ExtendOp =
isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
@@ -13188,6 +14177,9 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
DAG.getConstant(8, dl, MVT::i8));
SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
+
+ // Some 64-bit targets lack SAHF support, but they do support FCOMI.
+ assert(Subtarget->hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}
@@ -13261,13 +14253,8 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
/// This is because we still need one division to calculate the reciprocal and
/// then we need two multiplies by that reciprocal as replacements for the
/// original divisions.
-bool X86TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const {
- return NumUsers > 1;
-}
-
-static bool isAllOnes(SDValue V) {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
- return C && C->isAllOnesValue();
+unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
+ return 2;
}
/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
@@ -13285,8 +14272,7 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
if (Op1.getOpcode() == ISD::SHL)
std::swap(Op0, Op1);
if (Op0.getOpcode() == ISD::SHL) {
- if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
- if (And00C->getZExtValue() == 1) {
+ if (isOneConstant(Op0.getOperand(0))) {
// If we looked past a truncate, check that it's only truncating away
// known zeros.
unsigned BitWidth = Op0.getValueSizeInBits();
@@ -13423,7 +14409,7 @@ static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
- assert(Op0.getValueType().getVectorElementType() == MVT::i1 &&
+ assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
"Unexpected type for boolean compare operation");
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
@@ -13467,8 +14453,8 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
- assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 &&
- Op.getValueType().getScalarType() == MVT::i1 &&
+ assert(Op0.getSimpleValueType().getVectorElementType().getSizeInBits() >= 8 &&
+ Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
"Cannot set masked compare for this operation");
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
@@ -13515,7 +14501,7 @@ static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
for (unsigned i = 0; i < n; ++i) {
ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
- if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT)
+ if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
return SDValue();
// Avoid underflow.
@@ -13606,13 +14592,13 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
if (VT.is256BitVector() && !Subtarget->hasInt256())
return Lower256IntVSETCC(Op, DAG);
- EVT OpVT = Op1.getValueType();
+ MVT OpVT = Op1.getSimpleValueType();
if (OpVT.getVectorElementType() == MVT::i1)
return LowerBoolVSETCC_AVX512(Op, DAG);
bool MaskResult = (VT.getVectorElementType() == MVT::i1);
if (Subtarget->hasAVX512()) {
- if (Op1.getValueType().is512BitVector() ||
+ if (Op1.getSimpleValueType().is512BitVector() ||
(Subtarget->hasBWI() && Subtarget->hasVLX()) ||
(MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);
@@ -13628,6 +14614,33 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
}
+ // Lower using XOP integer comparisons.
+ if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
+ VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget->hasXOP()) {
+ // Translate compare code to XOP PCOM compare mode.
+ unsigned CmpMode = 0;
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Unexpected SETCC condition");
+ case ISD::SETULT:
+ case ISD::SETLT: CmpMode = 0x00; break;
+ case ISD::SETULE:
+ case ISD::SETLE: CmpMode = 0x01; break;
+ case ISD::SETUGT:
+ case ISD::SETGT: CmpMode = 0x02; break;
+ case ISD::SETUGE:
+ case ISD::SETGE: CmpMode = 0x03; break;
+ case ISD::SETEQ: CmpMode = 0x04; break;
+ case ISD::SETNE: CmpMode = 0x05; break;
+ }
+
+ // Are we comparing unsigned or signed integers?
+ unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
+ ? X86ISD::VPCOMU : X86ISD::VPCOM;
+
+ return DAG.getNode(Opc, dl, VT, Op0, Op1,
+ DAG.getConstant(CmpMode, dl, MVT::i8));
+ }
+
// We are handling one of the integer comparisons here. Since SSE only has
// GT and EQ comparisons for integer, swapping operands and multiple
// operations may be required for some comparisons.
@@ -13777,7 +14790,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations.
if (FlipSigns) {
- EVT EltVT = VT.getVectorElementType();
+ MVT EltVT = VT.getVectorElementType();
SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl,
VT);
Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
@@ -13818,11 +14831,9 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
// Lower ((X >>u N) & 1) != 0 to BT(X, N).
// Lower ((X >>s N) & 1) != 0 to BT(X, N).
if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
- Op1.getOpcode() == ISD::Constant &&
- cast<ConstantSDNode>(Op1)->isNullValue() &&
+ isNullConstant(Op1) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
- SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
- if (NewSetCC.getNode()) {
+ if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
if (VT == MVT::i1)
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
return NewSetCC;
@@ -13831,17 +14842,14 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
// Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
// these.
- if (Op1.getOpcode() == ISD::Constant &&
- (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
- cast<ConstantSDNode>(Op1)->isNullValue()) &&
+ if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
// If the input is a setcc, then reuse the input setcc or use a new one with
// the inverted condition.
if (Op0.getOpcode() == X86ISD::SETCC) {
X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
- bool Invert = (CC == ISD::SETNE) ^
- cast<ConstantSDNode>(Op1)->isNullValue();
+ bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
if (!Invert)
return Op0;
@@ -13854,8 +14862,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
return SetCC;
}
}
- if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) &&
- (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) &&
+ if ((Op0.getValueType() == MVT::i1) && isOneConstant(Op1) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
@@ -13876,6 +14883,23 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
return SetCC;
}
+SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ SDValue Carry = Op.getOperand(2);
+ SDValue Cond = Op.getOperand(3);
+ SDLoc DL(Op);
+
+ assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
+ X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
+
+ assert(Carry.getOpcode() != ISD::CARRY_FALSE);
+ SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+ SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
+ return DAG.getNode(X86ISD::SETCC, DL, Op.getValueType(),
+ DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1));
+}
+
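SETCCE is what lets a compare wider than the native word be expanded into a SUB/SBB chain; a scalar model of the unsigned-less-than case (illustrative only, assuming a hypothetical 32-bit word size):

    #include <cstdint>

    bool ULess64On32Bit(uint32_t LoA, uint32_t HiA, uint32_t LoB, uint32_t HiB) {
      bool Borrow = LoA < LoB;                                        // SUB of low halves
      uint64_t Hi = (uint64_t)HiA - (uint64_t)HiB - (Borrow ? 1 : 0); // SBB of high halves
      return (Hi >> 32) != 0;                                         // borrow out => A < B
    }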
// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
static bool isX86LogicalCmp(SDValue Op) {
unsigned Opc = Op.getNode()->getOpcode();
@@ -13918,7 +14942,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Op1 = Op.getOperand(1);
SDValue Op2 = Op.getOperand(2);
SDLoc DL(Op);
- EVT VT = Op1.getValueType();
+ MVT VT = Op1.getSimpleValueType();
SDValue CC;
// Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
@@ -13927,7 +14951,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (Cond.getOpcode() == ISD::SETCC &&
((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
(Subtarget->hasSSE1() && VT == MVT::f32)) &&
- VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
+ VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
int SSECC = translateX86FSETCC(
cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
@@ -13961,12 +14985,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// Convert to vectors, do a VSELECT, and convert back to scalar.
// All of the conversions should be optimized away.
- EVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
+ MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
- EVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
+ MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
VCmp = DAG.getBitcast(VCmpVT, VCmp);
SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
@@ -13980,26 +15004,26 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
}
- if (VT.isVector() && VT.getScalarType() == MVT::i1) {
- SDValue Op1Scalar;
- if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
- Op1Scalar = ConvertI1VectorToInterger(Op1, DAG);
- else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
- Op1Scalar = Op1.getOperand(0);
- SDValue Op2Scalar;
- if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
- Op2Scalar = ConvertI1VectorToInterger(Op2, DAG);
- else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
- Op2Scalar = Op2.getOperand(0);
- if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
- SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
- Op1Scalar.getValueType(),
- Cond, Op1Scalar, Op2Scalar);
- if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
- return DAG.getBitcast(VT, newSelect);
- SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
- DAG.getIntPtrConstant(0, DL));
+ if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
+ SDValue Op1Scalar;
+ if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
+ Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
+ else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
+ Op1Scalar = Op1.getOperand(0);
+ SDValue Op2Scalar;
+ if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
+ Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
+ else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
+ Op2Scalar = Op2.getOperand(0);
+ if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
+ SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
+ Op1Scalar.getValueType(),
+ Cond, Op1Scalar, Op2Scalar);
+ if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
+ return DAG.getBitcast(VT, newSelect);
+ SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
+ DAG.getIntPtrConstant(0, DL));
}
}
@@ -14026,22 +15050,21 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
if (Cond.getOpcode() == X86ISD::SETCC &&
Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
- isZero(Cond.getOperand(1).getOperand(1))) {
+ isNullConstant(Cond.getOperand(1).getOperand(1))) {
SDValue Cmp = Cond.getOperand(1);
unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
- if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
+ if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
- SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
+ SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
SDValue CmpOp0 = Cmp.getOperand(0);
// Apply further optimizations for special cases
// (select (x != 0), -1, 0) -> neg & sbb
// (select (x == 0), 0, -1) -> neg & sbb
- if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
- if (YC->isNullValue() &&
- (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
+ if (isNullConstant(Y) &&
+ (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
DAG.getConstant(0, DL,
@@ -14061,11 +15084,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
- if (isAllOnes(Op1) != (CondCode == X86::COND_E))
+ if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
Res = DAG.getNOT(DL, Res, Res.getValueType());
- ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
- if (!N2C || !N2C->isNullValue())
+ if (!isNullConstant(Op2))
Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
return Res;
}
@@ -14073,11 +15095,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// Look past (and (setcc_carry (cmp ...)), 1).
if (Cond.getOpcode() == ISD::AND &&
- Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
- if (C && C->getAPIntValue() == 1)
- Cond = Cond.getOperand(0);
- }
+ Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
+ isOneConstant(Cond.getOperand(1)))
+ Cond = Cond.getOperand(0);
// If condition flag is set by a X86ISD::CMP, then use it as the condition
// setting operand in place of the X86ISD::SETCC.
@@ -14136,15 +15156,14 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
if (addTest) {
- // Look pass the truncate if the high bits are known zero.
+ // Look past the truncate if the high bits are known zero.
if (isTruncWithZeroHighBitsInput(Cond, DAG))
- Cond = Cond.getOperand(0);
+ Cond = Cond.getOperand(0);
// We know the result of AND is compared against zero. Try to match
// it to BT.
if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
- SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
- if (NewSetCC.getNode()) {
+ if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
CC = NewSetCC.getOperand(0);
Cond = NewSetCC.getOperand(1);
addTest = false;
@@ -14166,11 +15185,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
- (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
+ (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
+ (isNullConstant(Op1) || isNullConstant(Op2))) {
SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
DAG.getConstant(X86::COND_B, DL, MVT::i8),
Cond);
- if (isAllOnes(Op1) != (CondCode == X86::COND_B))
+ if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
return DAG.getNOT(DL, Res, Res.getValueType());
return Res;
}
@@ -14256,8 +15276,8 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
MVT InVT = In.getSimpleValueType();
assert(VT.getSizeInBits() == InVT.getSizeInBits());
- MVT InSVT = InVT.getScalarType();
- assert(VT.getScalarType().getScalarSizeInBits() > InSVT.getScalarSizeInBits());
+ MVT InSVT = InVT.getVectorElementType();
+ assert(VT.getVectorElementType().getSizeInBits() > InSVT.getSizeInBits());
if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
return SDValue();
@@ -14276,7 +15296,7 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
// As SRAI is only available on i16/i32 types, we expand only up to i32
// and handle i64 separately.
- while (CurrVT != VT && CurrVT.getScalarType() != MVT::i32) {
+ while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
@@ -14286,7 +15306,7 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
SDValue SignExt = Curr;
if (CurrVT != InVT) {
unsigned SignExtShift =
- CurrVT.getScalarSizeInBits() - InSVT.getScalarSizeInBits();
+ CurrVT.getVectorElementType().getSizeInBits() - InSVT.getSizeInBits();
SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
DAG.getConstant(SignExtShift, dl, MVT::i8));
}
@@ -14346,7 +15366,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
- MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
+ MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
VT.getVectorNumElements()/2);
OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
@@ -14470,7 +15490,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
// memory. In practice, we ''widen'' MemVT.
EVT WideVecVT =
EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
- loadRegZize / MemVT.getScalarType().getSizeInBits());
+ loadRegZize / MemVT.getScalarSizeInBits());
assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
"Invalid vector type");
@@ -14518,29 +15538,12 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
return Sext;
}
- // Otherwise we'll shuffle the small elements in the high bits of the
- // larger type and perform an arithmetic shift. If the shift is not legal
- // it's better to scalarize.
- assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) &&
- "We can't implement a sext load without an arithmetic right shift!");
-
- // Redistribute the loaded elements into the different locations.
- SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
- for (unsigned i = 0; i != NumElems; ++i)
- ShuffleVec[i * SizeRatio + SizeRatio - 1] = i;
-
- SDValue Shuff = DAG.getVectorShuffle(
- WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
-
- Shuff = DAG.getBitcast(RegVT, Shuff);
-
- // Build the arithmetic shift.
- unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
- MemVT.getVectorElementType().getSizeInBits();
- Shuff =
- DAG.getNode(ISD::SRA, dl, RegVT, Shuff,
- DAG.getConstant(Amt, dl, RegVT));
+ // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
+ // lanes.
+ assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
+ "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
+ SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
return Shuff;
}
@@ -14577,11 +15580,9 @@ static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
static bool isXor1OfSetCC(SDValue Op) {
if (Op.getOpcode() != ISD::XOR)
return false;
- ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
- if (N1C && N1C->getAPIntValue() == 1) {
+ if (isOneConstant(Op.getOperand(1)))
return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
- Op.getOperand(0).hasOneUse();
- }
+ Op.getOperand(0).hasOneUse();
return false;
}
@@ -14597,8 +15598,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
if (Cond.getOpcode() == ISD::SETCC) {
// Check for setcc([su]{add,sub,mul}o == 0).
if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
- isa<ConstantSDNode>(Cond.getOperand(1)) &&
- cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
+ isNullConstant(Cond.getOperand(1)) &&
Cond.getOperand(0).getResNo() == 1 &&
(Cond.getOperand(0).getOpcode() == ISD::SADDO ||
Cond.getOperand(0).getOpcode() == ISD::UADDO ||
@@ -14625,11 +15625,9 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  // Look past (and (setcc_carry (cmp ...)), 1).
if (Cond.getOpcode() == ISD::AND &&
- Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
- if (C && C->getAPIntValue() == 1)
- Cond = Cond.getOperand(0);
- }
+ Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
+ isOneConstant(Cond.getOperand(1)))
+ Cond = Cond.getOperand(0);
// If condition flag is set by a X86ISD::CMP, then use it as the condition
// setting operand in place of the X86ISD::SETCC.
@@ -14673,16 +15671,14 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
switch (CondOpcode) {
case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
case ISD::SADDO:
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
- if (C->isOne()) {
+ if (isOneConstant(RHS)) {
X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
break;
}
X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
case ISD::SSUBO:
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
- if (C->isOne()) {
+ if (isOneConstant(RHS)) {
X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
break;
}
@@ -14844,8 +15840,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
// We know the result of AND is compared against zero. Try to match
// it to BT.
if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
- SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
- if (NewSetCC.getNode()) {
+ if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
CC = NewSetCC.getOperand(0);
Cond = NewSetCC.getOperand(1);
addTest = false;
@@ -14877,54 +15872,40 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SplitStack;
SDLoc dl(Op);
+ // Get the inputs.
+ SDNode *Node = Op.getNode();
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+ unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ EVT VT = Node->getValueType(0);
+
+ // Chain the dynamic stack allocation so that it doesn't modify the stack
+ // pointer when other instructions are using the stack.
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
+
+ bool Is64Bit = Subtarget->is64Bit();
+ MVT SPTy = getPointerTy(DAG.getDataLayout());
+
+ SDValue Result;
if (!Lower) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- SDNode* Node = Op.getNode();
-
unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
- " not tell us which reg is the stack pointer!");
+ " not tell us which reg is the stack pointer!");
EVT VT = Node->getValueType(0);
- SDValue Tmp1 = SDValue(Node, 0);
- SDValue Tmp2 = SDValue(Node, 1);
SDValue Tmp3 = Node->getOperand(2);
- SDValue Chain = Tmp1.getOperand(0);
-
- // Chain the dynamic stack allocation so that it doesn't modify the stack
- // pointer when other instructions are using the stack.
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true),
- SDLoc(Node));
- SDValue Size = Tmp2.getOperand(1);
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1);
unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
unsigned StackAlign = TFI.getStackAlignment();
- Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
+ Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
if (Align > StackAlign)
- Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
- DAG.getConstant(-(uint64_t)Align, dl, VT));
- Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
-
- Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
- DAG.getIntPtrConstant(0, dl, true), SDValue(),
- SDLoc(Node));
-
- SDValue Ops[2] = { Tmp1, Tmp2 };
- return DAG.getMergeValues(Ops, dl);
- }
-
- // Get the inputs.
- SDValue Chain = Op.getOperand(0);
- SDValue Size = Op.getOperand(1);
- unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
- EVT VT = Op.getNode()->getValueType(0);
-
- bool Is64Bit = Subtarget->is64Bit();
- MVT SPTy = getPointerTy(DAG.getDataLayout());
-
- if (SplitStack) {
+ Result = DAG.getNode(ISD::AND, dl, VT, Result,
+ DAG.getConstant(-(uint64_t)Align, dl, VT));
+ Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
+ } else if (SplitStack) {
MachineRegisterInfo &MRI = MF.getRegInfo();
if (Is64Bit) {
@@ -14942,10 +15923,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
- SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
+ Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
DAG.getRegister(Vreg, SPTy));
- SDValue Ops1[2] = { Value, Chain };
- return DAG.getMergeValues(Ops1, dl);
} else {
SDValue Flag;
const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX);
@@ -14967,9 +15946,14 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
}
- SDValue Ops1[2] = { SP, Chain };
- return DAG.getMergeValues(Ops1, dl);
+ Result = SP;
}
+
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
+ DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
+
+ SDValue Ops[2] = {Result, Chain};
+ return DAG.getMergeValues(Ops, dl);
}
SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
@@ -14980,7 +15964,8 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
SDLoc DL(Op);
- if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
+ if (!Subtarget->is64Bit() ||
+ Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
@@ -15019,10 +16004,11 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MemOps.push_back(Store);
// Store ptr to reg_save_area.
- FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(8, DL));
+ FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
+ Subtarget->isTarget64BitLP64() ? 8 : 4, DL));
SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
- Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
- MachinePointerInfo(SV, 16), false, false, 0);
+ Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo(
+ SV, Subtarget->isTarget64BitLP64() ? 16 : 12), false, false, 0);
MemOps.push_back(Store);
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
@@ -15030,10 +16016,13 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
assert(Subtarget->is64Bit() &&
"LowerVAARG only handles 64-bit va_arg!");
- assert((Subtarget->isTargetLinux() ||
- Subtarget->isTargetDarwin()) &&
- "Unhandled target in LowerVAARG");
assert(Op.getNode()->getNumOperands() == 4);
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv()))
+ // The Win64 ABI uses char* instead of a structure.
+ return DAG.expandVAArg(Op.getNode());
+
SDValue Chain = Op.getOperand(0);
SDValue SrcPtr = Op.getOperand(1);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
@@ -15061,8 +16050,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
if (ArgMode == 2) {
// Sanity Check: Make sure using fp_offset makes sense.
assert(!Subtarget->useSoftFloat() &&
- !(DAG.getMachineFunction().getFunction()->hasFnAttribute(
- Attribute::NoImplicitFloat)) &&
+ !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
Subtarget->hasSSE1());
}
@@ -15091,8 +16079,14 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
+ // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
+ // where a va_list is still an i8*.
assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
+ if (Subtarget->isCallingConvWin64(
+ DAG.getMachineFunction().getFunction()->getCallingConv()))
+ // Probably a Win64 va_copy.
+ return DAG.expandVACopy(Op.getNode());
+
SDValue Chain = Op.getOperand(0);
SDValue DstPtr = Op.getOperand(1);
SDValue SrcPtr = Op.getOperand(2);
@@ -15230,72 +16224,126 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
// The return type has to be a 128-bit type with the same element
// type as the input type.
MVT EltVT = VT.getVectorElementType();
- EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
+ MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
ShAmt = DAG.getBitcast(ShVT, ShAmt);
return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}
-/// \brief Return (and \p Op, \p Mask) for compare instructions or
-/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
-/// necessary casting for \p Mask when lowering masking intrinsics.
-static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
- SDValue PreservedSrc,
- const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
- EVT VT = Op.getValueType();
- EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
- MVT::i1, VT.getVectorNumElements());
- EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
- Mask.getValueType().getSizeInBits());
- SDLoc dl(Op);
+/// \brief Return Mask with the necessary casting or extending
+/// for \p Mask according to \p MaskVT when lowering masking intrinsics
+static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG, SDLoc dl) {
- assert(MaskVT.isSimple() && "invalid mask type");
+ if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
+ // Mask should be extended
+ Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
+ MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
+ }
- if (isAllOnes(Mask))
- return Op;
+ if (Mask.getSimpleValueType() == MVT::i64 && Subtarget->is32Bit()) {
+ if (MaskVT == MVT::v64i1) {
+ assert(Subtarget->hasBWI() && "Expected AVX512BW target!");
+      // In 32-bit mode a bitcast of i64 is illegal; extend/split it.
+ SDValue Lo, Hi;
+ Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
+ DAG.getConstant(0, dl, MVT::i32));
+ Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
+ DAG.getConstant(1, dl, MVT::i32));
+
+ Lo = DAG.getBitcast(MVT::v32i1, Lo);
+ Hi = DAG.getBitcast(MVT::v32i1, Hi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
+ } else {
+      // MaskVT requires < 64 bits. Truncate the mask (should always succeed)
+      // and bitcast.
+ MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
+ return DAG.getBitcast(MaskVT,
+ DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
+ }
+
+ } else {
+ MVT BitcastVT = MVT::getVectorVT(MVT::i1,
+ Mask.getSimpleValueType().getSizeInBits());
// In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
// are extracted by EXTRACT_SUBVECTOR.
- SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
- DAG.getBitcast(BitcastVT, Mask),
- DAG.getIntPtrConstant(0, dl));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+ DAG.getBitcast(BitcastVT, Mask),
+ DAG.getIntPtrConstant(0, dl));
+ }
+}
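
The v64i1 path above depends on splitting the i64 mask into its two 32-bit halves and concatenating the resulting v32i1 values with the low half first. A scalar sketch of that invariant, using an arbitrary example mask rather than DAG nodes:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Mask = 0xF00DFACE12345678ULL;
  uint32_t Lo = static_cast<uint32_t>(Mask);        // EXTRACT_ELEMENT 0
  uint32_t Hi = static_cast<uint32_t>(Mask >> 32);  // EXTRACT_ELEMENT 1
  // CONCAT_VECTORS(Lo, Hi) keeps Lo in the low mask bits.
  uint64_t Concat = (static_cast<uint64_t>(Hi) << 32) | Lo;
  assert(Concat == Mask);
  return 0;
}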
- switch (Op.getOpcode()) {
- default: break;
- case X86ISD::PCMPEQM:
- case X86ISD::PCMPGTM:
- case X86ISD::CMPM:
- case X86ISD::CMPMU:
- return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
- }
- if (PreservedSrc.getOpcode() == ISD::UNDEF)
- PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
- return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
+/// \brief Return (and \p Op, \p Mask) for compare instructions or
+/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
+/// necessary casting or extending for \p Mask when lowering masking intrinsics
+static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
+ SDValue PreservedSrc,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ unsigned OpcodeSelect = ISD::VSELECT;
+ SDLoc dl(Op);
+
+ if (isAllOnesConstant(Mask))
+ return Op;
+
+ SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+
+ switch (Op.getOpcode()) {
+ default: break;
+ case X86ISD::PCMPEQM:
+ case X86ISD::PCMPGTM:
+ case X86ISD::CMPM:
+ case X86ISD::CMPMU:
+ return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
+ case X86ISD::VFPCLASS:
+ case X86ISD::VFPCLASSS:
+ return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
+ case X86ISD::VTRUNC:
+ case X86ISD::VTRUNCS:
+ case X86ISD::VTRUNCUS:
+ // We can't use ISD::VSELECT here because it is not always "Legal"
+    // for the destination type. For example vpmovqb requires only AVX512,
+    // while a vselect that can operate on byte element types requires BWI.
+ OpcodeSelect = X86ISD::SELECT;
+ break;
+ }
+ if (PreservedSrc.getOpcode() == ISD::UNDEF)
+ PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
+ return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
}
/// \brief Creates an SDNode for a predicated scalar operation.
/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
-/// The mask is comming as MVT::i8 and it should be truncated
+/// The mask is coming as MVT::i8 and it should be truncated
/// to MVT::i1 while lowering masking intrinsics.
/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
-/// "X86select" instead of "vselect". We just can't create the "vselect" node for
-/// a scalar instruction.
+/// "X86select" instead of "vselect". We just can't create the "vselect" node
+/// for a scalar instruction.
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- if (isAllOnes(Mask))
- return Op;
+ if (isAllOnesConstant(Mask))
+ return Op;
- EVT VT = Op.getValueType();
- SDLoc dl(Op);
- // The mask should be of type MVT::i1
- SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
+ MVT VT = Op.getSimpleValueType();
+ SDLoc dl(Op);
+ // The mask should be of type MVT::i1
+ SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
- if (PreservedSrc.getOpcode() == ISD::UNDEF)
- PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
- return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
+ if (Op.getOpcode() == X86ISD::FSETCC)
+ return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
+ if (Op.getOpcode() == X86ISD::VFPCLASS ||
+ Op.getOpcode() == X86ISD::VFPCLASSS)
+ return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
+
+ if (PreservedSrc.getOpcode() == ISD::UNDEF)
+ PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
+ return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
}
static int getSEHRegistrationNodeSize(const Function *Fn) {
@@ -15309,15 +16357,16 @@ static int getSEHRegistrationNodeSize(const Function *Fn) {
case EHPersonality::MSVC_CXX: return 16;
default: break;
}
- report_fatal_error("can only recover FP for MSVC EH personality functions");
+ report_fatal_error(
+ "can only recover FP for 32-bit MSVC EH personality functions");
}
-/// When the 32-bit MSVC runtime transfers control to us, either to an outlined
+/// When the MSVC runtime transfers control to us, either to an outlined
/// function or when returning to a parent frame after catching an exception, we
/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
/// Here's the math:
/// RegNodeBase = EntryEBP - RegNodeSize
-/// ParentFP = RegNodeBase - RegNodeFrameOffset
+/// ParentFP = RegNodeBase - ParentFrameOffset
/// Subtracting RegNodeSize takes us to the offset of the registration node, and
/// subtracting the offset (negative on x86) takes us back to the parent FP.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
@@ -15334,29 +16383,35 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
if (!Fn->hasPersonalityFn())
return EntryEBP;
- int RegNodeSize = getSEHRegistrationNodeSize(Fn);
-
// Get an MCSymbol that will ultimately resolve to the frame offset of the EH
- // registration.
+ // registration, or the .set_setframe offset.
MCSymbol *OffsetSym =
MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
GlobalValue::getRealLinkageName(Fn->getName()));
SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
- SDValue RegNodeFrameOffset =
+ SDValue ParentFrameOffset =
DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
+ // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
+ // prologue to RBP in the parent function.
+ const X86Subtarget &Subtarget =
+ static_cast<const X86Subtarget &>(DAG.getSubtarget());
+ if (Subtarget.is64Bit())
+ return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
+
+ int RegNodeSize = getSEHRegistrationNodeSize(Fn);
// RegNodeBase = EntryEBP - RegNodeSize
- // ParentFP = RegNodeBase - RegNodeFrameOffset
+ // ParentFP = RegNodeBase - ParentFrameOffset
SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
DAG.getConstant(RegNodeSize, dl, PtrVT));
- return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, RegNodeFrameOffset);
+ return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
}
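
A worked instance of the math in the comment above, with illustrative numbers (the EntryEBP, RegNodeSize, and ParentFrameOffset values are hypothetical, not taken from a real frame; RegNodeSize = 16 matches the MSVC_CXX case of getSEHRegistrationNodeSize):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t EntryEBP = 0x1000;       // incoming EBP in the 32-bit funclet
  uint32_t RegNodeSize = 16;        // e.g. the MSVC CXX registration node
  int32_t ParentFrameOffset = -8;   // negative on x86, as noted above
  uint32_t RegNodeBase = EntryEBP - RegNodeSize;        // 0x0FF0
  uint32_t ParentFP = RegNodeBase - ParentFrameOffset;  // 0x0FF8
  assert(RegNodeBase == 0x0FF0 && ParentFP == 0x0FF8);
  return 0;
}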
static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
if (IntrData) {
switch(IntrData->Type) {
@@ -15365,6 +16420,9 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
case INTR_TYPE_2OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2));
+ case INTR_TYPE_2OP_IMM8:
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
+ DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(2)));
case INTR_TYPE_3OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));
@@ -15376,28 +16434,53 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue RoundingMode;
+      // We always add the rounding mode to the node.
+ // If the rounding mode is not specified, we add the
+ // "current direction" mode.
if (Op.getNumOperands() == 4)
- RoundingMode = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
+ RoundingMode =
+ DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
else
RoundingMode = Op.getOperand(4);
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
- if (IntrWithRoundingModeOpcode != 0) {
- unsigned Round = cast<ConstantSDNode>(RoundingMode)->getZExtValue();
- if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION)
+ if (IntrWithRoundingModeOpcode != 0)
+ if (cast<ConstantSDNode>(RoundingMode)->getZExtValue() !=
+ X86::STATIC_ROUNDING::CUR_DIRECTION)
return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
dl, Op.getValueType(), Src, RoundingMode),
Mask, PassThru, Subtarget, DAG);
- }
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
RoundingMode),
Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_1OP_MASK: {
SDValue Src = Op.getOperand(1);
- SDValue Passthru = Op.getOperand(2);
+ SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
+      // We add the rounding mode to the node when
+ // - RM Opcode is specified and
+ // - RM is not "current direction".
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(4);
+ unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
+ if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Src, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ }
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
- Mask, Passthru, Subtarget, DAG);
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_SCALAR_MASK: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue passThru = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
+ Mask, passThru, Subtarget, DAG);
}
case INTR_TYPE_SCALAR_MASK_RM: {
SDValue Src1 = Op.getOperand(1);
@@ -15405,7 +16488,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
SDValue Src0 = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
// There are 2 kinds of intrinsics in this group:
- // (1) With supress-all-exceptions (sae) or rounding mode- 6 operands
+ // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
// (2) With rounding mode and sae - 7 operands.
if (Op.getNumOperands() == 6) {
SDValue Sae = Op.getOperand(5);
@@ -15421,11 +16504,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
RoundingMode, Sae),
Mask, Src0, Subtarget, DAG);
}
- case INTR_TYPE_2OP_MASK: {
+ case INTR_TYPE_2OP_MASK:
+ case INTR_TYPE_2OP_IMM8_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
+
+ if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
+ Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
+
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
@@ -15440,8 +16528,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
Mask, PassThru, Subtarget, DAG);
}
}
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
- Src1,Src2),
+ // TODO: Intrinsics should have fast-math-flags to propagate.
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_2OP_MASK_RM: {
@@ -15449,7 +16537,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
- // We specify 2 possible modes for intrinsics, with/without rounding modes.
+ // We specify 2 possible modes for intrinsics, with/without rounding
+ // modes.
// First, we check if the intrinsic have rounding mode (6 operands),
// if not, we set rounding mode to "current".
SDValue Rnd;
@@ -15461,12 +16550,56 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
Src1, Src2, Rnd),
Mask, PassThru, Subtarget, DAG);
}
- case INTR_TYPE_3OP_MASK: {
+ case INTR_TYPE_3OP_SCALAR_MASK_RM: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue PassThru = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
+ SDValue Sae = Op.getOperand(6);
+
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
+ Src2, Src3, Sae),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_3OP_MASK_RM: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Imm = Op.getOperand(3);
+ SDValue PassThru = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+    // We specify 2 possible modes for intrinsics: with and without rounding
+    // modes.
+    // First, we check if the intrinsic has a rounding mode (7 operands);
+    // if not, we set the rounding mode to "current".
+ SDValue Rnd;
+ if (Op.getNumOperands() == 7)
+ Rnd = Op.getOperand(6);
+ else
+ Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Src1, Src2, Imm, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_3OP_IMM8_MASK:
+ case INTR_TYPE_3OP_MASK:
+ case INSERT_SUBVEC: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue PassThru = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+
+ if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
+ Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
+ else if (IntrData->Type == INSERT_SUBVEC) {
+ // imm should be adapted to ISD::INSERT_SUBVECTOR behavior
+ assert(isa<ConstantSDNode>(Src3) && "Expected a ConstantSDNode here!");
+ unsigned Imm = cast<ConstantSDNode>(Src3)->getZExtValue();
+ Imm *= Src2.getSimpleValueType().getVectorNumElements();
+ Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32);
+ }
+
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
@@ -15486,7 +16619,27 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
Mask, PassThru, Subtarget, DAG);
}
case VPERM_3OP_MASKZ:
- case VPERM_3OP_MASK:
+  case VPERM_3OP_MASK: {
+ // Src2 is the PassThru
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ MVT VT = Op.getSimpleValueType();
+ SDValue PassThru = SDValue();
+
+ // set PassThru element
+ if (IntrData->Type == VPERM_3OP_MASKZ)
+ PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+ else
+ PassThru = DAG.getBitcast(VT, Src2);
+
+ // Swap Src1 and Src2 in the node creation
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
+ dl, Op.getValueType(),
+ Src2, Src1, Src3),
+ Mask, PassThru, Subtarget, DAG);
+ }
case FMA_OP_MASK3:
case FMA_OP_MASKZ:
case FMA_OP_MASK: {
@@ -15494,11 +16647,11 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
SDValue PassThru = SDValue();
// set PassThru element
- if (IntrData->Type == VPERM_3OP_MASKZ || IntrData->Type == FMA_OP_MASKZ)
+ if (IntrData->Type == FMA_OP_MASKZ)
PassThru = getZeroVector(VT, Subtarget, DAG, dl);
else if (IntrData->Type == FMA_OP_MASK3)
PassThru = Src3;
@@ -15523,6 +16676,50 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
+ case TERLOG_OP_MASK:
+ case TERLOG_OP_MASKZ: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
+ SDValue Mask = Op.getOperand(5);
+ MVT VT = Op.getSimpleValueType();
+ SDValue PassThru = Src1;
+ // Set PassThru element.
+ if (IntrData->Type == TERLOG_OP_MASKZ)
+ PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Src1, Src2, Src3, Src4),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case FPCLASS: {
+ // FPclass intrinsics with mask
+ SDValue Src1 = Op.getOperand(1);
+ MVT VT = Src1.getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ SDValue Imm = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ MVT BitcastVT = MVT::getVectorVT(MVT::i1,
+ Mask.getSimpleValueType().getSizeInBits());
+ SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
+ SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
+ DAG.getTargetConstant(0, dl, MaskVT),
+ Subtarget, DAG);
+ SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
+ DAG.getUNDEF(BitcastVT), FPclassMask,
+ DAG.getIntPtrConstant(0, dl));
+ return DAG.getBitcast(Op.getValueType(), Res);
+ }
+ case FPCLASSS: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Imm = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm);
+ SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
+ DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
+ return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i8, FPclassMask);
+ }
case CMP_MASK:
case CMP_MASK_CC: {
// Comparison intrinsics with masks.
@@ -15534,12 +16731,11 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
// (v2i1 (and (PCMPEQM %a, %b),
// (extract_subvector
// (v8i1 (bitcast %mask)), 0))), 0))))
- EVT VT = Op.getOperand(1).getValueType();
- EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
- VT.getVectorNumElements());
+ MVT VT = Op.getOperand(1).getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
- EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
- Mask.getValueType().getSizeInBits());
+ MVT BitcastVT = MVT::getVectorVT(MVT::i1,
+ Mask.getSimpleValueType().getSizeInBits());
SDValue Cmp;
if (IntrData->Type == CMP_MASK_CC) {
SDValue CC = Op.getOperand(3);
@@ -15573,6 +16769,32 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(Op.getValueType(), Res);
}
+ case CMP_MASK_SCALAR_CC: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
+ SDValue Mask = Op.getOperand(4);
+
+ SDValue Cmp;
+ if (IntrData->Opc1 != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
+ X86::STATIC_ROUNDING::CUR_DIRECTION)
+ Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
+ }
+    // Default rounding mode.
+    if (!Cmp.getNode())
+ Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC);
+
+ SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
+ DAG.getTargetConstant(0, dl,
+ MVT::i1),
+ Subtarget, DAG);
+
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i8,
+ DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, CmpMask),
+ DAG.getValueType(MVT::i1));
+ }
case COMI: { // Comparison intrinsics
ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
SDValue LHS = Op.getOperand(1);
@@ -15584,6 +16806,24 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
DAG.getConstant(X86CC, dl, MVT::i8), Cond);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
+ case COMI_RM: { // Comparison intrinsics with Sae
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ SDValue CC = Op.getOperand(3);
+ SDValue Sae = Op.getOperand(4);
+ auto ComiType = TranslateX86ConstCondToX86CC(CC);
+ // choose between ordered and unordered (comi/ucomi)
+ unsigned comiOp = std::get<0>(ComiType) ? IntrData->Opc0 : IntrData->Opc1;
+ SDValue Cond;
+ if (cast<ConstantSDNode>(Sae)->getZExtValue() !=
+ X86::STATIC_ROUNDING::CUR_DIRECTION)
+ Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS, Sae);
+ else
+ Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS);
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(std::get<1>(ComiType), dl, MVT::i8), Cond);
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
case VSHIFT:
return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
Op.getOperand(1), Op.getOperand(2), DAG);
@@ -15598,27 +16838,75 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
SDValue Mask = Op.getOperand(3);
SDValue DataToCompress = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
- if (isAllOnes(Mask)) // return data as is
+ if (isAllOnesConstant(Mask)) // return data as is
return Op.getOperand(1);
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
DataToCompress),
Mask, PassThru, Subtarget, DAG);
}
+ case BROADCASTM: {
+ SDValue Mask = Op.getOperand(1);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1,
+ Mask.getSimpleValueType().getSizeInBits());
+ Mask = DAG.getBitcast(MaskVT, Mask);
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
+ }
case BLEND: {
SDValue Mask = Op.getOperand(3);
- EVT VT = Op.getValueType();
- EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
- VT.getVectorNumElements());
- EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
- Mask.getValueType().getSizeInBits());
- SDLoc dl(Op);
- SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
- DAG.getBitcast(BitcastVT, Mask),
- DAG.getIntPtrConstant(0, dl));
+ MVT VT = Op.getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
Op.getOperand(2));
}
+ case KUNPCK: {
+ MVT VT = Op.getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
+
+ SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
+ SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
+ // Arguments should be swapped.
+ SDValue Res = DAG.getNode(IntrData->Opc0, dl,
+ MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
+ Src2, Src1);
+ return DAG.getBitcast(VT, Res);
+ }
+ case CONVERT_TO_MASK: {
+ MVT SrcVT = Op.getOperand(1).getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
+ MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
+
+ SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
+ Op.getOperand(1));
+ SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
+ DAG.getUNDEF(BitcastVT), CvtMask,
+ DAG.getIntPtrConstant(0, dl));
+ return DAG.getBitcast(Op.getValueType(), Res);
+ }
+ case CONVERT_MASK_TO_VEC: {
+ SDValue Mask = Op.getOperand(1);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+ return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
+ }
+ case BRCST_SUBVEC_TO_VEC: {
+ SDValue Src = Op.getOperand(1);
+ SDValue Passthru = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ EVT resVT = Passthru.getValueType();
+ SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
+ DAG.getUNDEF(resVT), Src,
+ DAG.getIntPtrConstant(0, dl));
+ SDValue immVal;
+ if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
+ immVal = DAG.getConstant(0x44, dl, MVT::i8);
+ else
+ immVal = DAG.getConstant(0, dl, MVT::i8);
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ subVec, subVec, immVal),
+ Mask, Passthru, Subtarget, DAG);
+ }
default:
break;
}
@@ -15832,23 +17120,17 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget * Subtarget) {
SDLoc dl(Op);
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
- if (!C)
- llvm_unreachable("Invalid scale type");
- unsigned ScaleVal = C->getZExtValue();
- if (ScaleVal > 2 && ScaleVal != 4 && ScaleVal != 8)
- llvm_unreachable("Valid scale values are 1, 2, 4, 8");
-
+ auto *C = cast<ConstantSDNode>(ScaleOp);
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
- EVT MaskVT = MVT::getVectorVT(MVT::i1,
+ MVT MaskVT = MVT::getVectorVT(MVT::i1,
Index.getSimpleValueType().getVectorNumElements());
SDValue MaskInReg;
ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
if (MaskC)
MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
else {
- EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
- Mask.getValueType().getSizeInBits());
+ MVT BitcastVT = MVT::getVectorVT(MVT::i1,
+ Mask.getSimpleValueType().getSizeInBits());
// In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
// are extracted by EXTRACT_SUBVECTOR.
@@ -15860,7 +17142,7 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
if (Src.getOpcode() == ISD::UNDEF)
- Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
+ Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
@@ -15871,25 +17153,19 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain) {
SDLoc dl(Op);
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
- if (!C)
- llvm_unreachable("Invalid scale type");
- unsigned ScaleVal = C->getZExtValue();
- if (ScaleVal > 2 && ScaleVal != 4 && ScaleVal != 8)
- llvm_unreachable("Valid scale values are 1, 2, 4, 8");
-
+ auto *C = cast<ConstantSDNode>(ScaleOp);
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
- EVT MaskVT = MVT::getVectorVT(MVT::i1,
+ MVT MaskVT = MVT::getVectorVT(MVT::i1,
Index.getSimpleValueType().getVectorNumElements());
SDValue MaskInReg;
ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
if (MaskC)
MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
else {
- EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
- Mask.getValueType().getSizeInBits());
+ MVT BitcastVT = MVT::getVectorVT(MVT::i1,
+ Mask.getSimpleValueType().getSizeInBits());
// In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
// are extracted by EXTRACT_SUBVECTOR.
@@ -15907,12 +17183,11 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Mask, SDValue Base, SDValue Index,
SDValue ScaleOp, SDValue Chain) {
SDLoc dl(Op);
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
- assert(C && "Invalid scale type");
+ auto *C = cast<ConstantSDNode>(ScaleOp);
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
- EVT MaskVT =
+ MVT MaskVT =
MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
SDValue MaskInReg;
ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
@@ -16034,64 +17309,59 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getMergeValues(Results, DL);
}
-static SDValue LowerSEHRESTOREFRAME(SDValue Op, const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
+static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
- const Function *Fn = MF.getFunction();
- SDLoc dl(Op);
SDValue Chain = Op.getOperand(0);
+ SDValue RegNode = Op.getOperand(2);
+ WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
+ if (!EHInfo)
+ report_fatal_error("EH registrations only live in functions using WinEH");
+
+ // Cast the operand to an alloca, and remember the frame index.
+ auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
+ if (!FINode)
+ report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
+ EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
+
+ // Return the chain operand without making any DAG nodes.
+ return Chain;
+}
- assert(Subtarget->getFrameLowering()->hasFP(MF) &&
- "using llvm.x86.seh.restoreframe requires a frame pointer");
-
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- MVT VT = TLI.getPointerTy(DAG.getDataLayout());
-
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
- unsigned FrameReg =
- RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
- unsigned SPReg = RegInfo->getStackRegister();
- unsigned SlotSize = RegInfo->getSlotSize();
+/// \brief Lower intrinsics for the TRUNCATE_TO_MEM case.
+/// Returns a truncating Store or MaskedStore node.
+static SDValue LowerINTRINSIC_TRUNCATE_TO_MEM(const SDValue & Op,
+ SelectionDAG &DAG,
+ MVT ElementType) {
+ SDLoc dl(Op);
+ SDValue Mask = Op.getOperand(4);
+ SDValue DataToTruncate = Op.getOperand(3);
+ SDValue Addr = Op.getOperand(2);
+ SDValue Chain = Op.getOperand(0);
- // Get incoming EBP.
- SDValue IncomingEBP =
- DAG.getCopyFromReg(Chain, dl, FrameReg, VT);
+ MVT VT = DataToTruncate.getSimpleValueType();
+ MVT SVT = MVT::getVectorVT(ElementType, VT.getVectorNumElements());
- // SP is saved in the first field of every registration node, so load
- // [EBP-RegNodeSize] into SP.
- int RegNodeSize = getSEHRegistrationNodeSize(Fn);
- SDValue SPAddr = DAG.getNode(ISD::ADD, dl, VT, IncomingEBP,
- DAG.getConstant(-RegNodeSize, dl, VT));
- SDValue NewSP =
- DAG.getLoad(VT, dl, Chain, SPAddr, MachinePointerInfo(), false, false,
- false, VT.getScalarSizeInBits() / 8);
- Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP);
-
- if (!RegInfo->needsStackRealignment(MF)) {
- // Adjust EBP to point back to the original frame position.
- SDValue NewFP = recoverFramePointer(DAG, Fn, IncomingEBP);
- Chain = DAG.getCopyToReg(Chain, dl, FrameReg, NewFP);
- } else {
- assert(RegInfo->hasBasePointer(MF) &&
- "functions with Win32 EH must use frame or base pointer register");
+ if (isAllOnesConstant(Mask)) // return just a truncate store
+ return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr,
+ MachinePointerInfo(), SVT, false, false,
+ SVT.getScalarSizeInBits()/8);
- // Reload the base pointer (ESI) with the adjusted incoming EBP.
- SDValue NewBP = recoverFramePointer(DAG, Fn, IncomingEBP);
- Chain = DAG.getCopyToReg(Chain, dl, RegInfo->getBaseRegister(), NewBP);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ MVT BitcastVT = MVT::getVectorVT(MVT::i1,
+ Mask.getSimpleValueType().getSizeInBits());
+ // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
+ // are extracted by EXTRACT_SUBVECTOR.
+ SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+ DAG.getBitcast(BitcastVT, Mask),
+ DAG.getIntPtrConstant(0, dl));
- // Reload the spilled EBP value, now that the stack and base pointers are
- // set up.
- X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
- X86FI->setHasSEHFramePtrSave(true);
- int FI = MF.getFrameInfo()->CreateSpillStackObject(SlotSize, SlotSize);
- X86FI->setSEHFramePtrSaveIndex(FI);
- SDValue NewFP = DAG.getLoad(VT, dl, Chain, DAG.getFrameIndex(FI, VT),
- MachinePointerInfo(), false, false, false,
- VT.getScalarSizeInBits() / 8);
- Chain = DAG.getCopyToReg(NewFP, dl, FrameReg, NewFP);
- }
+ MachineMemOperand *MMO = DAG.getMachineFunction().
+ getMachineMemOperand(MachinePointerInfo(),
+ MachineMemOperand::MOStore, SVT.getStoreSize(),
+ SVT.getScalarSizeInBits()/8);
- return Chain;
+ return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr,
+ VMask, SVT, MMO, true);
}
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
@@ -16100,16 +17370,14 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
if (!IntrData) {
- if (IntNo == llvm::Intrinsic::x86_seh_restoreframe)
- return LowerSEHRESTOREFRAME(Op, Subtarget, DAG);
+ if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
+ return MarkEHRegistrationNode(Op, DAG);
return SDValue();
}
SDLoc dl(Op);
switch(IntrData->Type) {
- default:
- llvm_unreachable("Unknown Intrinsic Type");
- break;
+ default: llvm_unreachable("Unknown Intrinsic Type");
case RDSEED:
case RDRAND: {
// Emit the node with the right value type.
@@ -16214,8 +17482,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SDValue Addr = Op.getOperand(2);
SDValue Chain = Op.getOperand(0);
- EVT VT = DataToCompress.getValueType();
- if (isAllOnes(Mask)) // return just a store
+ MVT VT = DataToCompress.getSimpleValueType();
+ if (isAllOnesConstant(Mask)) // return just a store
return DAG.getStore(Chain, dl, DataToCompress, Addr,
MachinePointerInfo(), false, false,
VT.getScalarSizeInBits()/8);
@@ -16227,15 +17495,21 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
MachinePointerInfo(), false, false,
VT.getScalarSizeInBits()/8);
}
+ case TRUNCATE_TO_MEM_VI8:
+ return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i8);
+ case TRUNCATE_TO_MEM_VI16:
+ return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i16);
+ case TRUNCATE_TO_MEM_VI32:
+ return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i32);
case EXPAND_FROM_MEM: {
SDLoc dl(Op);
SDValue Mask = Op.getOperand(4);
SDValue PassThru = Op.getOperand(3);
SDValue Addr = Op.getOperand(2);
SDValue Chain = Op.getOperand(0);
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
- if (isAllOnes(Mask)) // return just a load
+ if (isAllOnesConstant(Mask)) // return just a load
return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
false, VT.getScalarSizeInBits()/8);
@@ -16359,6 +17633,21 @@ SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
}
+unsigned X86TargetLowering::getExceptionPointerRegister(
+ const Constant *PersonalityFn) const {
+ if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
+ return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX;
+
+ return Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX;
+}
+
+unsigned X86TargetLowering::getExceptionSelectorRegister(
+ const Constant *PersonalityFn) const {
+ // Funclet personalities don't use selectors (the runtime does the selection).
+ assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
+ return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX;
+}
+
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
SDValue Offset = Op.getOperand(1);
@@ -16497,9 +17786,11 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
for (FunctionType::param_iterator I = FTy->param_begin(),
E = FTy->param_end(); I != E; ++I, ++Idx)
- if (Attrs.hasAttribute(Idx, Attribute::InReg))
+ if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
+ auto &DL = DAG.getDataLayout();
// FIXME: should only count parameters that are lowered to integers.
- InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
+ InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
+ }
if (InRegCount > 2) {
report_fatal_error("Nest register in use - reduce number of inreg"
@@ -16588,8 +17879,8 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
- MachineMemOperand::MOStore, 2, 2);
+ MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
+ MachineMemOperand::MOStore, 2, 2);
SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
@@ -16623,12 +17914,75 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
}
-static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
+/// \brief Lower a vector CTLZ using native vector CTLZ instructions.
+//
+// 1. i32/i64 128/256-bit vectors (native support requires VLX) are extended
+//    to a 512-bit vector.
+// 2. i8/i16 vectors are implemented using the dword LZCNT vector instruction
+//    ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
+//    split the vector, perform the operation on its Lo and Hi parts, and
+//    concatenate the results.
+static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
+ SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
- EVT OpVT = VT;
+ MVT EltVT = VT.getVectorElementType();
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if (EltVT == MVT::i64 || EltVT == MVT::i32) {
+ // Extend to 512 bit vector.
+ assert((VT.is256BitVector() || VT.is128BitVector()) &&
+ "Unsupported value type for operation");
+
+ MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits());
+ SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
+ DAG.getUNDEF(NewVT),
+ Op.getOperand(0),
+ DAG.getIntPtrConstant(0, dl));
+ SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512);
+
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
+ "Unsupported element type");
+
+ if (16 < NumElems) {
+    // Split the vector; its Lo and Hi parts are handled in the next iteration.
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
+ MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+ Lo = DAG.getNode(Op.getOpcode(), dl, OutVT, Lo);
+ Hi = DAG.getNode(Op.getOpcode(), dl, OutVT, Hi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+ }
+
+ MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
+
+ assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
+ "Unsupported value type for operation");
+
+  // Use the natively supported vector instruction vplzcntd.
+ Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
+ SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
+ SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
+ SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
+
+ return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
+}
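
A scalar sketch of the zext32 -> lzcnt -> truncate -> subtract sequence used above for i8 elements, assuming GCC/Clang's __builtin_clz (the constant 32 - 8 plays the role of the Delta node):

#include <cassert>
#include <cstdint>

static unsigned Ctlz8ViaDword(uint8_t X) {
  if (X == 0)                // __builtin_clz(0) is undefined; vplzcntd is not
    return 8;
  unsigned Clz32 = __builtin_clz(static_cast<uint32_t>(X));
  return Clz32 - (32 - 8);   // sub(trunc(lzcnt(zext32(x))), Delta)
}

int main() {
  assert(Ctlz8ViaDword(0x01) == 7);
  assert(Ctlz8ViaDword(0x80) == 0);
  assert(Ctlz8ViaDword(0x10) == 3);
  return 0;
}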
+
+static SDValue LowerCTLZ(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ MVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
SDLoc dl(Op);
+ if (VT.isVector() && Subtarget->hasAVX512())
+ return LowerVectorCTLZ_AVX512(Op, DAG);
+
Op = Op.getOperand(0);
if (VT == MVT::i8) {
// Zero extend to i32 since there is not an i8 bsr.
@@ -16658,7 +18012,8 @@ static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
return Op;
}
-static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
EVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
@@ -16686,13 +18041,39 @@ static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
- unsigned NumBits = VT.getSizeInBits();
+ unsigned NumBits = VT.getScalarSizeInBits();
SDLoc dl(Op);
- Op = Op.getOperand(0);
+
+ if (VT.isVector()) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ SDValue N0 = Op.getOperand(0);
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+
+ // lsb(x) = (x & -x)
+ SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
+ DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
+
+ // cttz_undef(x) = (width - 1) - ctlz(lsb)
+ if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF &&
+ TLI.isOperationLegal(ISD::CTLZ, VT)) {
+ SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
+ return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
+ DAG.getNode(ISD::CTLZ, dl, VT, LSB));
+ }
+
+ // cttz(x) = ctpop(lsb - 1)
+ SDValue One = DAG.getConstant(1, dl, VT);
+ return DAG.getNode(ISD::CTPOP, dl, VT,
+ DAG.getNode(ISD::SUB, dl, VT, LSB, One));
+ }
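
The two identities used above (cttz_undef via ctlz of the lowest set bit, and cttz via popcount of lsb - 1) are easy to check on scalars; a minimal sketch assuming GCC/Clang builtins:

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  for (uint32_t X : {1u, 2u, 0x40u, 0xF0F0u, 0x80000000u}) {
    uint32_t LSB = X & (0u - X);                     // lsb(x) = x & -x
    unsigned ViaCtlz = 31 - __builtin_clz(LSB);      // (width-1) - ctlz(lsb)
    unsigned ViaCtpop = __builtin_popcount(LSB - 1); // ctpop(lsb - 1)
    assert(ViaCtlz == static_cast<unsigned>(__builtin_ctz(X)));
    assert(ViaCtpop == static_cast<unsigned>(__builtin_ctz(X)));
  }
  return 0;
}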
+
+ assert(Op.getOpcode() == ISD::CTTZ &&
+ "Only scalar CTTZ requires custom lowering");
// Issue a bsf (scan bits forward) which also sets EFLAGS.
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
- Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
+ Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
// If src is zero (i.e. bsf sets ZF), returns NumBits.
SDValue Ops[] = {
@@ -16753,6 +18134,13 @@ static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
return Lower256IntArith(Op, DAG);
}
+static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
+ assert(Op.getSimpleValueType().is256BitVector() &&
+ Op.getSimpleValueType().isInteger() &&
+ "Only handle AVX 256-bit vector integer operation");
+ return Lower256IntArith(Op, DAG);
+}
+
static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
@@ -16885,7 +18273,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
SDValue AhiBlo = Ahi;
SDValue AloBhi = Bhi;
// Bit cast to 32-bit vectors for MULUDQ
- EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
+ MVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
(VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
A = DAG.getBitcast(MulVT, A);
B = DAG.getBitcast(MulVT, B);
@@ -16962,7 +18350,7 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
- EVT VT = Op0.getValueType();
+ MVT VT = Op0.getSimpleValueType();
SDLoc dl(Op);
assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
@@ -17034,7 +18422,7 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getMergeValues(Ops, dl);
}
-// Return true if the requred (according to Opcode) shift-imm form is natively
+// Return true if the required (according to Opcode) shift-imm form is natively
// supported by the Subtarget
static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget,
unsigned Opcode) {
@@ -17054,14 +18442,14 @@ static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget,
}
// The shift amount is a variable, but it is the same for all vector lanes.
-// These instrcutions are defined together with shift-immediate.
+// These instructions are defined together with shift-immediate.
static
bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget *Subtarget,
unsigned Opcode) {
return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
}
-// Return true if the requred (according to Opcode) variable-shift form is
+// Return true if the required (according to Opcode) variable-shift form is
// natively supported by the Subtarget
static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget,
unsigned Opcode) {
@@ -17133,27 +18521,37 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
// i64 SRA needs to be performed as partial shifts.
if ((VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
- Op.getOpcode() == ISD::SRA)
+ Op.getOpcode() == ISD::SRA && !Subtarget->hasXOP())
return ArithmeticShiftRight64(ShiftAmt);
- if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) {
+ if (VT == MVT::v16i8 ||
+ (Subtarget->hasInt256() && VT == MVT::v32i8) ||
+ VT == MVT::v64i8) {
unsigned NumElts = VT.getVectorNumElements();
MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
- if (Op.getOpcode() == ISD::SHL) {
- // Simple i8 add case
- if (ShiftAmt == 1)
- return DAG.getNode(ISD::ADD, dl, VT, R, R);
+ // Simple i8 add case
+ if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
+ return DAG.getNode(ISD::ADD, dl, VT, R, R);
+
+ // ashr(R, 7) === cmp_slt(R, 0)
+ if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
+ SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
+ return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
+ }
+
+ // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
+ if (VT == MVT::v16i8 && Subtarget->hasXOP())
+ return SDValue();
+ if (Op.getOpcode() == ISD::SHL) {
// Make a large shift.
SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
R, ShiftAmt, DAG);
SHL = DAG.getBitcast(VT, SHL);
// Zero out the rightmost bits.
- SmallVector<SDValue, 32> V(
- NumElts, DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SHL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
+ DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
}
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
@@ -17161,24 +18559,14 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
R, ShiftAmt, DAG);
SRL = DAG.getBitcast(VT, SRL);
// Zero out the leftmost bits.
- SmallVector<SDValue, 32> V(
- NumElts, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SRL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
+ DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
}
if (Op.getOpcode() == ISD::SRA) {
- if (ShiftAmt == 7) {
- // R s>> 7 === R s< 0
- SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
- return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
- }
-
- // R s>> a === ((R u>> a) ^ m) - m
+ // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
- SmallVector<SDValue, 32> V(NumElts,
- DAG.getConstant(128 >> ShiftAmt, dl,
- MVT::i8));
- SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
+
+ SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
return Res;
@@ -17189,35 +18577,51 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
}
// Special case in 32-bit mode, where i64 is expanded into high and low parts.
- if (!Subtarget->is64Bit() &&
- (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
- Amt.getOpcode() == ISD::BITCAST &&
- Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
+ if (!Subtarget->is64Bit() && !Subtarget->hasXOP() &&
+ (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64))) {
+
+ // Peek through any splat that was introduced for i64 shift vectorization.
+ int SplatIndex = -1;
+ if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
+ if (SVN->isSplat()) {
+ SplatIndex = SVN->getSplatIndex();
+ Amt = Amt.getOperand(0);
+ assert(SplatIndex < (int)VT.getVectorNumElements() &&
+ "Splat shuffle referencing second operand");
+ }
+
+ if (Amt.getOpcode() != ISD::BITCAST ||
+ Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
+ return SDValue();
+
Amt = Amt.getOperand(0);
unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
VT.getVectorNumElements();
unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
uint64_t ShiftAmt = 0;
+ unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
for (unsigned i = 0; i != Ratio; ++i) {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
if (!C)
return SDValue();
// 6 == Log2(64)
ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
}
- // Check remaining shift amounts.
- for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
- uint64_t ShAmt = 0;
- for (unsigned j = 0; j != Ratio; ++j) {
- ConstantSDNode *C =
- dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
- if (!C)
+
+ // Check remaining shift amounts (if not a splat).
+ if (SplatIndex < 0) {
+ for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
+ uint64_t ShAmt = 0;
+ for (unsigned j = 0; j != Ratio; ++j) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
+ if (!C)
+ return SDValue();
+ // 6 == Log2(64)
+ ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
+ }
+ if (ShAmt != ShiftAmt)
return SDValue();
- // 6 == Log2(64)
- ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
}
- if (ShAmt != ShiftAmt)
- return SDValue();
}
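
A scalar sketch of how the loop above reassembles a 64-bit shift amount from the 32-bit BUILD_VECTOR elements when VT is v2i64 (Ratio = 2, RatioInLog2 = 1, so each element lands 32 bits apart); the element values are illustrative:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Elt[2] = {7, 0};  // low/high i32 halves of one splatted i64 lane
  uint64_t ShiftAmt = 0;
  for (unsigned i = 0; i != 2; ++i)
    ShiftAmt |= static_cast<uint64_t>(Elt[i]) << (i * (1u << (6 - 1)));
  assert(ShiftAmt == 7);     // a uniform shift by 7 for the i64 element
  return 0;
}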
if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
@@ -17245,7 +18649,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
SDValue BaseShAmt;
- EVT EltVT = VT.getVectorElementType();
+ MVT EltVT = VT.getVectorElementType();
if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
// Check if this build_vector node is doing a splat.
@@ -17262,7 +18666,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
SDValue InVec = Amt.getOperand(0);
if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
- assert((SplatIdx < InVec.getValueType().getVectorNumElements()) &&
+ assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
"Unexpected shuffle index found!");
BaseShAmt = InVec.getOperand(SplatIdx);
} else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
@@ -17327,11 +18731,26 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
return V;
if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
- return V;
+ return V;
if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
return Op;
+ // XOP has 128-bit variable logical/arithmetic shifts.
+ // +ve/-ve Amt = shift left/right.
+ if (Subtarget->hasXOP() &&
+ (VT == MVT::v2i64 || VT == MVT::v4i32 ||
+ VT == MVT::v8i16 || VT == MVT::v16i8)) {
+ if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
+ SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
+ Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
+ }
+ if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
+ return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
+ if (Op.getOpcode() == ISD::SRA)
+ return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
+ }
+
// 2i64 vector logical shifts can efficiently avoid scalarization - do the
// shifts per-lane and then shuffle the partial results back together.
if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
@@ -17343,6 +18762,19 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
}
+ // i64 vector arithmetic shift can be emulated with the transform:
+ // M = lshr(SIGN_BIT, Amt)
+ // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
+ if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget->hasInt256())) &&
+ Op.getOpcode() == ISD::SRA) {
+ SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT);
+ SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
+ R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
+ R = DAG.getNode(ISD::XOR, dl, VT, R, M);
+ R = DAG.getNode(ISD::SUB, dl, VT, R, M);
+ return R;
+ }
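
The sign-mask transform in the comment above can be checked on plain 64-bit scalars; a minimal sketch with arbitrary test values:

#include <cassert>
#include <cstdint>

// ashr(R, Amt) == sub(xor(lshr(R, Amt), M), M), with M = lshr(SIGN_BIT, Amt).
static int64_t AshrViaLshr(int64_t R, unsigned Amt) {
  uint64_t M = (uint64_t{1} << 63) >> Amt;
  uint64_t L = static_cast<uint64_t>(R) >> Amt;
  return static_cast<int64_t>((L ^ M) - M);
}

int main() {
  assert(AshrViaLshr(-16, 2) == -4);
  assert(AshrViaLshr(-1, 17) == -1);
  assert(AshrViaLshr(1024, 3) == 128);
  return 0;
}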
+
// If possible, lower this packed shift into a vector multiply instead of
// expanding it into a sequence of scalar shifts.
// Do this only if the vector shift count is a constant build_vector.
@@ -17351,9 +18783,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
(Subtarget->hasInt256() && VT == MVT::v16i16)) &&
ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
SmallVector<SDValue, 8> Elts;
- EVT SVT = VT.getScalarType();
+ MVT SVT = VT.getVectorElementType();
unsigned SVTBits = SVT.getSizeInBits();
- const APInt &One = APInt(SVTBits, 1);
+ APInt One(SVTBits, 1);
unsigned NumElems = VT.getVectorNumElements();
for (unsigned i=0; i !=NumElems; ++i) {
@@ -17364,7 +18796,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
}
ConstantSDNode *ND = cast<ConstantSDNode>(Op);
- const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue());
+ APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
uint64_t ShAmt = C.getZExtValue();
if (ShAmt >= SVTBits) {
Elts.push_back(DAG.getUNDEF(SVT));
@@ -17443,7 +18875,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
isa<ConstantSDNode>(Amt2)) {
// Replace this node with two shifts followed by a MOVSS/MOVSD.
- EVT CastVT = MVT::v4i32;
+ MVT CastVT = MVT::v4i32;
SDValue Splat1 =
DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
@@ -17507,7 +18939,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
}
- if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget->hasInt256())) {
+ if (VT == MVT::v16i8 ||
+ (VT == MVT::v32i8 && Subtarget->hasInt256() && !Subtarget->hasXOP())) {
MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
unsigned ShiftOpcode = Op->getOpcode();
@@ -17627,7 +19060,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
}
- if (Subtarget->hasInt256() && VT == MVT::v16i16) {
+ if (Subtarget->hasInt256() && !Subtarget->hasXOP() && VT == MVT::v16i16) {
MVT ExtVT = MVT::v8i32;
SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
@@ -17710,7 +19143,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
if (VT.is256BitVector()) {
unsigned NumElems = VT.getVectorNumElements();
MVT EltVT = VT.getVectorElementType();
- EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+ MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
// Extract the two vectors
SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
@@ -17743,6 +19176,40 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
return SDValue();
}
+static SDValue LowerRotate(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ SDLoc DL(Op);
+ SDValue R = Op.getOperand(0);
+ SDValue Amt = Op.getOperand(1);
+
+ assert(VT.isVector() && "Custom lowering only for vector rotates!");
+ assert(Subtarget->hasXOP() && "XOP support required for vector rotates!");
+ assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
+
+ // XOP has 128-bit vector variable + immediate rotates.
+ // +ve/-ve Amt = rotate left/right.
+
+ // Split 256-bit integers.
+ if (VT.is256BitVector())
+ return Lower256IntArith(Op, DAG);
+
+ assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
+
+ // Attempt to rotate by immediate.
+ if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
+ if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
+ uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
+ assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
+ return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
+ DAG.getConstant(RotateAmt, DL, MVT::i8));
+ }
+ }
+
+ // Use general rotate by variable (per-element).
+ return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
+}
+
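Per element, VPROT/VPROTI perform an ordinary rotate; a scalar reference for one 32-bit lane (illustrative only, lane width chosen arbitrarily):

#include <cassert>
#include <cstdint>

// Rotate-left of one 32-bit lane; masking keeps the expression defined for
// Amt == 0 as well.
static uint32_t rotl32(uint32_t X, unsigned Amt) {
  Amt &= 31;
  return (X << Amt) | (X >> ((32 - Amt) & 31));
}

int main() {
  assert(rotl32(0x80000001u, 1) == 0x00000003u); // top bit wraps around
  assert(rotl32(0x12345678u, 0) == 0x12345678u);
  return 0;
}
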
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
// Lower the "add/sub/mul with overflow" instruction into a regular ins plus
// a "setcc" instruction that checks the overflow flag. The "brcond" lowering
@@ -17759,8 +19226,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
case ISD::SADDO:
// A subtract of one will be selected as a INC. Note that INC doesn't
// set CF, so we can't do this for UADDO.
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
- if (C->isOne()) {
+ if (isOneConstant(RHS)) {
BaseOp = X86ISD::INC;
Cond = X86::COND_O;
break;
@@ -17775,8 +19241,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
case ISD::SSUBO:
// A subtract of one will be selected as a DEC. Note that DEC doesn't
// set CF, so we can't do this for USUBO.
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
- if (C->isOne()) {
+ if (isOneConstant(RHS)) {
BaseOp = X86ISD::DEC;
Cond = X86::COND_O;
break;
@@ -17827,7 +19292,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
-bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const {
+bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
unsigned OpWidth = MemType->getPrimitiveSizeInBits();
if (OpWidth == 64)
@@ -17844,21 +19309,23 @@ bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
// Note: this turns large loads into lock cmpxchg8b/16b.
// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
-bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+TargetLowering::AtomicExpansionKind
+X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
- return needsCmpXchgNb(PTy->getElementType());
+ return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
+ : AtomicExpansionKind::None;
}
-TargetLoweringBase::AtomicRMWExpansionKind
+TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
- const Type *MemType = AI->getType();
+ Type *MemType = AI->getType();
// If the operand is too big, we must see if cmpxchg8/16b is available
// and default to library calls otherwise.
if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
- return needsCmpXchgNb(MemType) ? AtomicRMWExpansionKind::CmpXChg
- : AtomicRMWExpansionKind::None;
+ return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
+ : AtomicExpansionKind::None;
}
AtomicRMWInst::BinOp Op = AI->getOperation();
@@ -17869,14 +19336,14 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
case AtomicRMWInst::Add:
case AtomicRMWInst::Sub:
// It's better to use xadd, xsub or xchg for these in all cases.
- return AtomicRMWExpansionKind::None;
+ return AtomicExpansionKind::None;
case AtomicRMWInst::Or:
case AtomicRMWInst::And:
case AtomicRMWInst::Xor:
// If the atomicrmw's result isn't actually used, we can just add a "lock"
// prefix to a normal instruction for these operations.
- return !AI->use_empty() ? AtomicRMWExpansionKind::CmpXChg
- : AtomicRMWExpansionKind::None;
+ return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
+ : AtomicExpansionKind::None;
case AtomicRMWInst::Nand:
case AtomicRMWInst::Max:
case AtomicRMWInst::Min:
@@ -17884,7 +19351,7 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
case AtomicRMWInst::UMin:
// These always require a non-trivial set of data operations on x86. We must
// use a cmpxchg loop.
- return AtomicRMWExpansionKind::CmpXChg;
+ return AtomicExpansionKind::CmpXChg;
}
}
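The distinction drawn above for or/and/xor is whether the atomicrmw result is observed: if it is not, a single lock-prefixed memory instruction suffices, otherwise a cmpxchg loop is needed (add/sub/xchg can instead use xadd/xchg). A hypothetical source-level illustration:

#include <atomic>

std::atomic<unsigned> Flags;

void setBit() {
  // Result unused: on x86 this can become a single "lock or" to memory.
  Flags.fetch_or(1u, std::memory_order_seq_cst);
}

unsigned setBitAndObserve() {
  // Result used: the old value is needed, so or/and/xor fall back to a
  // cmpxchg loop as returned by shouldExpandAtomicRMWInIR above.
  return Flags.fetch_or(1u, std::memory_order_seq_cst);
}
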
@@ -17898,7 +19365,7 @@ static bool hasMFENCE(const X86Subtarget& Subtarget) {
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
- const Type *MemType = AI->getType();
+ Type *MemType = AI->getType();
// Accesses larger than the native width are turned into cmpxchg/libcalls, so
// there is no benefit in turning such RMWs into loads, and it is actually
// harmful as it introduces a mfence.
@@ -17926,7 +19393,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
// lowered to just a load without a fence. A mfence flushes the store buffer,
// making the optimization clearly correct.
// FIXME: it is required if isAtLeastRelease(Order) but it is not clear
- // otherwise, we might be able to be more agressive on relaxed idempotent
+ // otherwise, we might be able to be more aggressive on relaxed idempotent
// rmw. In practice, they do not look useful, so we don't try to be
// especially clever.
if (SynchScope == SingleThread)
@@ -18043,7 +19510,7 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
SDValue InVec = Op->getOperand(0);
SDLoc dl(Op);
unsigned NumElts = SrcVT.getVectorNumElements();
- EVT SVT = SrcVT.getVectorElementType();
+ MVT SVT = SrcVT.getVectorElementType();
// Widen the vector in input in the case of MVT::v2i32.
// Example: from MVT::v2i32 to MVT::v4i32.
@@ -18103,7 +19570,8 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
// chunks, thus directly computes the pop count for v2i64 and v4i64.
if (EltVT == MVT::i64) {
SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
- V = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, V, Zeros);
+ MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
+ V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
return DAG.getBitcast(VT, V);
}
@@ -18119,9 +19587,10 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
// Do the horizontal sums into two v2i64s.
Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
- Low = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT,
+ MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
+ Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
DAG.getBitcast(ByteVecVT, Low), Zeros);
- High = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT,
+ High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
DAG.getBitcast(ByteVecVT, High), Zeros);
// Merge them together.
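PSADBW sums each group of eight byte values (absolute differences against a zero vector) into one 64-bit lane, which is why the result type is now built as a vXi64 vector above. A scalar model of one lane (illustrative only):

#include <cassert>
#include <cstdint>

// One 64-bit lane of psadbw(V, 0): the sum of its eight byte values.
static uint64_t horizontalByteSum(uint64_t Lane) {
  uint64_t Sum = 0;
  for (int I = 0; I != 8; ++I)
    Sum += (Lane >> (8 * I)) & 0xFF;
  return Sum;
}

int main() {
  // If each byte already holds a per-byte popcount (0..8), the lane sum is
  // the popcount of the whole 64-bit element, as used by LowerVectorCTPOP.
  assert(horizontalByteSum(0x0101010101010101ULL) == 8);
  assert(horizontalByteSum(0x0000000000000008ULL) == 8);
  return 0;
}
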
@@ -18311,7 +19780,7 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget,
static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- assert(Op.getValueType().isVector() &&
+ assert(Op.getSimpleValueType().isVector() &&
"We only do custom lowering for vector population count.");
return LowerVectorCTPOP(Op, Subtarget, DAG);
}
@@ -18357,7 +19826,7 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
}
static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
- EVT VT = Op.getNode()->getSimpleValueType(0);
+ MVT VT = Op.getNode()->getSimpleValueType(0);
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
@@ -18435,31 +19904,203 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
}
+/// Widen a vector input to a vector of NVT. The
+/// input vector must have the same element type as NVT.
+static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
+ bool FillWithZeroes = false) {
+ // Check if InOp already has the right width.
+ MVT InVT = InOp.getSimpleValueType();
+ if (InVT == NVT)
+ return InOp;
+
+ if (InOp.isUndef())
+ return DAG.getUNDEF(NVT);
+
+ assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
+ "input and widen element type must match");
+
+ unsigned InNumElts = InVT.getVectorNumElements();
+ unsigned WidenNumElts = NVT.getVectorNumElements();
+ assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
+ "Unexpected request for vector widening");
+
+ EVT EltVT = NVT.getVectorElementType();
+
+ SDLoc dl(InOp);
+ if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
+ InOp.getNumOperands() == 2) {
+ SDValue N1 = InOp.getOperand(1);
+ if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
+ N1.isUndef()) {
+ InOp = InOp.getOperand(0);
+ InVT = InOp.getSimpleValueType();
+ InNumElts = InVT.getVectorNumElements();
+ }
+ }
+ if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
+ ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
+ SmallVector<SDValue, 16> Ops;
+ for (unsigned i = 0; i < InNumElts; ++i)
+ Ops.push_back(InOp.getOperand(i));
+
+ SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
+ DAG.getUNDEF(EltVT);
+ for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
+ Ops.push_back(FillVal);
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops);
+ }
+ SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
+ DAG.getUNDEF(NVT);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
+ InOp, DAG.getIntPtrConstant(0, dl));
+}
+
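ExtendToType only appends lanes (zero or undef) above the original elements, leaving them at the low indices. A rough scalar analogue of that padding, with undef modelled by an arbitrary placeholder value (sketch only):

#include <cstdint>
#include <vector>

static std::vector<uint32_t> extendToWidth(std::vector<uint32_t> In,
                                           unsigned WidenNumElts,
                                           bool FillWithZeroes) {
  // Matches the INSERT_SUBVECTOR-at-index-0 shape: original lanes stay at the
  // front, the tail is either zeroes or don't-care lanes.
  while (In.size() < WidenNumElts)
    In.push_back(FillWithZeroes ? 0u : 0xDEADBEEFu);
  return In;
}

int main() {
  auto Widened = extendToWidth({7u, 9u}, 4, /*FillWithZeroes=*/true);
  return (Widened[0] == 7u && Widened[1] == 9u && Widened[2] == 0u) ? 0 : 1;
}
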
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
assert(Subtarget->hasAVX512() &&
"MGATHER/MSCATTER are supported on AVX-512 arch only");
+ // X86 scatter kills the mask register, so its type should be added to
+ // the list of return values.
+ // If the "scatter" has 2 return values, it is already handled.
+ if (Op.getNode()->getNumValues() == 2)
+ return Op;
+
MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
- EVT VT = N->getValue().getValueType();
+ SDValue Src = N->getValue();
+ MVT VT = Src.getSimpleValueType();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
SDLoc dl(Op);
- // X86 scatter kills mask register, so its type should be added to
- // the list of return values
- if (N->getNumValues() == 1) {
- SDValue Index = N->getIndex();
- if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
- !Index.getValueType().is512BitVector())
+ SDValue NewScatter;
+ SDValue Index = N->getIndex();
+ SDValue Mask = N->getMask();
+ SDValue Chain = N->getChain();
+ SDValue BasePtr = N->getBasePtr();
+ MVT MemVT = N->getMemoryVT().getSimpleVT();
+ MVT IndexVT = Index.getSimpleValueType();
+ MVT MaskVT = Mask.getSimpleValueType();
+
+ if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
+ // The v2i32 value was promoted to v2i64.
+ // Now we "redo" the type legalizer's work and widen the original
+ // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
+ // with a shuffle.
+ assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
+ "Unexpected memory type");
+ int ShuffleMask[] = {0, 2, -1, -1};
+ Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
+ DAG.getUNDEF(MVT::v4i32), ShuffleMask);
+ // Now we have 4 elements instead of 2.
+ // Expand the index.
+ MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
+ Index = ExtendToType(Index, NewIndexVT, DAG);
+
+ // Expand the mask with zeroes.
+ // The mask may be <2 x i64> or <2 x i1> at this point.
+ assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
+ "Unexpected mask type");
+ MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
+ Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
+ VT = MVT::v4i32;
+ }
+
+ unsigned NumElts = VT.getVectorNumElements();
+ if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
+ !Index.getSimpleValueType().is512BitVector()) {
+ // AVX512F supports only 512-bit vectors; either the data or the index
+ // must be 512 bits wide. If both the index and the data are currently
+ // 256-bit but the vector has 8 elements, we just sign-extend the index.
+ if (IndexVT == MVT::v8i32)
+ // Just extend index
Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+ else {
+ // The minimal number of elts in scatter is 8
+ NumElts = 8;
+ // Index
+ MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
+ // Use the original index here; do not modify the index twice.
+ Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
+ if (IndexVT.getScalarType() == MVT::i32)
+ Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+
+ // Mask
+ // At this point we have a promoted mask operand.
+ assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
+ MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
+ // Use the original mask here; do not modify the mask twice.
+ Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
+
+ // The value that should be stored
+ MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
+ Src = ExtendToType(Src, NewVT, DAG);
+ }
+ }
+ // If the mask is "wide" at this point - truncate it to i1 vector
+ MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+ Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
+
+ // The mask is killed by the scatter, so add it to the result values.
+ SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
+ SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
+ NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
+ N->getMemOperand());
+ DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
+ return SDValue(NewScatter.getNode(), 0);
+}
+
+static SDValue LowerMLOAD(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
- SDVTList VTs = DAG.getVTList(N->getMask().getValueType(), MVT::Other);
- SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
- N->getOperand(3), Index };
+ MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
+ MVT VT = Op.getSimpleValueType();
+ SDValue Mask = N->getMask();
+ SDLoc dl(Op);
- SDValue NewScatter = DAG.getMaskedScatter(VTs, VT, dl, Ops, N->getMemOperand());
- DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
- return SDValue(NewScatter.getNode(), 0);
+ if (Subtarget->hasAVX512() && !Subtarget->hasVLX() &&
+ !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) {
+ // This operation is legal for targets with VLX, but without
+ // VLX the vector should be widened to 512 bits.
+ unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
+ MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec);
+ MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
+ SDValue Src0 = N->getSrc0();
+ Src0 = ExtendToType(Src0, WideDataVT, DAG);
+ Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+ SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
+ N->getBasePtr(), Mask, Src0,
+ N->getMemoryVT(), N->getMemOperand(),
+ N->getExtensionType());
+
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+ NewLoad.getValue(0),
+ DAG.getIntPtrConstant(0, dl));
+ SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
+ return DAG.getMergeValues(RetOps, dl);
+ }
+ return Op;
+}
+
+static SDValue LowerMSTORE(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
+ SDValue DataToStore = N->getValue();
+ MVT VT = DataToStore.getSimpleValueType();
+ SDValue Mask = N->getMask();
+ SDLoc dl(Op);
+
+ if (Subtarget->hasAVX512() && !Subtarget->hasVLX() &&
+ !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) {
+ // This operation is legal for targets with VLX, but without
+ // VLX the vector should be widened to 512 bits.
+ unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
+ MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec);
+ MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
+ DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
+ Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+ return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
+ Mask, N->getMemoryVT(), N->getMemOperand(),
+ N->isTruncatingStore());
}
return Op;
}
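LowerMLOAD/LowerMSTORE (like the scatter and gather lowerings around them) widen the data to 512 bits and extend the mask with zero lanes; because zero mask lanes are inactive, the widened operation touches exactly the same memory as the original, and for loads the extra lanes are simply discarded by the EXTRACT_SUBVECTOR. A scalar model of masked-store semantics showing why zero padding is harmless (illustrative, not the ISA definition):

#include <cassert>
#include <cstdint>
#include <vector>

// Scalar model of a masked store: lane I is written only if Mask[I] is set.
static void maskedStore(std::vector<uint32_t> &Mem,
                        const std::vector<uint32_t> &Data,
                        const std::vector<bool> &Mask) {
  for (size_t I = 0; I < Data.size(); ++I)
    if (Mask[I])
      Mem[I] = Data[I];
}

int main() {
  std::vector<uint32_t> MemA(16, 0xAA), MemB(16, 0xAA);
  // Original 8-lane store...
  maskedStore(MemA, {1, 2, 3, 4, 5, 6, 7, 8},
              {true, true, true, true, false, false, true, true});
  // ...and the widened 16-lane store with the mask padded by zero lanes.
  maskedStore(MemB, {1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0},
              {true, true, true, true, false, false, true, true,
               false, false, false, false, false, false, false, false});
  assert(MemA == MemB && "zero-padded mask lanes must not touch memory");
  return 0;
}
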
@@ -18470,17 +20111,59 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget,
"MGATHER/MSCATTER are supported on AVX-512 arch only");
MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
- EVT VT = Op.getValueType();
- assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
SDLoc dl(Op);
-
+ MVT VT = Op.getSimpleValueType();
SDValue Index = N->getIndex();
+ SDValue Mask = N->getMask();
+ SDValue Src0 = N->getValue();
+ MVT IndexVT = Index.getSimpleValueType();
+ MVT MaskVT = Mask.getSimpleValueType();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
+
if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
- !Index.getValueType().is512BitVector()) {
- Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
- SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
- N->getOperand(3), Index };
- DAG.UpdateNodeOperands(N, Ops);
+ !Index.getSimpleValueType().is512BitVector()) {
+ // AVX512F supports only 512-bit vectors; either the data or the index
+ // must be 512 bits wide. If both the index and the data are currently
+ // 256-bit but the vector has 8 elements, we just sign-extend the index.
+ if (NumElts == 8) {
+ Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), Index };
+ DAG.UpdateNodeOperands(N, Ops);
+ return Op;
+ }
+
+ // The minimum number of elements in a gather is 8.
+ NumElts = 8;
+ // Index
+ MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
+ Index = ExtendToType(Index, NewIndexVT, DAG);
+ if (IndexVT.getScalarType() == MVT::i32)
+ Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+
+ // Mask
+ MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
+ // At this point we have a promoted mask operand.
+ assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
+ MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
+ Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
+ Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
+
+ // The pass-thru value
+ MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
+ Src0 = ExtendToType(Src0, NewVT, DAG);
+
+ SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+ SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
+ N->getMemoryVT(), dl, Ops,
+ N->getMemOperand());
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+ NewGather.getValue(0),
+ DAG.getIntPtrConstant(0, dl));
+ SDValue RetOps[] = {Extract, NewGather.getValue(1)};
+ return DAG.getMergeValues(RetOps, dl);
}
return Op;
}
@@ -18572,6 +20255,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
case ISD::SETCC: return LowerSETCC(Op, DAG);
+ case ISD::SETCCE: return LowerSETCCE(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
@@ -18592,12 +20276,14 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
- case ISD::CTLZ: return LowerCTLZ(Op, DAG);
- case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG);
- case ISD::CTTZ: return LowerCTTZ(Op, DAG);
+ case ISD::CTLZ: return LowerCTLZ(Op, Subtarget, DAG);
+ case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, Subtarget, DAG);
+ case ISD::CTTZ:
+ case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
case ISD::UMUL_LOHI:
case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
+ case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG);
case ISD::SRA:
case ISD::SRL:
case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
@@ -18615,7 +20301,13 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
case ISD::ADD: return LowerADD(Op, DAG);
case ISD::SUB: return LowerSUB(Op, DAG);
+ case ISD::SMAX:
+ case ISD::SMIN:
+ case ISD::UMAX:
+ case ISD::UMIN: return LowerMINMAX(Op, DAG);
case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
+ case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
+ case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
case ISD::GC_TRANSITION_START:
@@ -18634,14 +20326,43 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
switch (N->getOpcode()) {
default:
llvm_unreachable("Do not know how to custom type legalize this operation!");
+ case X86ISD::AVG: {
+ // Legalize types for X86ISD::AVG by expanding vectors.
+ assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+
+ auto InVT = N->getValueType(0);
+ auto InVTSize = InVT.getSizeInBits();
+ const unsigned RegSize =
+ (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
+ assert((!Subtarget->hasAVX512() || RegSize < 512) &&
+ "512-bit vector requires AVX512");
+ assert((!Subtarget->hasAVX2() || RegSize < 256) &&
+ "256-bit vector requires AVX2");
+
+ auto ElemVT = InVT.getVectorElementType();
+ auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
+ RegSize / ElemVT.getSizeInBits());
+ assert(RegSize % InVT.getSizeInBits() == 0);
+ unsigned NumConcat = RegSize / InVT.getSizeInBits();
+
+ SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
+ Ops[0] = N->getOperand(0);
+ SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
+ Ops[0] = N->getOperand(1);
+ SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
+
+ SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
+ Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
+ DAG.getIntPtrConstant(0, dl)));
+ return;
+ }
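X86ISD::AVG models the PAVGB/PAVGW-style rounding average, (a + b + 1) >> 1 on unsigned elements; the case above only pads both operands to a legal register width with undef lanes and extracts the original low lanes afterwards, so each original lane still receives exactly this value. Scalar reference for one byte lane (illustrative):

#include <cassert>
#include <cstdint>

// Rounding unsigned average of two bytes, as computed per lane by PAVGB.
static uint8_t avgRoundU8(uint8_t A, uint8_t B) {
  return (uint8_t)(((unsigned)A + (unsigned)B + 1) >> 1);
}

int main() {
  assert(avgRoundU8(0, 1) == 1);        // rounds up
  assert(avgRoundU8(254, 255) == 255);
  assert(avgRoundU8(10, 20) == 15);
  return 0;
}
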
// We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
case X86ISD::FMINC:
case X86ISD::FMIN:
case X86ISD::FMAXC:
case X86ISD::FMAX: {
EVT VT = N->getValueType(0);
- if (VT != MVT::v2f32)
- llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX.");
+ assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
SDValue UNDEF = DAG.getUNDEF(VT);
SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
N->getOperand(0), UNDEF);
@@ -18668,17 +20389,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
case ISD::FP_TO_SINT:
- // FP_TO_INT*_IN_MEM is not legal for f16 inputs. Do not convert
- // (FP_TO_SINT (load f16)) to FP_TO_INT*.
- if (N->getOperand(0).getValueType() == MVT::f16)
- break;
- // fallthrough
case ISD::FP_TO_UINT: {
bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
- if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
- return;
-
std::pair<SDValue,SDValue> Vals =
FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
SDValue FIST = Vals.first, StackSlot = Vals.second;
@@ -18707,6 +20420,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
DAG.getBitcast(MVT::v2i64, VBias));
Or = DAG.getBitcast(MVT::v2f64, Or);
+ // TODO: Are there any fast-math-flags to propagate here?
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
return;
@@ -18740,6 +20454,11 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
}
}
+ case ISD::INTRINSIC_WO_CHAIN: {
+ if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
+ Results.push_back(V);
+ return;
+ }
case ISD::READCYCLECOUNTER: {
return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
Results);
@@ -18748,7 +20467,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
EVT T = N->getValueType(0);
assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
bool Regs64bit = T == MVT::i128;
- EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
+ MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
SDValue cpInL, cpInH;
cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
DAG.getConstant(0, dl, HalfT));
@@ -18884,6 +20603,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CMOV: return "X86ISD::CMOV";
case X86ISD::BRCOND: return "X86ISD::BRCOND";
case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
+ case X86ISD::IRET: return "X86ISD::IRET";
case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
@@ -18910,6 +20630,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FHADD: return "X86ISD::FHADD";
case X86ISD::FHSUB: return "X86ISD::FHSUB";
case X86ISD::ABS: return "X86ISD::ABS";
+ case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
case X86ISD::FMAX: return "X86ISD::FMAX";
case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
case X86ISD::FMIN: return "X86ISD::FMIN";
@@ -18937,12 +20658,14 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VZEXT: return "X86ISD::VZEXT";
case X86ISD::VSEXT: return "X86ISD::VSEXT";
case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
- case X86ISD::VTRUNCM: return "X86ISD::VTRUNCM";
+ case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
+ case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
case X86ISD::VINSERT: return "X86ISD::VINSERT";
case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
case X86ISD::CVTDQ2PD: return "X86ISD::CVTDQ2PD";
case X86ISD::CVTUDQ2PD: return "X86ISD::CVTUDQ2PD";
+ case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
case X86ISD::VSHL: return "X86ISD::VSHL";
@@ -18978,6 +20701,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::TESTM: return "X86ISD::TESTM";
case X86ISD::TESTNM: return "X86ISD::TESTNM";
case X86ISD::KORTEST: return "X86ISD::KORTEST";
+ case X86ISD::KTEST: return "X86ISD::KTEST";
case X86ISD::PACKSS: return "X86ISD::PACKSS";
case X86ISD::PACKUS: return "X86ISD::PACKUS";
case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
@@ -19000,6 +20724,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
+ case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
@@ -19009,11 +20734,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
case X86ISD::VPERMI: return "X86ISD::VPERMI";
+ case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
case X86ISD::VRANGE: return "X86ISD::VRANGE";
case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
case X86ISD::PSADBW: return "X86ISD::PSADBW";
+ case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
@@ -19022,10 +20749,17 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::SFENCE: return "X86ISD::SFENCE";
case X86ISD::LFENCE: return "X86ISD::LFENCE";
case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
- case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL";
case X86ISD::SAHF: return "X86ISD::SAHF";
case X86ISD::RDRAND: return "X86ISD::RDRAND";
case X86ISD::RDSEED: return "X86ISD::RDSEED";
+ case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
+ case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
+ case X86ISD::VPROT: return "X86ISD::VPROT";
+ case X86ISD::VPROTI: return "X86ISD::VPROTI";
+ case X86ISD::VPSHA: return "X86ISD::VPSHA";
+ case X86ISD::VPSHL: return "X86ISD::VPSHL";
+ case X86ISD::VPCOM: return "X86ISD::VPCOM";
+ case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
case X86ISD::FMADD: return "X86ISD::FMADD";
case X86ISD::FMSUB: return "X86ISD::FMSUB";
case X86ISD::FNMADD: return "X86ISD::FNMADD";
@@ -19038,7 +20772,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
- case X86ISD::RNDSCALE: return "X86ISD::RNDSCALE";
+ case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
+ case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
+ case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
case X86ISD::XTEST: return "X86ISD::XTEST";
@@ -19064,6 +20800,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
case X86ISD::FP_TO_SINT_RND: return "X86ISD::FP_TO_SINT_RND";
case X86ISD::FP_TO_UINT_RND: return "X86ISD::FP_TO_UINT_RND";
+ case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
+ case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
}
return nullptr;
}
@@ -19218,7 +20956,7 @@ bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
bool
X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
- if (!(Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512()))
+ if (!Subtarget->hasAnyFMA())
return false;
VT = VT.getScalarType();
@@ -19253,11 +20991,11 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
return false;
// Not for i1 vectors
- if (VT.getScalarType() == MVT::i1)
+ if (VT.getSimpleVT().getScalarType() == MVT::i1)
return false;
// Very little shuffling can be done for 64-bit vectors right now.
- if (VT.getSizeInBits() == 64)
+ if (VT.getSimpleVT().getSizeInBits() == 64)
return false;
// We only care that the types being shuffled are legal. The lowering can
@@ -19282,8 +21020,7 @@ static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
DebugLoc DL = MI->getDebugLoc();
const BasicBlock *BB = MBB->getBasicBlock();
- MachineFunction::iterator I = MBB;
- ++I;
+ MachineFunction::iterator I = ++MBB->getIterator();
// For the v = xbegin(), we generate
//
@@ -19531,8 +21268,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineFunction::iterator MBBIter = MBB;
- ++MBBIter;
+ MachineFunction::iterator MBBIter = ++MBB->getIterator();
// Insert the new basic blocks
MF->insert(MBBIter, offsetMBB);
@@ -19702,8 +21438,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
// stores were performed.
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
MachineFunction *F = MBB->getParent();
- MachineFunction::iterator MBBIter = MBB;
- ++MBBIter;
+ MachineFunction::iterator MBBIter = ++MBB->getIterator();
MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(MBBIter, XMMSaveMBB);
@@ -19727,7 +21462,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
- if (!Subtarget->isTargetWin64()) {
+ if (!Subtarget->isCallingConvWin64(F->getFunction()->getCallingConv())) {
// If %al is 0, branch around the XMM save block.
BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
@@ -19744,9 +21479,8 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
// In the XMM save block, save all the XMM argument registers.
for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
- MachineMemOperand *MMO =
- F->getMachineMemOperand(
- MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
+ MachineMemOperand *MMO = F->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
MachineMemOperand::MOStore,
/*Size=*/16, /*Align=*/16);
BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
@@ -19800,6 +21534,39 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
return true;
}
+// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
+// together with other CMOV pseudo-opcodes into a single basic-block with
+// conditional jump around it.
+static bool isCMOVPseudo(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ case X86::CMOV_FR32:
+ case X86::CMOV_FR64:
+ case X86::CMOV_GR8:
+ case X86::CMOV_GR16:
+ case X86::CMOV_GR32:
+ case X86::CMOV_RFP32:
+ case X86::CMOV_RFP64:
+ case X86::CMOV_RFP80:
+ case X86::CMOV_V2F64:
+ case X86::CMOV_V2I64:
+ case X86::CMOV_V4F32:
+ case X86::CMOV_V4F64:
+ case X86::CMOV_V4I64:
+ case X86::CMOV_V16F32:
+ case X86::CMOV_V8F32:
+ case X86::CMOV_V8F64:
+ case X86::CMOV_V8I64:
+ case X86::CMOV_V8I1:
+ case X86::CMOV_V16I1:
+ case X86::CMOV_V32I1:
+ case X86::CMOV_V64I1:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
MachineBasicBlock *BB) const {
@@ -19811,8 +21578,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// destination vreg to set, the condition code register to branch on, the
// true/false values to select between, and a branch opcode to use.
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator It = BB;
- ++It;
+ MachineFunction::iterator It = ++BB->getIterator();
// thisMBB:
// ...
@@ -19823,8 +21589,41 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
MachineBasicBlock *thisMBB = BB;
MachineFunction *F = BB->getParent();
- // We also lower double CMOVs:
+ // This code lowers all pseudo-CMOV instructions. Generally it lowers these
+ // as described above, by inserting a BB, and then making a PHI at the join
+ // point to select the true and false operands of the CMOV in the PHI.
+ //
+ // The code also handles two different cases of multiple CMOV opcodes
+ // in a row.
+ //
+ // Case 1:
+ // In this case, there are multiple CMOVs in a row, all of which are based on
+ // the same condition setting (or the exact opposite condition setting).
+ // In this case we can lower all the CMOVs using a single inserted BB, and
+ // then make a number of PHIs at the join point to model the CMOVs. The only
+ // trickiness here is that in a case like:
+ //
+ // t2 = CMOV cond1 t1, f1
+ // t3 = CMOV cond1 t2, f2
+ //
+ // when rewriting this into PHIs, we have to perform some renaming on the
+ // temps since you cannot have a PHI operand refer to a PHI result earlier
+ // in the same block. The "simple" but wrong lowering would be:
+ //
+ // t2 = PHI t1(BB1), f1(BB2)
+ // t3 = PHI t2(BB1), f2(BB2)
+ //
+ // but clearly t2 is not defined in BB1, so that is incorrect. The proper
+ // renaming is to note that on the path through BB1, t2 is really just a
+ // copy of t1, and do that renaming, properly generating:
+ //
+ // t2 = PHI t1(BB1), f1(BB2)
+ // t3 = PHI t1(BB1), f2(BB2)
+ //
+ // Case 2, we lower cascaded CMOVs such as
+ //
// (CMOV (CMOV F, T, cc1), T, cc2)
+ //
// to two successives branches. For that, we look for another CMOV as the
// following instruction.
//
@@ -19890,19 +21689,42 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// .LBB5_4:
// retq
//
- MachineInstr *NextCMOV = nullptr;
+ MachineInstr *CascadedCMOV = nullptr;
+ MachineInstr *LastCMOV = MI;
+ X86::CondCode CC = X86::CondCode(MI->getOperand(3).getImm());
+ X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
MachineBasicBlock::iterator NextMIIt =
std::next(MachineBasicBlock::iterator(MI));
- if (NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() &&
+
+ // Check for case 1, where there are multiple CMOVs with the same condition
+ // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
+ // number of jumps the most.
+
+ if (isCMOVPseudo(MI)) {
+ // See if we have a string of CMOVS with the same condition.
+ while (NextMIIt != BB->end() &&
+ isCMOVPseudo(NextMIIt) &&
+ (NextMIIt->getOperand(3).getImm() == CC ||
+ NextMIIt->getOperand(3).getImm() == OppCC)) {
+ LastCMOV = &*NextMIIt;
+ ++NextMIIt;
+ }
+ }
+
+ // Check for case 2, but only if we didn't already find case 1, as
+ // indicated by LastCMOV == MI.
+ if (LastCMOV == MI &&
+ NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() &&
NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() &&
- NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg())
- NextCMOV = &*NextMIIt;
+ NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) {
+ CascadedCMOV = &*NextMIIt;
+ }
MachineBasicBlock *jcc1MBB = nullptr;
- // If we have a double CMOV, we lower it to two successive branches to
+ // If we have a cascaded CMOV, we lower it to two successive branches to
// the same block. EFLAGS is used by both, so mark it as live in the second.
- if (NextCMOV) {
+ if (CascadedCMOV) {
jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(It, jcc1MBB);
jcc1MBB->addLiveIn(X86::EFLAGS);
@@ -19917,7 +21739,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// live into the sink and copy blocks.
const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
- MachineInstr *LastEFLAGSUser = NextCMOV ? NextCMOV : MI;
+ MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
!checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
copy0MBB->addLiveIn(X86::EFLAGS);
@@ -19926,12 +21748,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), BB,
- std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
// Add the true and fallthrough blocks as its successors.
- if (NextCMOV) {
- // The fallthrough block may be jcc1MBB, if we have a double CMOV.
+ if (CascadedCMOV) {
+ // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
BB->addSuccessor(jcc1MBB);
// In that case, jcc1MBB will itself fallthrough the copy0MBB, and
@@ -19946,13 +21768,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
BB->addSuccessor(sinkMBB);
// Create the conditional branch instruction.
- unsigned Opc =
- X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
+ unsigned Opc = X86::GetCondBranchFromCond(CC);
BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
- if (NextCMOV) {
+ if (CascadedCMOV) {
unsigned Opc2 = X86::GetCondBranchFromCond(
- (X86::CondCode)NextCMOV->getOperand(3).getImm());
+ (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
}
@@ -19964,28 +21785,110 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// sinkMBB:
// %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
// ...
- MachineInstrBuilder MIB =
- BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI),
- MI->getOperand(0).getReg())
- .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
- .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+ MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
+ MachineBasicBlock::iterator MIItEnd =
+ std::next(MachineBasicBlock::iterator(LastCMOV));
+ MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
+ DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
+ MachineInstrBuilder MIB;
+
+ // As we are creating the PHIs, we have to be careful if there is more than
+ // one. Later CMOVs may reference the results of earlier CMOVs, but later
+ // PHIs have to reference the individual true/false inputs from earlier PHIs.
+ // That also means that PHI construction must work forward from earlier to
+ // later, and that the code must maintain a mapping from each earlier PHI's
+ // destination register to the registers that went into that PHI.
- // If we have a double CMOV, the second Jcc provides the same incoming
+ for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
+ unsigned DestReg = MIIt->getOperand(0).getReg();
+ unsigned Op1Reg = MIIt->getOperand(1).getReg();
+ unsigned Op2Reg = MIIt->getOperand(2).getReg();
+
+ // If this CMOV we are generating is the opposite condition from
+ // the jump we generated, then we have to swap the operands for the
+ // PHI that is going to be generated.
+ if (MIIt->getOperand(3).getImm() == OppCC)
+ std::swap(Op1Reg, Op2Reg);
+
+ if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
+ Op1Reg = RegRewriteTable[Op1Reg].first;
+
+ if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
+ Op2Reg = RegRewriteTable[Op2Reg].second;
+
+ MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
+ TII->get(X86::PHI), DestReg)
+ .addReg(Op1Reg).addMBB(copy0MBB)
+ .addReg(Op2Reg).addMBB(thisMBB);
+
+ // Add this PHI to the rewrite table.
+ RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
+ }
+
+ // If we have a cascaded CMOV, the second Jcc provides the same incoming
// value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
- if (NextCMOV) {
+ if (CascadedCMOV) {
MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB);
// Copy the PHI result to the register defined by the second CMOV.
BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
- DL, TII->get(TargetOpcode::COPY), NextCMOV->getOperand(0).getReg())
+ DL, TII->get(TargetOpcode::COPY),
+ CascadedCMOV->getOperand(0).getReg())
.addReg(MI->getOperand(0).getReg());
- NextCMOV->eraseFromParent();
+ CascadedCMOV->eraseFromParent();
}
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ // Now remove the CMOV(s).
+ for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
+ (MIIt++)->eraseFromParent();
+
return sinkMBB;
}
MachineBasicBlock *
+X86TargetLowering::EmitLoweredAtomicFP(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ // Combine the following atomic floating-point modification pattern:
+ // a.store(reg OP a.load(acquire), release)
+ // Transform them into:
+ // OPss (%gpr), %xmm
+ // movss %xmm, (%gpr)
+ // Or sd equivalent for 64-bit operations.
+ unsigned MOp, FOp;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
+ case X86::RELEASE_FADD32mr: MOp = X86::MOVSSmr; FOp = X86::ADDSSrm; break;
+ case X86::RELEASE_FADD64mr: MOp = X86::MOVSDmr; FOp = X86::ADDSDrm; break;
+ }
+ const X86InstrInfo *TII = Subtarget->getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ MachineOperand MSrc = MI->getOperand(0);
+ unsigned VSrc = MI->getOperand(5).getReg();
+ const MachineOperand &Disp = MI->getOperand(3);
+ MachineOperand ZeroDisp = MachineOperand::CreateImm(0);
+ bool hasDisp = Disp.isGlobal() || Disp.isImm();
+ if (hasDisp && MSrc.isReg())
+ MSrc.setIsKill(false);
+ MachineInstrBuilder MIM = BuildMI(*BB, MI, DL, TII->get(MOp))
+ .addOperand(/*Base=*/MSrc)
+ .addImm(/*Scale=*/1)
+ .addReg(/*Index=*/0)
+ .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0)
+ .addReg(0);
+ MachineInstr *MIO = BuildMI(*BB, (MachineInstr *)MIM, DL, TII->get(FOp),
+ MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
+ .addReg(VSrc)
+ .addOperand(/*Base=*/MSrc)
+ .addImm(/*Scale=*/1)
+ .addReg(/*Index=*/0)
+ .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0)
+ .addReg(/*Segment=*/0);
+ MIM.addReg(MIO->getOperand(0).getReg(), RegState::Kill);
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
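EmitLoweredAtomicFP handles the pseudo produced for the pattern spelled out in its comment, i.e. an atomic float load/modify/store to the same location. A hypothetical source-level shape of that pattern (whether a given frontend and pass pipeline actually produces the RELEASE_FADD pseudo is not guaranteed):

#include <atomic>

std::atomic<float> Accum;

void addSample(float X) {
  // a.store(reg OP a.load(acquire), release): the shape that can be selected
  // into "addss (mem), %xmm" followed by "movss %xmm, (mem)".
  Accum.store(Accum.load(std::memory_order_acquire) + X,
              std::memory_order_release);
}
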
+MachineBasicBlock *
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
@@ -20032,8 +21935,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
sizeVReg = MI->getOperand(1).getReg(),
physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;
- MachineFunction::iterator MBBIter = BB;
- ++MBBIter;
+ MachineFunction::iterator MBBIter = ++BB->getIterator();
MF->insert(MBBIter, bumpMBB);
MF->insert(MBBIter, mallocMBB);
@@ -20120,14 +22022,60 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
MachineBasicBlock *
X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
MachineBasicBlock *BB) const {
+ assert(!Subtarget->isTargetMachO());
DebugLoc DL = MI->getDebugLoc();
+ MachineInstr *ResumeMI = Subtarget->getFrameLowering()->emitStackProbe(
+ *BB->getParent(), *BB, MI, DL, false);
+ MachineBasicBlock *ResumeBB = ResumeMI->getParent();
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return ResumeBB;
+}
- assert(!Subtarget->isTargetMachO());
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredCatchRet(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+ MachineBasicBlock *TargetMBB = MI->getOperand(0).getMBB();
+ DebugLoc DL = MI->getDebugLoc();
- Subtarget->getFrameLowering()->emitStackProbeCall(*BB->getParent(), *BB, MI,
- DL);
+ assert(!isAsynchronousEHPersonality(
+ classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
+ "SEH does not use catchret!");
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ // Only 32-bit EH needs to worry about manually restoring stack pointers.
+ if (!Subtarget->is32Bit())
+ return BB;
+
+ // C++ EH creates a new target block to hold the restore code, and wires up
+ // the new block to the return destination with a normal JMP_4.
+ MachineBasicBlock *RestoreMBB =
+ MF->CreateMachineBasicBlock(BB->getBasicBlock());
+ assert(BB->succ_size() == 1);
+ MF->insert(std::next(BB->getIterator()), RestoreMBB);
+ RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
+ BB->addSuccessor(RestoreMBB);
+ MI->getOperand(0).setMBB(RestoreMBB);
+
+ auto RestoreMBBI = RestoreMBB->begin();
+ BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
+ BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
+ return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredCatchPad(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ const Constant *PerFn = MF->getFunction()->getPersonalityFn();
+ bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
+ // Only 32-bit SEH requires special handling for catchpad.
+ if (IsSEH && Subtarget->is32Bit()) {
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+ BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
+ }
+ MI->eraseFromParent();
return BB;
}
@@ -20149,6 +22097,8 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
// FIXME: The 32-bit calls have non-standard calling conventions. Use a
// proper register mask.
const uint32_t *RegMask =
+ Subtarget->is64Bit() ?
+ Subtarget->getRegisterInfo()->getDarwinTLSCallPreservedMask() :
Subtarget->getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
if (Subtarget->is64Bit()) {
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
@@ -20198,8 +22148,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
MachineRegisterInfo &MRI = MF->getRegInfo();
const BasicBlock *BB = MBB->getBasicBlock();
- MachineFunction::iterator I = MBB;
- ++I;
+ MachineFunction::iterator I = ++MBB->getIterator();
// Memory Reference
MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
@@ -20225,7 +22174,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
// For v = setjmp(buf), we generate
//
// thisMBB:
- // buf[LabelOffset] = restoreMBB
+ // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
// SjLjSetup restoreMBB
//
// mainMBB:
@@ -20245,6 +22194,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
MF->insert(I, mainMBB);
MF->insert(I, sinkMBB);
MF->push_back(restoreMBB);
+ restoreMBB->setHasAddressTaken();
MachineInstrBuilder MIB;
@@ -20511,35 +22461,44 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
return BB;
case X86::WIN_ALLOCA:
return EmitLoweredWinAlloca(MI, BB);
+ case X86::CATCHRET:
+ return EmitLoweredCatchRet(MI, BB);
+ case X86::CATCHPAD:
+ return EmitLoweredCatchPad(MI, BB);
case X86::SEG_ALLOCA_32:
case X86::SEG_ALLOCA_64:
return EmitLoweredSegAlloca(MI, BB);
case X86::TLSCall_32:
case X86::TLSCall_64:
return EmitLoweredTLSCall(MI, BB);
- case X86::CMOV_GR8:
case X86::CMOV_FR32:
case X86::CMOV_FR64:
- case X86::CMOV_V4F32:
+ case X86::CMOV_FR128:
+ case X86::CMOV_GR8:
+ case X86::CMOV_GR16:
+ case X86::CMOV_GR32:
+ case X86::CMOV_RFP32:
+ case X86::CMOV_RFP64:
+ case X86::CMOV_RFP80:
case X86::CMOV_V2F64:
case X86::CMOV_V2I64:
- case X86::CMOV_V8F32:
+ case X86::CMOV_V4F32:
case X86::CMOV_V4F64:
case X86::CMOV_V4I64:
case X86::CMOV_V16F32:
+ case X86::CMOV_V8F32:
case X86::CMOV_V8F64:
case X86::CMOV_V8I64:
- case X86::CMOV_GR16:
- case X86::CMOV_GR32:
- case X86::CMOV_RFP32:
- case X86::CMOV_RFP64:
- case X86::CMOV_RFP80:
case X86::CMOV_V8I1:
case X86::CMOV_V16I1:
case X86::CMOV_V32I1:
case X86::CMOV_V64I1:
return EmitLoweredSelect(MI, BB);
+ case X86::RELEASE_FADD32mr:
+ case X86::RELEASE_FADD64mr:
+ return EmitLoweredAtomicFP(MI, BB);
+
case X86::FP32_TO_INT16_IN_MEM:
case X86::FP32_TO_INT32_IN_MEM:
case X86::FP32_TO_INT64_IN_MEM:
@@ -20793,7 +22752,7 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
unsigned Depth) const {
// SETCC_CARRY sets the dest to ~0 for true or 0 for false.
if (Op.getOpcode() == X86ISD::SETCC_CARRY)
- return Op.getValueType().getScalarType().getSizeInBits();
+ return Op.getValueType().getScalarSizeInBits();
// Fallback case.
return 1;
@@ -20814,39 +22773,8 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
return TargetLowering::isGAPlusOffset(N, GA, Offset);
}
-/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
-/// same as extracting the high 128-bit part of 256-bit vector and then
-/// inserting the result into the low part of a new 256-bit vector
-static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
- EVT VT = SVOp->getValueType(0);
- unsigned NumElems = VT.getVectorNumElements();
-
- // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
- for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
- if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
- SVOp->getMaskElt(j) >= 0)
- return false;
-
- return true;
-}
-
-/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
-/// same as extracting the low 128-bit part of 256-bit vector and then
-/// inserting the result into the high part of a new 256-bit vector
-static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
- EVT VT = SVOp->getValueType(0);
- unsigned NumElems = VT.getVectorNumElements();
-
- // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
- for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
- if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
- SVOp->getMaskElt(j) >= 0)
- return false;
-
- return true;
-}
-
/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
+/// FIXME: This could be expanded to support 512 bit vectors as well.
static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget* Subtarget) {
@@ -20854,7 +22782,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
- EVT VT = SVOp->getValueType(0);
+ MVT VT = SVOp->getSimpleValueType(0);
unsigned NumElems = VT.getVectorNumElements();
if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
@@ -20920,24 +22848,6 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
return DCI.CombineTo(N, InsV);
}
- //===--------------------------------------------------------------------===//
- // Combine some shuffles into subvector extracts and inserts:
- //
-
- // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
- if (isShuffleHigh128VectorInsertLow(SVOp)) {
- SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
- SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
- return DCI.CombineTo(N, InsV);
- }
-
- // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
- if (isShuffleLow128VectorInsertHigh(SVOp)) {
- SDValue V = Extract128BitVector(V1, 0, DAG, dl);
- SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
- return DCI.CombineTo(N, InsV);
- }
-
return SDValue();
}
@@ -20966,10 +22876,22 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
MVT RootVT = Root.getSimpleValueType();
SDLoc DL(Root);
- // Just remove no-op shuffle masks.
if (Mask.size() == 1) {
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
- /*AddTo*/ true);
+ int Index = Mask[0];
+ assert((Index >= 0 || Index == SM_SentinelUndef ||
+ Index == SM_SentinelZero) &&
+ "Invalid shuffle index found!");
+
+ // We may end up with an accumulated mask of size 1 as a result of
+ // widening of shuffle operands (see function canWidenShuffleElements).
+ // If the only shuffle index is equal to SM_SentinelZero then propagate
+ // a zero vector. Otherwise, the combined shuffle mask is a no-op shuffle
+ // mask, and therefore the entire chain of shuffles can be folded away.
+ if (Index == SM_SentinelZero)
+ DCI.CombineTo(Root.getNode(), getZeroVector(RootVT, Subtarget, DAG, DL));
+ else
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
+ /*AddTo*/ true);
return true;
}
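
The size-1 mask mentioned in the comment above comes from repeatedly pairing adjacent shuffle lanes. The standalone sketch below is a loose, illustrative approximation of that pairwise widening (plain C++, not the actual canWidenShuffleElements logic; -1 and -2 stand in for the undef and zero sentinels), showing how an identity mask collapses to <0> while an all-zero mask collapses to the zero sentinel, which is exactly the distinction the branch above makes.

#include <cassert>
#include <cstddef>
#include <vector>

// Illustrative only: pairwise widening of a shuffle mask.
// -1 stands in for SM_SentinelUndef, -2 for SM_SentinelZero.
static bool widenMask(const std::vector<int> &Mask, std::vector<int> &Wide) {
  Wide.clear();
  for (std::size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int Lo = Mask[i], Hi = Mask[i + 1];
    if (Lo == -1 && Hi == -1)
      Wide.push_back(-1);                 // both halves undef
    else if ((Lo == -2 || Lo == -1) && (Hi == -2 || Hi == -1))
      Wide.push_back(-2);                 // zeroable pair
    else if (Lo >= 0 && Lo % 2 == 0 && (Hi == Lo + 1 || Hi == -1))
      Wide.push_back(Lo / 2);             // consecutive pair -> one wide lane
    else
      return false;                       // cannot widen this pair
  }
  return true;
}

int main() {
  // <0,1,2,3> widens to <0,1>, then to <0>: a no-op shuffle of size 1.
  std::vector<int> M = {0, 1, 2, 3}, W1, W2;
  assert(widenMask(M, W1) && widenMask(W1, W2) && W2.size() == 1 && W2[0] == 0);
  // An all-zero mask collapses to the single zero sentinel instead.
  std::vector<int> Z = {-2, -2, -2, -2}, Z1, Z2;
  assert(widenMask(Z, Z1) && widenMask(Z1, Z2) && Z2[0] == -2);
  return 0;
}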
@@ -20985,7 +22907,7 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
// doesn't preclude something switching to the shorter encoding post-RA.
//
// FIXME: Should teach these routines about AVX vector widths.
- if (FloatDomain && VT.getSizeInBits() == 128) {
+ if (FloatDomain && VT.is128BitVector()) {
if (Mask.equals({0, 0}) || Mask.equals({1, 1})) {
bool Lo = Mask.equals({0, 0});
unsigned Shuffle;
@@ -21049,7 +22971,7 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
// We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
// variants as none of these have single-instruction variants that are
// superior to the UNPCK formulation.
- if (!FloatDomain && VT.getSizeInBits() == 128 &&
+ if (!FloatDomain && VT.is128BitVector() &&
(Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) ||
Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) ||
@@ -21226,26 +23148,28 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
// See if we can recurse into the operand to combine more things.
switch (Op.getOpcode()) {
- case X86ISD::PSHUFB:
- HasPSHUFB = true;
- case X86ISD::PSHUFD:
- case X86ISD::PSHUFHW:
- case X86ISD::PSHUFLW:
- if (Op.getOperand(0).hasOneUse() &&
- combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
- HasPSHUFB, DAG, DCI, Subtarget))
- return true;
- break;
+ case X86ISD::PSHUFB:
+ HasPSHUFB = true;
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFHW:
+ case X86ISD::PSHUFLW:
+ if (Op.getOperand(0).hasOneUse() &&
+ combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
+ HasPSHUFB, DAG, DCI, Subtarget))
+ return true;
+ break;
- case X86ISD::UNPCKL:
- case X86ISD::UNPCKH:
- assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!");
- // We can't check for single use, we have to check that this shuffle is the only user.
- if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
- combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
- HasPSHUFB, DAG, DCI, Subtarget))
- return true;
- break;
+ case X86ISD::UNPCKL:
+ case X86ISD::UNPCKH:
+ assert(Op.getOperand(0) == Op.getOperand(1) &&
+ "We only combine unary shuffles!");
+ // We can't check for single use, we have to check that this shuffle is the
+ // only user.
+ if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
+ combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
+ HasPSHUFB, DAG, DCI, Subtarget))
+ return true;
+ break;
}
// Minor canonicalization of the accumulated shuffle mask to make it easier
@@ -21360,8 +23284,8 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
case X86ISD::UNPCKH:
// For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
// shuffle into a preceding word shuffle.
- if (V.getSimpleValueType().getScalarType() != MVT::i8 &&
- V.getSimpleValueType().getScalarType() != MVT::i16)
+ if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
+ V.getSimpleValueType().getVectorElementType() != MVT::i16)
return SDValue();
// Search for a half-shuffle which we can combine with.
@@ -21438,7 +23362,8 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
return V;
}
-/// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
+/// \brief Search for a combinable shuffle across a chain ending in pshuflw or
+/// pshufhw.
///
/// We walk up the chain, skipping shuffles of the other half and looking
/// through shuffles which switch halves trying to find a shuffle of the same
@@ -21520,6 +23445,41 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
Mask = getPSHUFShuffleMask(N);
assert(Mask.size() == 4);
break;
+ case X86ISD::UNPCKL: {
+    // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
+    // which X86ISD::UNPCKL has an ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
+    // moves the upper-half elements into the lower half. For example:
+ //
+ // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
+ // undef:v16i8
+ // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
+ //
+ // will be combined to:
+ //
+ // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
+
+    // This is done only for 128-bit vectors. From SSE4.1 onward this pattern
+    // may not arise because more capable instructions are selected earlier.
+ if (!VT.is128BitVector())
+ return SDValue();
+
+ auto Op0 = N.getOperand(0);
+ auto Op1 = N.getOperand(1);
+ if (Op0.getOpcode() == ISD::UNDEF &&
+ Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) {
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ SmallVector<int, 8> ExpectedMask(NumElts, -1);
+ std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
+ NumElts / 2);
+
+ auto ShufOp = Op1.getOperand(0);
+ if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
+ return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
+ }
+ return SDValue();
+ }
default:
return SDValue();
}
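
The new UNPCKL case above recognizes the shuffle by comparing its mask against one built with std::iota. The following is a self-contained sketch of that expected-mask construction and comparison for the v16i8 example in the comment (plain C++, with std::vector standing in for the DAG mask and -1 playing the role of undef; it is not LLVM API):

#include <algorithm>
#include <cassert>
#include <numeric>
#include <vector>

int main() {
  // For a v16i8 shuffle that moves the high half down, the expected mask is
  // <8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u>.
  const unsigned NumElts = 16;
  std::vector<int> ExpectedMask(NumElts, -1);
  std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
            NumElts / 2);

  // The mask of t2 from the example in the comment above.
  std::vector<int> ShuffleMask = {8, 9, 10, 11, 12, 13, 14, 15,
                                  -1, -1, -1, -1, -1, -1, -1, -1};

  // Treating -1 as "matches anything", the masks are equivalent, so the
  // UNPCKL of undef and t2 can be rewritten as an UNPCKH of undef and t1.
  bool Equivalent = std::equal(ShuffleMask.begin(), ShuffleMask.end(),
                               ExpectedMask.begin(), [](int A, int B) {
                                 return A == B || A == -1 || B == -1;
                               });
  assert(Equivalent);
  return 0;
}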
@@ -21535,7 +23495,7 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
break;
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
- assert(VT.getScalarType() == MVT::i16 && "Bad word shuffle type!");
+ assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
return SDValue(); // We combined away this shuffle, so we're done.
@@ -21624,14 +23584,19 @@ static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
return SDValue();
auto *SVN = cast<ShuffleVectorSDNode>(N);
- ArrayRef<int> Mask = SVN->getMask();
+ SmallVector<int, 8> Mask;
+ for (int M : SVN->getMask())
+ Mask.push_back(M);
+
SDValue V1 = N->getOperand(0);
SDValue V2 = N->getOperand(1);
- // We require the first shuffle operand to be the SUB node, and the second to
- // be the ADD node.
- // FIXME: We should support the commuted patterns.
- if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD)
+  // We require the first shuffle operand to be the FSUB node and the second to
+  // be the FADD node; if they arrive commuted, swap the operands and the mask.
+ if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
+ ShuffleVectorSDNode::commuteMask(Mask);
+ std::swap(V1, V2);
+ } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
return SDValue();
// If there are other uses of these operations we can't fold them.
@@ -21682,7 +23647,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
return AddSub;
// Combine 256-bit vector shuffles. This is only profitable when in AVX mode
- if (Subtarget->hasFp256() && VT.is256BitVector() &&
+ if (TLI.isTypeLegal(VT) && Subtarget->hasFp256() && VT.is256BitVector() &&
N->getOpcode() == ISD::VECTOR_SHUFFLE)
return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
@@ -21866,21 +23831,45 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
EltNo);
}
-/// \brief Detect bitcasts between i32 to x86mmx low word. Since MMX types are
-/// special and don't usually play with other vector types, it's better to
-/// handle them early to be sure we emit efficient code by avoiding
-/// store-load conversions.
-static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) {
- if (N->getValueType(0) != MVT::x86mmx ||
- N->getOperand(0)->getOpcode() != ISD::BUILD_VECTOR ||
- N->getOperand(0)->getValueType(0) != MVT::v2i32)
- return SDValue();
+static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
- SDValue V = N->getOperand(0);
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1));
- if (C && C->getZExtValue() == 0 && V.getOperand(0).getValueType() == MVT::i32)
- return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(V.getOperand(0)),
- N->getValueType(0), V.getOperand(0));
+ // Detect bitcasts between i32 to x86mmx low word. Since MMX types are
+ // special and don't usually play with other vector types, it's better to
+ // handle them early to be sure we emit efficient code by avoiding
+ // store-load conversions.
+ if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
+ N0.getValueType() == MVT::v2i32 &&
+ isNullConstant(N0.getOperand(1))) {
+ SDValue N00 = N0->getOperand(0);
+ if (N00.getValueType() == MVT::i32)
+ return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
+ }
+
+ // Convert a bitcasted integer logic operation that has one bitcasted
+ // floating-point operand and one constant operand into a floating-point
+ // logic operation. This may create a load of the constant, but that is
+ // cheaper than materializing the constant in an integer register and
+ // transferring it to an SSE register or transferring the SSE operand to
+ // integer register and back.
+ unsigned FPOpcode;
+ switch (N0.getOpcode()) {
+ case ISD::AND: FPOpcode = X86ISD::FAND; break;
+ case ISD::OR: FPOpcode = X86ISD::FOR; break;
+ case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
+ default: return SDValue();
+ }
+ if (((Subtarget->hasSSE1() && VT == MVT::f32) ||
+ (Subtarget->hasSSE2() && VT == MVT::f64)) &&
+ isa<ConstantSDNode>(N0.getOperand(1)) &&
+ N0.getOperand(0).getOpcode() == ISD::BITCAST &&
+ N0.getOperand(0).getOperand(0).getValueType() == VT) {
+ SDValue N000 = N0.getOperand(0).getOperand(0);
+ SDValue FPConst = DAG.getBitcast(VT, N0.getOperand(1));
+ return DAG.getNode(FPOpcode, SDLoc(N0), VT, N000, FPConst);
+ }
return SDValue();
}
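
The second half of the rewritten combine above rests on a bit-level identity: an integer AND/OR/XOR commutes with a bitcast to an equally wide floating-point type, so the constant can be bitcast once and the logic performed as FAND/FOR/FXOR. A scalar sketch of that identity (illustrative C++ only, with memcpy standing in for ISD::BITCAST):

#include <cassert>
#include <cstdint>
#include <cstring>

// Reinterpret the bits of a value as another equally sized type.
template <typename To, typename From> static To bitCast(From V) {
  static_assert(sizeof(To) == sizeof(From), "size mismatch");
  To R;
  std::memcpy(&R, &V, sizeof(To));
  return R;
}

int main() {
  float F = -12.5f;
  uint32_t SignMask = 0x7FFFFFFF; // clears the sign bit, i.e. fabs

  // Integer-domain version: bitcast to i32, AND, bitcast back to f32.
  float IntDomain = bitCast<float>(bitCast<uint32_t>(F) & SignMask);

  // FP-domain version: bitcast the constant to f32 once; an FAND-style
  // operation just ANDs the raw bit patterns of its f32 inputs.
  float FPConst = bitCast<float>(SignMask);
  float FPDomain =
      bitCast<float>(bitCast<uint32_t>(F) & bitCast<uint32_t>(FPConst));

  assert(IntDomain == FPDomain && IntDomain == 12.5f);
  return 0;
}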
@@ -21910,26 +23899,26 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
InputVector.getNode()->getOperand(0));
// The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
- SDValue MMXSrcOp = MMXSrc.getOperand(0);
if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
- MMXSrc.getValueType() == MVT::i64 && MMXSrcOp.hasOneUse() &&
- MMXSrcOp.getOpcode() == ISD::BITCAST &&
- MMXSrcOp.getValueType() == MVT::v1i64 &&
- MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
- return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
- N->getValueType(0),
- MMXSrcOp.getOperand(0));
+ MMXSrc.getValueType() == MVT::i64) {
+ SDValue MMXSrcOp = MMXSrc.getOperand(0);
+ if (MMXSrcOp.hasOneUse() && MMXSrcOp.getOpcode() == ISD::BITCAST &&
+ MMXSrcOp.getValueType() == MVT::v1i64 &&
+ MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
+ return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
+ N->getValueType(0), MMXSrcOp.getOperand(0));
+ }
}
EVT VT = N->getValueType(0);
- if (VT == MVT::i1 && dyn_cast<ConstantSDNode>(N->getOperand(1)) &&
+ if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) &&
InputVector.getOpcode() == ISD::BITCAST &&
- dyn_cast<ConstantSDNode>(InputVector.getOperand(0))) {
+ isa<ConstantSDNode>(InputVector.getOperand(0))) {
uint64_t ExtractedElt =
- cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
uint64_t InputValue =
- cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
+ cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
uint64_t Res = (InputValue >> ExtractedElt) & 1;
return DAG.getConstant(Res, dl, MVT::i1);
}
@@ -22036,96 +24025,6 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match.
-static std::pair<unsigned, bool>
-matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
- SelectionDAG &DAG, const X86Subtarget *Subtarget) {
- if (!VT.isVector())
- return std::make_pair(0, false);
-
- bool NeedSplit = false;
- switch (VT.getSimpleVT().SimpleTy) {
- default: return std::make_pair(0, false);
- case MVT::v4i64:
- case MVT::v2i64:
- if (!Subtarget->hasVLX())
- return std::make_pair(0, false);
- break;
- case MVT::v64i8:
- case MVT::v32i16:
- if (!Subtarget->hasBWI())
- return std::make_pair(0, false);
- break;
- case MVT::v16i32:
- case MVT::v8i64:
- if (!Subtarget->hasAVX512())
- return std::make_pair(0, false);
- break;
- case MVT::v32i8:
- case MVT::v16i16:
- case MVT::v8i32:
- if (!Subtarget->hasAVX2())
- NeedSplit = true;
- if (!Subtarget->hasAVX())
- return std::make_pair(0, false);
- break;
- case MVT::v16i8:
- case MVT::v8i16:
- case MVT::v4i32:
- if (!Subtarget->hasSSE2())
- return std::make_pair(0, false);
- }
-
- // SSE2 has only a small subset of the operations.
- bool hasUnsigned = Subtarget->hasSSE41() ||
- (Subtarget->hasSSE2() && VT == MVT::v16i8);
- bool hasSigned = Subtarget->hasSSE41() ||
- (Subtarget->hasSSE2() && VT == MVT::v8i16);
-
- ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
-
- unsigned Opc = 0;
- // Check for x CC y ? x : y.
- if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
- DAG.isEqualTo(RHS, Cond.getOperand(1))) {
- switch (CC) {
- default: break;
- case ISD::SETULT:
- case ISD::SETULE:
- Opc = hasUnsigned ? ISD::UMIN : 0; break;
- case ISD::SETUGT:
- case ISD::SETUGE:
- Opc = hasUnsigned ? ISD::UMAX : 0; break;
- case ISD::SETLT:
- case ISD::SETLE:
- Opc = hasSigned ? ISD::SMIN : 0; break;
- case ISD::SETGT:
- case ISD::SETGE:
- Opc = hasSigned ? ISD::SMAX : 0; break;
- }
- // Check for x CC y ? y : x -- a min/max with reversed arms.
- } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
- DAG.isEqualTo(RHS, Cond.getOperand(0))) {
- switch (CC) {
- default: break;
- case ISD::SETULT:
- case ISD::SETULE:
- Opc = hasUnsigned ? ISD::UMAX : 0; break;
- case ISD::SETUGT:
- case ISD::SETUGE:
- Opc = hasUnsigned ? ISD::UMIN : 0; break;
- case ISD::SETLT:
- case ISD::SETLE:
- Opc = hasSigned ? ISD::SMAX : 0; break;
- case ISD::SETGT:
- case ISD::SETGE:
- Opc = hasSigned ? ISD::SMIN : 0; break;
- }
- }
-
- return std::make_pair(Opc, NeedSplit);
-}
-
static SDValue
transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
@@ -22189,7 +24088,8 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// ignored in unsafe-math mode).
// We also try to create v2f32 min/max nodes, which we later widen to v4f32.
if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
- VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
+ VT != MVT::f80 && VT != MVT::f128 &&
+ (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
(Subtarget->hasSSE2() ||
(Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
@@ -22535,32 +24435,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
}
}
- // Try to match a min/max vector operation.
- if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
- std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
- unsigned Opc = ret.first;
- bool NeedSplit = ret.second;
-
- if (Opc && NeedSplit) {
- unsigned NumElems = VT.getVectorNumElements();
- // Extract the LHS vectors
- SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
- SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
-
- // Extract the RHS vectors
- SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
- SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
-
- // Create min/max for each subvector
- LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
- RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
-
- // Merge the result
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
- } else if (Opc)
- return DAG.getNode(Opc, DL, VT, LHS, RHS);
- }
-
// Simplify vector selection if condition value type matches vselect
// operand type
if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
@@ -22635,7 +24509,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
!DCI.isBeforeLegalize() &&
!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
- unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
+ unsigned BitWidth = Cond.getValueType().getScalarSizeInBits();
// Don't optimize vector selects that map to mask-registers.
if (BitWidth == 1)
@@ -22656,14 +24530,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// FIXME: We don't support i16-element blends currently. We could and
// should support them by making *all* the bits in the condition be set
// rather than just the high bit and using an i8-element blend.
- if (VT.getScalarType() == MVT::i16)
+ if (VT.getVectorElementType() == MVT::i16)
return SDValue();
// Dynamic blending was only available from SSE4.1 onward.
- if (VT.getSizeInBits() == 128 && !Subtarget->hasSSE41())
+ if (VT.is128BitVector() && !Subtarget->hasSSE41())
return SDValue();
// Byte blends are only available in AVX2
- if (VT.getSizeInBits() == 256 && VT.getScalarType() == MVT::i8 &&
- !Subtarget->hasAVX2())
+ if (VT == MVT::v32i8 && !Subtarget->hasAVX2())
return SDValue();
assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
@@ -22773,12 +24646,9 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
SetCC.getOpcode() == ISD::AND) {
if (SetCC.getOpcode() == ISD::AND) {
int OpIdx = -1;
- ConstantSDNode *CS;
- if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
- CS->getZExtValue() == 1)
+ if (isOneConstant(SetCC.getOperand(0)))
OpIdx = 1;
- if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
- CS->getZExtValue() == 1)
+ if (isOneConstant(SetCC.getOperand(1)))
OpIdx = 0;
if (OpIdx == -1)
break;
@@ -22857,8 +24727,7 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
X86::CondCode &CC1, SDValue &Flags,
bool &isAnd) {
if (Cond->getOpcode() == X86ISD::CMP) {
- ConstantSDNode *CondOp1C = dyn_cast<ConstantSDNode>(Cond->getOperand(1));
- if (!CondOp1C || !CondOp1C->isNullValue())
+ if (!isNullConstant(Cond->getOperand(1)))
return false;
Cond = Cond->getOperand(0);
@@ -23102,106 +24971,15 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
- unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
- switch (IntNo) {
- default: return SDValue();
- // SSE/AVX/AVX2 blend intrinsics.
- case Intrinsic::x86_avx2_pblendvb:
- // Don't try to simplify this intrinsic if we don't have AVX2.
- if (!Subtarget->hasAVX2())
- return SDValue();
- // FALL-THROUGH
- case Intrinsic::x86_avx_blendv_pd_256:
- case Intrinsic::x86_avx_blendv_ps_256:
- // Don't try to simplify this intrinsic if we don't have AVX.
- if (!Subtarget->hasAVX())
- return SDValue();
- // FALL-THROUGH
- case Intrinsic::x86_sse41_blendvps:
- case Intrinsic::x86_sse41_blendvpd:
- case Intrinsic::x86_sse41_pblendvb: {
- SDValue Op0 = N->getOperand(1);
- SDValue Op1 = N->getOperand(2);
- SDValue Mask = N->getOperand(3);
-
- // Don't try to simplify this intrinsic if we don't have SSE4.1.
- if (!Subtarget->hasSSE41())
- return SDValue();
-
- // fold (blend A, A, Mask) -> A
- if (Op0 == Op1)
- return Op0;
- // fold (blend A, B, allZeros) -> A
- if (ISD::isBuildVectorAllZeros(Mask.getNode()))
- return Op0;
- // fold (blend A, B, allOnes) -> B
- if (ISD::isBuildVectorAllOnes(Mask.getNode()))
- return Op1;
-
- // Simplify the case where the mask is a constant i32 value.
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
- if (C->isNullValue())
- return Op0;
- if (C->isAllOnesValue())
- return Op1;
- }
-
- return SDValue();
- }
-
- // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
- case Intrinsic::x86_sse2_psrai_w:
- case Intrinsic::x86_sse2_psrai_d:
- case Intrinsic::x86_avx2_psrai_w:
- case Intrinsic::x86_avx2_psrai_d:
- case Intrinsic::x86_sse2_psra_w:
- case Intrinsic::x86_sse2_psra_d:
- case Intrinsic::x86_avx2_psra_w:
- case Intrinsic::x86_avx2_psra_d: {
- SDValue Op0 = N->getOperand(1);
- SDValue Op1 = N->getOperand(2);
- EVT VT = Op0.getValueType();
- assert(VT.isVector() && "Expected a vector type!");
-
- if (isa<BuildVectorSDNode>(Op1))
- Op1 = Op1.getOperand(0);
-
- if (!isa<ConstantSDNode>(Op1))
- return SDValue();
-
- EVT SVT = VT.getVectorElementType();
- unsigned SVTBits = SVT.getSizeInBits();
-
- ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
- const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue());
- uint64_t ShAmt = C.getZExtValue();
-
- // Don't try to convert this shift into a ISD::SRA if the shift
- // count is bigger than or equal to the element size.
- if (ShAmt >= SVTBits)
- return SDValue();
-
- // Trivial case: if the shift count is zero, then fold this
- // into the first operand.
- if (ShAmt == 0)
- return Op0;
-
- // Replace this packed shift intrinsic with a target independent
- // shift dag node.
- SDLoc DL(N);
- SDValue Splat = DAG.getConstant(C, DL, VT);
- return DAG.getNode(ISD::SRA, DL, VT, Op0, Splat);
- }
- }
-}
-
/// PerformMulCombine - Optimize a single multiply with constant into two
/// in order to implement it with two cheaper instructions, e.g.
/// LEA + SHL, LEA + LEA.
static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
+ // An imul is usually smaller than the alternative sequence.
+ if (DAG.getMachineFunction().getFunction()->optForMinSize())
+ return SDValue();
+
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
return SDValue();
@@ -23228,9 +25006,11 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
MulAmt1 = 3;
MulAmt2 = MulAmt / 3;
}
+
+ SDLoc DL(N);
+ SDValue NewMul;
if (MulAmt2 &&
(isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
- SDLoc DL(N);
if (isPowerOf2_64(MulAmt2) &&
!(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
@@ -23239,7 +25019,6 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
// is an add.
std::swap(MulAmt1, MulAmt2);
- SDValue NewMul;
if (isPowerOf2_64(MulAmt1))
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
@@ -23253,10 +25032,31 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
else
NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
DAG.getConstant(MulAmt2, DL, VT));
+ }
+
+ if (!NewMul) {
+ assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX)
+ && "Both cases that could cause potential overflows should have "
+ "already been handled.");
+ if (isPowerOf2_64(MulAmt - 1))
+ // (mul x, 2^N + 1) => (add (shl x, N), x)
+ NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(MulAmt - 1), DL,
+ MVT::i8)));
+ else if (isPowerOf2_64(MulAmt + 1))
+ // (mul x, 2^N - 1) => (sub (shl x, N), x)
+ NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT,
+ N->getOperand(0),
+ DAG.getConstant(Log2_64(MulAmt + 1),
+ DL, MVT::i8)), N->getOperand(0));
+ }
+
+ if (NewMul)
// Do not add new nodes to DAG combiner worklist.
DCI.CombineTo(N, NewMul, false);
- }
+
return SDValue();
}
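
The additions to PerformMulCombine above extend the existing 3/5/9 factorings with the 2^N +/- 1 forms. The arithmetic identities behind both families are easy to spot-check in isolation (illustrative C++, nothing X86-specific):

#include <cassert>
#include <cstdint>

int main() {
  for (uint64_t X : {0ULL, 1ULL, 7ULL, 123456789ULL}) {
    // MulAmt = 3, 5, 9: expressible as a single LEA-style x + (x << k).
    assert(X * 3 == X + (X << 1));
    assert(X * 5 == X + (X << 2));
    assert(X * 9 == X + (X << 3));
    // MulAmt = 2^N + 1: (mul x, 2^N + 1) => (add (shl x, N), x)
    assert(X * 17 == (X << 4) + X);
    // MulAmt = 2^N - 1: (mul x, 2^N - 1) => (sub (shl x, N), x)
    assert(X * 31 == (X << 5) - X);
    // Composite: 45 = 5 * 9, the two-step case handled above.
    assert(X * 45 == (X + (X << 2)) * 9);
  }
  return 0;
}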
@@ -23272,18 +25072,34 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
N1C && N0.getOpcode() == ISD::AND &&
N0.getOperand(1).getOpcode() == ISD::Constant) {
SDValue N00 = N0.getOperand(0);
- if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
- ((N00.getOpcode() == ISD::ANY_EXTEND ||
- N00.getOpcode() == ISD::ZERO_EXTEND) &&
- N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
- APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
- APInt ShAmt = N1C->getAPIntValue();
- Mask = Mask.shl(ShAmt);
- if (Mask != 0) {
- SDLoc DL(N);
- return DAG.getNode(ISD::AND, DL, VT,
- N00, DAG.getConstant(Mask, DL, VT));
- }
+ APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+ APInt ShAmt = N1C->getAPIntValue();
+ Mask = Mask.shl(ShAmt);
+ bool MaskOK = false;
+ // We can handle cases concerning bit-widening nodes containing setcc_c if
+ // we carefully interrogate the mask to make sure we are semantics
+ // preserving.
+ // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
+ // of the underlying setcc_c operation if the setcc_c was zero extended.
+ // Consider the following example:
+ // zext(setcc_c) -> i32 0x0000FFFF
+ // c1 -> i32 0x0000FFFF
+ // c2 -> i32 0x00000001
+ // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
+ // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
+ if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
+ MaskOK = true;
+ } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
+ N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
+ MaskOK = true;
+ } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
+ N00.getOpcode() == ISD::ANY_EXTEND) &&
+ N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
+ MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
+ }
+ if (MaskOK && Mask != 0) {
+ SDLoc DL(N);
+ return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
}
}
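
The hazard described in the widened comment above can be reproduced with plain integers: shifting the mask first can lose the bit that the original shl would have carried past the zero-extended setcc_c width. A small standalone check (illustrative C++):

#include <cassert>
#include <cstdint>

int main() {
  // A 16-bit "setcc_c" result that is all ones, then zero-extended to 32 bits.
  uint32_t SetCC = 0x0000FFFF;
  uint32_t C1 = 0x0000FFFF;
  uint32_t C2 = 1;

  uint32_t Original = (SetCC & C1) << C2; // 0x0001FFFE
  uint32_t Folded   = SetCC & (C1 << C2); // 0x0000FFFE

  // Shifting the mask first loses the bit the shl would have carried out of
  // the low 16 bits, so the naive fold is not semantics-preserving here.
  assert(Original == 0x0001FFFEu && Folded == 0x0000FFFEu && Original != Folded);

  // With a shifted mask that still fits in the low 16 bits (the
  // Mask.isIntN(...) check in the patch), the fold is fine.
  uint32_t SafeC1 = 0x00003FFF;
  assert(((SetCC & SafeC1) << C2) == (SetCC & (SafeC1 << C2)));
  return 0;
}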
@@ -23304,6 +25120,59 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+static SDValue PerformSRACombine(SDNode *N, SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ unsigned Size = VT.getSizeInBits();
+
+  // fold (ashr (shl a, [56,48,32,24,16]), SarConst)
+  // into (shl (sext a), [56,48,32,24,16] - SarConst) or
+  // into (sra (sext a), SarConst - [56,48,32,24,16])
+  // depending on the sign of (SarConst - [56,48,32,24,16]).
+
+  // sexts on X86 are MOVs (movsx/movsxd). The MOVs have the same code size
+  // as the SHIFTs above (only a shift by 1 has a smaller encoding).
+  // However, a MOV has two advantages over a SHIFT:
+  // 1. a MOV can write to a register that differs from its source
+  // 2. a MOV accepts memory operands
+
+ if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
+ N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
+ N0.getOperand(1).getOpcode() != ISD::Constant)
+ return SDValue();
+
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+ APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
+ APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
+ EVT CVT = N1.getValueType();
+
+ if (SarConst.isNegative())
+ return SDValue();
+
+ for (MVT SVT : MVT::integer_valuetypes()) {
+ unsigned ShiftSize = SVT.getSizeInBits();
+    // Skip types without a corresponding sext/zext and ShlConst values that
+    // are not one of [56,48,32,24,16].
+ if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
+ continue;
+ SDLoc DL(N);
+ SDValue NN =
+ DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
+ SarConst = SarConst - (Size - ShiftSize);
+ if (SarConst == 0)
+ return NN;
+ else if (SarConst.isNegative())
+ return DAG.getNode(ISD::SHL, DL, VT, NN,
+ DAG.getConstant(-SarConst, DL, CVT));
+ else
+ return DAG.getNode(ISD::SRA, DL, VT, NN,
+ DAG.getConstant(SarConst, DL, CVT));
+ }
+ return SDValue();
+}
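
The scalar identity behind PerformSRACombine can be verified directly: shifting a value to the top byte and arithmetically shifting it back is sign-extension of the low byte, optionally followed by one residual shift. The check below is illustrative C++ only and assumes the usual two's-complement, arithmetic-right-shift behavior of the host:

#include <cassert>
#include <cstdint>

// (ashr (shl x, Shl), Sar) with the left shift done in the unsigned domain
// (well-defined wrapping) and an arithmetic right shift, as the DAG models it.
static int64_t ShlThenSar(int64_t X, unsigned Shl, unsigned Sar) {
  return (int64_t)((uint64_t)X << Shl) >> Sar;
}

int main() {
  int64_t X = 0x00000000F00000ABLL; // low byte 0xAB, i.e. -85 as an i8

  // (ashr (shl x, 56), 56) is just sign-extension of the low 8 bits.
  assert(ShlThenSar(X, 56, 56) == (int64_t)(int8_t)X);

  // SarConst > 56: sign-extend the low 8 bits, then SRA by the difference 2.
  assert(ShlThenSar(X, 56, 58) == (int64_t)(int8_t)X >> 2);

  // SarConst < 56: sign-extend, then SHL by the (positive) difference 3;
  // written as *8 here to keep the check in well-defined signed arithmetic.
  assert(ShlThenSar(X, 56, 53) == (int64_t)(int8_t)X * 8);
  return 0;
}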
+
/// \brief Returns a vector of 0s if the node in input is a vector logical
/// shift by a constant amount which is known to be bigger than or equal
/// to the vector element size in bits.
@@ -23321,14 +25190,15 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
APInt ShiftAmt = AmtSplat->getAPIntValue();
- unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
+ unsigned MaxAmount =
+ VT.getSimpleVT().getVectorElementType().getSizeInBits();
// SSE2/AVX2 logical shifts always return a vector of 0s
// if the shift amount is bigger than or equal to
// the element size. The constant shift amount will be
// encoded as a 8-bit immediate.
if (ShiftAmt.trunc(8).uge(MaxAmount))
- return getZeroVector(VT, Subtarget, DAG, DL);
+ return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
}
return SDValue();
@@ -23342,6 +25212,10 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
if (SDValue V = PerformSHLCombine(N, DAG))
return V;
+ if (N->getOpcode() == ISD::SRA)
+ if (SDValue V = PerformSRACombine(N, DAG))
+ return V;
+
// Try to fold this logical shift into a zero vector.
if (N->getOpcode() != ISD::SRA)
if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
@@ -23537,7 +25411,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
// Set N0 and N1 to hold the inputs to the new wide operation.
N0 = N0->getOperand(0);
if (RHSConstSplat) {
- N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
+ N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
SDValue(RHSConstSplat, 0));
SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
@@ -23552,9 +25426,9 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
case ISD::ANY_EXTEND:
return Op;
case ISD::ZERO_EXTEND: {
- unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
+ unsigned InBits = NarrowVT.getScalarSizeInBits();
APInt Mask = APInt::getAllOnesValue(InBits);
- Mask = Mask.zext(VT.getScalarType().getSizeInBits());
+ Mask = Mask.zext(VT.getScalarSizeInBits());
return DAG.getNode(ISD::AND, DL, VT,
Op, DAG.getConstant(Mask, DL, VT));
}
@@ -23656,6 +25530,41 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getBitcast(N0.getValueType(), NewShuffle);
}
+/// If both input operands of a logic op are being cast from floating point
+/// types, try to convert this into a floating point logic node to avoid
+/// unnecessary moves from SSE to integer registers.
+static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ unsigned FPOpcode = ISD::DELETED_NODE;
+ if (N->getOpcode() == ISD::AND)
+ FPOpcode = X86ISD::FAND;
+ else if (N->getOpcode() == ISD::OR)
+ FPOpcode = X86ISD::FOR;
+ else if (N->getOpcode() == ISD::XOR)
+ FPOpcode = X86ISD::FXOR;
+
+ assert(FPOpcode != ISD::DELETED_NODE &&
+ "Unexpected input node for FP logic conversion");
+
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDLoc DL(N);
+ if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
+ ((Subtarget->hasSSE1() && VT == MVT::i32) ||
+ (Subtarget->hasSSE2() && VT == MVT::i64))) {
+ SDValue N00 = N0.getOperand(0);
+ SDValue N10 = N1.getOperand(0);
+ EVT N00Type = N00.getValueType();
+ EVT N10Type = N10.getValueType();
+ if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
+ SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
+ return DAG.getBitcast(VT, FPLogic);
+ }
+ }
+ return SDValue();
+}
+
static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
@@ -23668,6 +25577,9 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget))
return R;
+ if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+ return FPLogic;
+
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -23728,6 +25640,9 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget))
return R;
+ if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+ return FPLogic;
+
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
@@ -23799,7 +25714,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
if (!Subtarget->hasSSE41())
return SDValue();
- EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
+ MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
X = DAG.getBitcast(BlendVT, X);
Y = DAG.getBitcast(BlendVT, Y);
@@ -23813,9 +25728,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
- MachineFunction &MF = DAG.getMachineFunction();
- bool OptForSize =
- MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
+ bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
// SHLD/SHRD instructions have lower register pressure, but on some
// platforms they have higher latency than the equivalent
@@ -23913,17 +25826,188 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
+// Try to turn tests against the signbit in the form of:
+// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
+// into:
+// SETGT(X, -1)
+static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
+ // This is only worth doing if the output type is i8.
+ if (N->getValueType(0) != MVT::i8)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // We should be performing an xor against a truncated shift.
+ if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
+ return SDValue();
+
+ // Make sure we are performing an xor against one.
+ if (!isOneConstant(N1))
+ return SDValue();
+
+ // SetCC on x86 zero extends so only act on this if it's a logical shift.
+ SDValue Shift = N0.getOperand(0);
+ if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
+ return SDValue();
+
+ // Make sure we are truncating from one of i16, i32 or i64.
+ EVT ShiftTy = Shift.getValueType();
+ if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
+ return SDValue();
+
+ // Make sure the shift amount extracts the sign bit.
+ if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
+ Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
+ return SDValue();
+
+ // Create a greater-than comparison against -1.
+  // N.B. Using SETGE against 0 works, but we want a canonical-looking
+  // comparison; SETGT matches up with what TranslateX86CC produces.
+ SDLoc DL(N);
+ SDValue ShiftOp = Shift.getOperand(0);
+ EVT ShiftOpTy = ShiftOp.getValueType();
+ SDValue Cond = DAG.getSetCC(DL, MVT::i8, ShiftOp,
+ DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
+ return Cond;
+}
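
The equivalence used by foldXorTruncShiftIntoCmp is a plain sign-bit identity: shifting the sign bit down and xoring with 1 gives the same 0/1 result as a signed greater-than comparison against -1. A quick standalone check (illustrative C++):

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t X : {INT32_MIN, -7, -1, 0, 1, 42, INT32_MAX}) {
    // XOR(TRUNCATE(SRL(X, 31)), 1): 1 when the sign bit is clear, else 0.
    uint8_t ShiftForm = (uint8_t)((uint32_t)X >> 31) ^ 1;
    // SETGT(X, -1): the same predicate as a single comparison.
    uint8_t CmpForm = (X > -1) ? 1 : 0;
    assert(ShiftForm == CmpForm);
  }
  return 0;
}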
+
static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
+ return RV;
+
if (Subtarget->hasCMov())
if (SDValue RV = performIntegerAbsCombine(N, DAG))
return RV;
+ if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+ return FPLogic;
+
+ return SDValue();
+}
+
+/// This function detects the AVG pattern between vectors of unsigned i8/i16,
+/// which is c = (a + b + 1) / 2, and replace this operation with the efficient
+/// X86ISD::AVG instruction.
+static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget, SDLoc DL) {
+ if (!VT.isVector() || !VT.isSimple())
+ return SDValue();
+ EVT InVT = In.getValueType();
+ unsigned NumElems = VT.getVectorNumElements();
+
+ EVT ScalarVT = VT.getVectorElementType();
+ if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
+ isPowerOf2_32(NumElems)))
+ return SDValue();
+
+  // InScalarVT is the intermediate type in the AVG pattern and should be wider
+  // than the original input type (i8/i16).
+ EVT InScalarVT = InVT.getVectorElementType();
+ if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
+ return SDValue();
+
+ if (Subtarget->hasAVX512()) {
+ if (VT.getSizeInBits() > 512)
+ return SDValue();
+ } else if (Subtarget->hasAVX2()) {
+ if (VT.getSizeInBits() > 256)
+ return SDValue();
+ } else {
+ if (VT.getSizeInBits() > 128)
+ return SDValue();
+ }
+
+ // Detect the following pattern:
+ //
+ // %1 = zext <N x i8> %a to <N x i32>
+ // %2 = zext <N x i8> %b to <N x i32>
+ // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
+ // %4 = add nuw nsw <N x i32> %3, %2
+  //   %5 = lshr <N x i32> %4, <i32 1 x N>
+ // %6 = trunc <N x i32> %5 to <N x i8>
+ //
+ // In AVX512, the last instruction can also be a trunc store.
+
+ if (In.getOpcode() != ISD::SRL)
+ return SDValue();
+
+ // A lambda checking the given SDValue is a constant vector and each element
+ // is in the range [Min, Max].
+ auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
+ BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
+ if (!BV || !BV->isConstant())
+ return false;
+ for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
+ if (!C)
+ return false;
+ uint64_t Val = C->getZExtValue();
+ if (Val < Min || Val > Max)
+ return false;
+ }
+ return true;
+ };
+
+  // Check that each element of the vector is right-shifted (lshr) by one.
+ auto LHS = In.getOperand(0);
+ auto RHS = In.getOperand(1);
+ if (!IsConstVectorInRange(RHS, 1, 1))
+ return SDValue();
+ if (LHS.getOpcode() != ISD::ADD)
+ return SDValue();
+
+ // Detect a pattern of a + b + 1 where the order doesn't matter.
+ SDValue Operands[3];
+ Operands[0] = LHS.getOperand(0);
+ Operands[1] = LHS.getOperand(1);
+
+ // Take care of the case when one of the operands is a constant vector whose
+ // element is in the range [1, 256].
+ if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
+ Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
+ Operands[0].getOperand(0).getValueType() == VT) {
+ // The pattern is detected. Subtract one from the constant vector, then
+ // demote it and emit X86ISD::AVG instruction.
+ SDValue One = DAG.getConstant(1, DL, InScalarVT);
+ SDValue Ones = DAG.getNode(ISD::BUILD_VECTOR, DL, InVT,
+ SmallVector<SDValue, 8>(NumElems, One));
+ Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], Ones);
+ Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
+ return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
+ Operands[1]);
+ }
+
+ if (Operands[0].getOpcode() == ISD::ADD)
+ std::swap(Operands[0], Operands[1]);
+ else if (Operands[1].getOpcode() != ISD::ADD)
+ return SDValue();
+ Operands[2] = Operands[1].getOperand(0);
+ Operands[1] = Operands[1].getOperand(1);
+
+ // Now we have three operands of two additions. Check that one of them is a
+ // constant vector with ones, and the other two are promoted from i8/i16.
+ for (int i = 0; i < 3; ++i) {
+ if (!IsConstVectorInRange(Operands[i], 1, 1))
+ continue;
+ std::swap(Operands[i], Operands[2]);
+
+ // Check if Operands[0] and Operands[1] are results of type promotion.
+ for (int j = 0; j < 2; ++j)
+ if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
+ Operands[j].getOperand(0).getValueType() != VT)
+ return SDValue();
+
+ // The pattern is detected, emit X86ISD::AVG instruction.
+ return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
+ Operands[1].getOperand(0));
+ }
+
return SDValue();
}
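
detectAVGPattern above matches the rounding-average idiom c = (a + b + 1) / 2 computed in a wider type; X86ISD::AVG corresponds to the pavgb/pavgw style of instruction. The following is a scalar sketch of the idiom and of the constant-operand variant that the function handles first (illustrative C++ only):

#include <cassert>
#include <cstdint>

// Reference semantics of one pavgb lane: (a + b + 1) >> 1 without overflow.
static uint8_t Pavgb(uint8_t A, uint8_t B) {
  return (uint8_t)(((uint16_t)A + (uint16_t)B + 1) >> 1);
}

int main() {
  for (unsigned A = 0; A < 256; A += 17) {
    for (unsigned B = 0; B < 256; B += 13) {
      // The pattern in the comment above: zext to i32, add, add 1, lshr 1,
      // trunc -- exactly the rounding average the instruction computes.
      uint32_t Wide = ((uint32_t)A + (uint32_t)B + 1) >> 1;
      assert((uint8_t)Wide == Pavgb((uint8_t)A, (uint8_t)B));

      // Constant-operand variant: zext(a) + C with C in [1, 256] is matched
      // as avg(a, C - 1) after the constant is decremented and truncated.
      uint32_t C = (uint32_t)B + 1;
      uint32_t ConstForm = ((uint32_t)A + C) >> 1;
      assert((uint8_t)ConstForm == Pavgb((uint8_t)A, (uint8_t)(C - 1)));
    }
  }
  return 0;
}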
@@ -23940,10 +26024,13 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
// For chips with slow 32-byte unaligned loads, break the 32-byte operation
// into two 16-byte operations.
ISD::LoadExtType Ext = Ld->getExtensionType();
+ bool Fast;
+ unsigned AddressSpace = Ld->getAddressSpace();
unsigned Alignment = Ld->getAlignment();
- bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
- if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
- !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
+ if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
+ Ext == ISD::NON_EXTLOAD &&
+ TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
+ AddressSpace, Alignment, &Fast) && !Fast) {
unsigned NumElems = RegVT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
@@ -24012,8 +26099,8 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
ShuffleVec[i] = i * SizeRatio;
// Can't shuffle using an illegal type.
- assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
- && "WideVecVT should be legal");
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
+ "WideVecVT should be legal");
WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
}
@@ -24026,8 +26113,8 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
- for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
- ShuffleVec[i] = NumElems*SizeRatio;
+ for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
+ ShuffleVec[i] = NumElems * SizeRatio;
NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
DAG.getConstant(0, dl, WideVecVT),
&ShuffleVec[0]);
@@ -24055,7 +26142,6 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
ISD::NON_EXTLOAD);
SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
-
}
/// PerformMSTORECombine - Resolve truncating stores
static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
@@ -24073,6 +26159,15 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
unsigned FromSz = VT.getVectorElementType().getSizeInBits();
unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // The truncating store is legal in some cases. For example,
+  // vpmovqb, vpmovqw, vpmovqd, vpmovdb, and vpmovdw
+  // implement the truncating store directly.
+  // In those cases no further transformation is needed.
+ if (TLI.isTruncStoreLegal(VT, StVT))
+ return SDValue();
+
// From, To sizes and ElemCount must be pow of two
assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
"Unexpected size for truncating masked store");
@@ -24096,12 +26191,12 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
ShuffleVec[i] = i * SizeRatio;
// Can't shuffle using an illegal type.
- assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
- && "WideVecVT should be legal");
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
+ "WideVecVT should be legal");
SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
- DAG.getUNDEF(WideVecVT),
- &ShuffleVec[0]);
+ DAG.getUNDEF(WideVecVT),
+ &ShuffleVec[0]);
SDValue NewMask;
SDValue Mask = Mst->getMask();
@@ -24133,8 +26228,9 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
}
- return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(),
- NewMask, StVT, Mst->getMemOperand(), false);
+ return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
+ Mst->getBasePtr(), NewMask, StVT,
+ Mst->getMemOperand(), false);
}
/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
@@ -24148,10 +26244,12 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
// If we are saving a concatenation of two XMM registers and 32-byte stores
// are slow, such as on Sandy Bridge, perform two 16-byte stores.
+ bool Fast;
+ unsigned AddressSpace = St->getAddressSpace();
unsigned Alignment = St->getAlignment();
- bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
- if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
- StVT == VT && !IsAligned) {
+ if (VT.is256BitVector() && StVT == VT &&
+ TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+ AddressSpace, Alignment, &Fast) && !Fast) {
unsigned NumElems = VT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
@@ -24178,12 +26276,29 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
// First, pack all of the elements in one place. Next, store to memory
// in fewer chunks.
if (St->isTruncatingStore() && VT.isVector()) {
+ // Check if we can detect an AVG pattern from the truncation. If yes,
+ // replace the trunc store by a normal store with the result of X86ISD::AVG
+ // instruction.
+ SDValue Avg =
+ detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl);
+ if (Avg.getNode())
+ return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
+ St->getPointerInfo(), St->isVolatile(),
+ St->isNonTemporal(), St->getAlignment());
+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NumElems = VT.getVectorNumElements();
assert(StVT != VT && "Cannot truncate to the same type");
unsigned FromSz = VT.getVectorElementType().getSizeInBits();
unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
+    // The truncating store is legal in some cases. For example,
+    // vpmovqb, vpmovqw, vpmovqd, vpmovdb, and vpmovdw
+    // implement the truncating store directly.
+    // In those cases no further transformation is needed.
+ if (TLI.isTruncStoreLegal(VT, StVT))
+ return SDValue();
+
// From, To sizes and ElemCount must be pow of two
if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
// We are going to use the original vector elt for storing.
@@ -24306,7 +26421,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
// Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
// pair instead.
if (Subtarget->is64Bit() || F64IsLegal) {
- EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
+ MVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), Ld->isVolatile(),
Ld->isNonTemporal(), Ld->isInvariant(),
@@ -24539,8 +26654,234 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// Truncate a group of v4i32/v2i64 vectors into v16i8/v8i16 using X86ISD::PACKUS.
+static SDValue
+combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
+ SmallVector<SDValue, 8> &Regs) {
+ assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
+ Regs[0].getValueType() == MVT::v2i64));
+ EVT OutVT = N->getValueType(0);
+ EVT OutSVT = OutVT.getVectorElementType();
+ EVT InVT = Regs[0].getValueType();
+ EVT InSVT = InVT.getVectorElementType();
+ SDLoc DL(N);
+
+ // First, use mask to unset all bits that won't appear in the result.
+ assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
+ "OutSVT can only be either i8 or i16.");
+ SDValue MaskVal =
+ DAG.getConstant(OutSVT == MVT::i8 ? 0xFF : 0xFFFF, DL, InSVT);
+ SDValue MaskVec = DAG.getNode(
+ ISD::BUILD_VECTOR, DL, InVT,
+ SmallVector<SDValue, 8>(InVT.getVectorNumElements(), MaskVal));
+ for (auto &Reg : Regs)
+ Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVec, Reg);
+
+ MVT UnpackedVT, PackedVT;
+ if (OutSVT == MVT::i8) {
+ UnpackedVT = MVT::v8i16;
+ PackedVT = MVT::v16i8;
+ } else {
+ UnpackedVT = MVT::v4i32;
+ PackedVT = MVT::v8i16;
+ }
+
+  // In each iteration, halve the element width.
+ auto RegNum = Regs.size();
+ for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
+ j < e; j *= 2, RegNum /= 2) {
+ for (unsigned i = 0; i < RegNum; i++)
+ Regs[i] = DAG.getNode(ISD::BITCAST, DL, UnpackedVT, Regs[i]);
+ for (unsigned i = 0; i < RegNum / 2; i++)
+ Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
+ Regs[i * 2 + 1]);
+ }
+
+  // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS, and
+ // then extract a subvector as the result since v8i8 is not a legal type.
+ if (OutVT == MVT::v8i8) {
+ Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
+ Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
+ DAG.getIntPtrConstant(0, DL));
+ return Regs[0];
+ } else if (RegNum > 1) {
+ Regs.resize(RegNum);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
+ } else
+ return Regs[0];
+}
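
The masking step at the top of combineVectorTruncationWithPACKUS is what turns the saturating pack into a plain truncate: once every lane is already within the narrower range, unsigned saturation never clamps. A per-lane sketch (illustrative C++, modelling a single PACKUSWB-style lane):

#include <cassert>
#include <cstdint>

// One lane of PACKUSWB-style behaviour: unsigned saturation of an i16 to i8.
static uint8_t PackusLane(uint16_t V) {
  return V > 0xFF ? 0xFF : (uint8_t)V;
}

int main() {
  for (uint32_t X : {0u, 1u, 0x7Fu, 0x80u, 0xABu, 0x1234u, 0xFFFFFFFFu}) {
    // Without masking, saturation would clamp instead of truncating. After
    // ANDing with 0xFF (the MaskVec step above), every lane is already in
    // [0, 255], so the saturating pack returns exactly the low byte.
    uint16_t Masked = (uint16_t)(X & 0xFF);
    assert(PackusLane(Masked) == (uint8_t)X);
  }
  return 0;
}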
+
+/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
+static SDValue
+combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
+ SmallVector<SDValue, 8> &Regs) {
+ assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
+ EVT OutVT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
+ SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
+ for (auto &Reg : Regs) {
+ Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG);
+ Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG);
+ }
+
+ for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
+ Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
+ Regs[i * 2 + 1]);
+
+ if (Regs.size() > 2) {
+ Regs.resize(Regs.size() / 2);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
+ } else
+ return Regs[0];
+}
+
+/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
+/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
+/// legalization the truncation will be translated into a BUILD_VECTOR with each
+/// element extracted from a vector and then truncated, and it is difficult
+/// to do this optimization from that form.
+static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT OutVT = N->getValueType(0);
+ if (!OutVT.isVector())
+ return SDValue();
+
+ SDValue In = N->getOperand(0);
+ if (!In.getValueType().isSimple())
+ return SDValue();
+
+ EVT InVT = In.getValueType();
+ unsigned NumElems = OutVT.getVectorNumElements();
+
+ // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
+ // SSE2, and we need to take care of it specially.
+ // AVX512 provides vpmovdb.
+ if (!Subtarget->hasSSE2() || Subtarget->hasAVX2())
+ return SDValue();
+
+ EVT OutSVT = OutVT.getVectorElementType();
+ EVT InSVT = InVT.getVectorElementType();
+ if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
+ (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
+ NumElems >= 8))
+ return SDValue();
+
+  // SSSE3's pshufb results in fewer instructions in the cases below.
+ if (Subtarget->hasSSSE3() && NumElems == 8 &&
+ ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
+ (InSVT == MVT::i32 && OutSVT == MVT::i16)))
+ return SDValue();
+
+ SDLoc DL(N);
+
+ // Split a long vector into vectors of legal type.
+ unsigned RegNum = InVT.getSizeInBits() / 128;
+ SmallVector<SDValue, 8> SubVec(RegNum);
+ if (InSVT == MVT::i32) {
+ for (unsigned i = 0; i < RegNum; i++)
+ SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+ DAG.getIntPtrConstant(i * 4, DL));
+ } else {
+ for (unsigned i = 0; i < RegNum; i++)
+ SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+ DAG.getIntPtrConstant(i * 2, DL));
+ }
+
+  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
+ // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
+ // truncate 2 x v4i32 to v8i16.
+ if (Subtarget->hasSSE41() || OutSVT == MVT::i8)
+ return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
+ else if (InSVT == MVT::i32)
+ return combineVectorTruncationWithPACKSS(N, DAG, SubVec);
+ else
+ return SDValue();
+}
+
+static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ // Try to detect AVG pattern first.
+ SDValue Avg = detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG,
+ Subtarget, SDLoc(N));
+ if (Avg.getNode())
+ return Avg;
+
+ return combineVectorTruncation(N, DAG, Subtarget);
+}
+
+/// Do target-specific dag combines on floating point negations.
+static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ EVT SVT = VT.getScalarType();
+ SDValue Arg = N->getOperand(0);
+ SDLoc DL(N);
+
+ // Let legalize expand this if it isn't a legal type yet.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
+ // If we're negating a FMUL node on a target with FMA, then we can avoid the
+ // use of a constant by performing (-0 - A*B) instead.
+ // FIXME: Check rounding control flags as well once it becomes available.
+ if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
+ Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) {
+ SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
+ return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
+ Arg.getOperand(1), Zero);
+ }
+
+ // If we're negating a FMA node, then we can adjust the
+ // instruction to include the extra negation.
+ if (Arg.hasOneUse()) {
+ switch (Arg.getOpcode()) {
+ case X86ISD::FMADD:
+ return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
+ Arg.getOperand(1), Arg.getOperand(2));
+ case X86ISD::FMSUB:
+ return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0),
+ Arg.getOperand(1), Arg.getOperand(2));
+ case X86ISD::FNMADD:
+ return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0),
+ Arg.getOperand(1), Arg.getOperand(2));
+ case X86ISD::FNMSUB:
+ return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0),
+ Arg.getOperand(1), Arg.getOperand(2));
+ }
+ }
+ return SDValue();
+}
+
+static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (VT.is512BitVector() && !Subtarget->hasDQI()) {
+    // VXORPS, VORPS, VANDPS, VANDNPS are supported only under the DQ extension.
+ // These logic operations may be executed in the integer domain.
+ SDLoc dl(N);
+ MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits());
+ MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements());
+
+ SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(0));
+ SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(1));
+ unsigned IntOpcode = 0;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected FP logic op");
+ case X86ISD::FOR: IntOpcode = ISD::OR; break;
+ case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
+ case X86ISD::FAND: IntOpcode = ISD::AND; break;
+ case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
+ }
+ SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
+ return DAG.getNode(ISD::BITCAST, dl, VT, IntOp);
+ }
+ return SDValue();
+}
/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
-static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
// F[X]OR(0.0, x) -> x
@@ -24552,7 +26893,8 @@ static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
if (C->getValueAPF().isPosZero())
return N->getOperand(0);
- return SDValue();
+
+ return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
@@ -24576,8 +26918,65 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
N->getOperand(0), N->getOperand(1));
}
+static SDValue performFMinNumFMaxNumCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ if (Subtarget->useSoftFloat())
+ return SDValue();
+
+ // TODO: Check for global or instruction-level "nnan". In that case, we
+ // should be able to lower to FMAX/FMIN alone.
+ // TODO: If an operand is already known to be a NaN or not a NaN, this
+ // should be an optional swap and FMAX/FMIN.
+
+ EVT VT = N->getValueType(0);
+ if (!((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
+ (Subtarget->hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
+ (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
+ return SDValue();
+
+ // This takes at least 3 instructions, so favor a library call when operating
+ // on a scalar and minimizing code size.
+ if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
+ return SDValue();
+
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ SDLoc DL(N);
+ EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
+ DAG.getDataLayout(), *DAG.getContext(), VT);
+
+ // There are 4 possibilities involving NaN inputs, and these are the required
+ // outputs:
+ // Op1
+ // Num NaN
+ // ----------------
+ // Num | Max | Op0 |
+ // Op0 ----------------
+ // NaN | Op1 | NaN |
+ // ----------------
+ //
+ // The SSE FP max/min instructions were not designed for this case, but rather
+ // to implement:
+ // Min = Op1 < Op0 ? Op1 : Op0
+ // Max = Op1 > Op0 ? Op1 : Op0
+ //
+ // So they always return Op0 if either input is a NaN. However, we can still
+ // use those instructions for fmaxnum by selecting away a NaN input.
+
+ // If either operand is NaN, the 2nd source operand (Op0) is passed through.
+ auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
+ SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
+ SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
+
+ // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
+ // are NaN, the NaN value of Op1 is the result.
+ auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
+ return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
+}
+
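
As a rough scalar model of the sequence built above (illustrative C++ only, not part of this patch; it assumes the SSE semantics described in the comment, where the second source operand is returned on any NaN input):

#include <cmath>

// MAXSS semantics: Op1 > Op0 ? Op1 : Op0, i.e. Op0 whenever an input is NaN.
static float sseMax(float Op1, float Op0) { return Op1 > Op0 ? Op1 : Op0; }

// fmaxnum sketch: SSE max with swapped operands, then select Op1 when Op0
// is NaN (the SETUO compare of Op0 with itself above).
float fmaxnumSketch(float Op0, float Op1) {
  float MinOrMax = sseMax(Op1, Op0);        // Op0 passes through on NaN
  return std::isnan(Op0) ? Op1 : MinOrMax;  // reproduces the table above
}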
/// Do target-specific dag combines on X86ISD::FAND nodes.
-static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
// FAND(0.0, x) -> 0.0
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
if (C->getValueAPF().isPosZero())
@@ -24588,11 +26987,12 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
if (C->getValueAPF().isPosZero())
return N->getOperand(1);
- return SDValue();
+ return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Do target-specific dag combines on X86ISD::FANDN nodes
-static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
// FANDN(0.0, x) -> x
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
if (C->getValueAPF().isPosZero())
@@ -24603,7 +27003,7 @@ static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
if (C->getValueAPF().isPosZero())
return N->getOperand(1);
- return SDValue();
+ return lowerX86FPLogicOp(N, DAG, Subtarget);
}
static SDValue PerformBTCombine(SDNode *N,
@@ -24673,6 +27073,57 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
+/// Promoting a sign extension ahead of an 'add nsw' exposes opportunities
+/// to combine math ops, use an LEA, or use a complex addressing mode. This can
+/// eliminate extend, add, and shift instructions.
+static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ // TODO: This should be valid for other integer types.
+ EVT VT = Sext->getValueType(0);
+ if (VT != MVT::i64)
+ return SDValue();
+
+ // We need an 'add nsw' feeding into the 'sext'.
+ SDValue Add = Sext->getOperand(0);
+ if (Add.getOpcode() != ISD::ADD || !Add->getFlags()->hasNoSignedWrap())
+ return SDValue();
+
+ // Having a constant operand to the 'add' ensures that we are not increasing
+ // the instruction count because the constant is extended for free below.
+ // A constant operand can also become the displacement field of an LEA.
+ auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
+ if (!AddOp1)
+ return SDValue();
+
+ // Don't make the 'add' bigger if there's no hope of combining it with some
+ // other 'add' or 'shl' instruction.
+ // TODO: It may be profitable to generate simpler LEA instructions in place
+ // of single 'add' instructions, but the cost model for selecting an LEA
+ // currently has a high threshold.
+ bool HasLEAPotential = false;
+ for (auto *User : Sext->uses()) {
+ if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
+ HasLEAPotential = true;
+ break;
+ }
+ }
+ if (!HasLEAPotential)
+ return SDValue();
+
+ // Everything looks good, so pull the 'sext' ahead of the 'add'.
+ int64_t AddConstant = AddOp1->getSExtValue();
+ SDValue AddOp0 = Add.getOperand(0);
+ SDValue NewSext = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Sext), VT, AddOp0);
+ SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
+
+ // The wider add is guaranteed to not wrap because both operands are
+ // sign-extended.
+ SDNodeFlags Flags;
+ Flags.setNoSignedWrap(true);
+ return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags);
+}
+
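
In source terms the transform corresponds to the identity below, which is only valid because the narrow add carries the 'nsw' flag; the constant 42 is a made-up illustration, not taken from the patch:

#include <cstdint>

// Before: sext(add nsw (x, C)) -- the extend happens after the add.
int64_t sextAfterAdd(int32_t x) {
  int32_t t = x + 42;              // 'add nsw': known not to wrap
  return static_cast<int64_t>(t);
}

// After: add nsw (sext(x), C) -- the constant is now a 64-bit immediate
// that can become an LEA displacement or part of an addressing mode.
int64_t sextBeforeAdd(int32_t x) {
  return static_cast<int64_t>(x) + 42;
}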
static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
@@ -24763,13 +27214,13 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
}
}
- if (!Subtarget->hasFp256())
- return SDValue();
-
- if (VT.isVector() && VT.getSizeInBits() == 256)
+ if (Subtarget->hasAVX() && VT.is256BitVector())
if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
return R;
+ if (SDValue NewAdd = promoteSextBeforeAddNSW(N, DAG, Subtarget))
+ return NewAdd;
+
return SDValue();
}
@@ -24783,9 +27234,7 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
EVT ScalarVT = VT.getScalarType();
- if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
- (!Subtarget->hasFMA() && !Subtarget->hasFMA4() &&
- !Subtarget->hasAVX512()))
+ if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasAnyFMA())
return SDValue();
SDValue A = N->getOperand(0);
@@ -24830,8 +27279,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
N0.getOperand(0).hasOneUse()) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
- if (!C || C->getZExtValue() != 1)
+ if (!isOneConstant(N0.getOperand(1)))
return SDValue();
return DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
@@ -24884,21 +27332,19 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
- if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
- SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
- LHS.getOperand(1));
- return DAG.getSetCC(DL, N->getValueType(0), addV,
- DAG.getConstant(0, DL, addV.getValueType()), CC);
- }
+ if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) {
+ SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
+ LHS.getOperand(1));
+ return DAG.getSetCC(DL, N->getValueType(0), addV,
+ DAG.getConstant(0, DL, addV.getValueType()), CC);
+ }
if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
- if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
- SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
- RHS.getOperand(1));
- return DAG.getSetCC(DL, N->getValueType(0), addV,
- DAG.getConstant(0, DL, addV.getValueType()), CC);
- }
+ if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) {
+ SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
+ RHS.getOperand(1));
+ return DAG.getSetCC(DL, N->getValueType(0), addV,
+ DAG.getConstant(0, DL, addV.getValueType()), CC);
+ }
if (VT.getScalarType() == MVT::i1 &&
(CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
@@ -24936,52 +27382,6 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
- SelectionDAG &DAG) {
- SDLoc dl(Load);
- MVT VT = Load->getSimpleValueType(0);
- MVT EVT = VT.getVectorElementType();
- SDValue Addr = Load->getOperand(1);
- SDValue NewAddr = DAG.getNode(
- ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
- DAG.getConstant(Index * EVT.getStoreSize(), dl,
- Addr.getSimpleValueType()));
-
- SDValue NewLoad =
- DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
- DAG.getMachineFunction().getMachineMemOperand(
- Load->getMemOperand(), 0, EVT.getStoreSize()));
- return NewLoad;
-}
-
-static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
- SDLoc dl(N);
- MVT VT = N->getOperand(1)->getSimpleValueType(0);
- assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
- "X86insertps is only defined for v4x32");
-
- SDValue Ld = N->getOperand(1);
- if (MayFoldLoad(Ld)) {
- // Extract the countS bits from the immediate so we can get the proper
- // address when narrowing the vector load to a specific element.
- // When the second source op is a memory address, insertps doesn't use
- // countS and just gets an f32 from that address.
- unsigned DestIndex =
- cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
-
- Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
-
- // Create this as a scalar to vector to match the instruction pattern.
- SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
- // countS bits are ignored when loading from memory on insertps, which
- // means we don't need to explicitly set them to 0.
- return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
- LoadScalarToVector, N->getOperand(2));
- }
- return SDValue();
-}
-
static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
SDValue V0 = N->getOperand(0);
SDValue V1 = N->getOperand(1);
@@ -25008,6 +27408,20 @@ static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) {
+ SDLoc DL(N);
+ // Gather and Scatter instructions use k-registers for masks. The type of
+ // the masks is v*i1. So the mask will be truncated anyway.
+ // The SIGN_EXTEND_INREG may be dropped.
+ SDValue Mask = N->getOperand(2);
+ if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+ SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+ NewOps[2] = Mask.getOperand(0);
+ DAG.UpdateNodeOperands(N, NewOps);
+ }
+ return SDValue();
+}
+
// Helper function of PerformSETCCCombine. It is to materialize "setb reg"
// as "sbb reg,reg", since it can be extended without zext and produces
// an all-ones bit which is more useful than 0/1 in some cases.
@@ -25182,7 +27596,7 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
// Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
// a 32-bit target where SSE doesn't support i64->FP operations.
- if (Op0.getOpcode() == ISD::LOAD) {
+ if (!Subtarget->useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
EVT LdVT = Ld->getValueType(0);
@@ -25357,15 +27771,14 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
}
// Check if we can bypass extracting and re-inserting an element of an input
- // vector. Essentialy:
+ // vector. Essentially:
// (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
SDValue ExtractedV = V.getOperand(0);
SDValue OrigV = ExtractedV.getOperand(0);
- if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1)))
- if (ExtractIdx->getZExtValue() == 0) {
+ if (isNullConstant(ExtractedV.getOperand(1))) {
MVT OrigVT = OrigV.getSimpleValueType();
// Extract a subvector if necessary...
if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
@@ -25394,7 +27807,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SELECT:
case X86ISD::SHRUNKBLEND:
return PerformSELECTCombine(N, DAG, DCI, Subtarget);
- case ISD::BITCAST: return PerformBITCASTCombine(N, DAG);
+ case ISD::BITCAST: return PerformBITCASTCombine(N, DAG, Subtarget);
case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
@@ -25414,12 +27827,17 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG, Subtarget);
case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
+ case ISD::FNEG: return PerformFNEGCombine(N, DAG, Subtarget);
+ case ISD::TRUNCATE: return PerformTRUNCATECombine(N, DAG, Subtarget);
case X86ISD::FXOR:
- case X86ISD::FOR: return PerformFORCombine(N, DAG);
+ case X86ISD::FOR: return PerformFORCombine(N, DAG, Subtarget);
case X86ISD::FMIN:
case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG);
- case X86ISD::FAND: return PerformFANDCombine(N, DAG);
- case X86ISD::FANDN: return PerformFANDNCombine(N, DAG);
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM: return performFMinNumFMaxNumCombine(N, DAG,
+ Subtarget);
+ case X86ISD::FAND: return PerformFANDCombine(N, DAG, Subtarget);
+ case X86ISD::FANDN: return PerformFANDNCombine(N, DAG, Subtarget);
case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
case ISD::ANY_EXTEND:
@@ -25447,14 +27865,9 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::VPERM2X128:
case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
- case ISD::INTRINSIC_WO_CHAIN:
- return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
- case X86ISD::INSERTPS: {
- if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
- return PerformINSERTPSCombine(N, DAG, Subtarget);
- break;
- }
case X86ISD::BLENDI: return PerformBLENDICombine(N, DAG);
+ case ISD::MGATHER:
+ case ISD::MSCATTER: return PerformGatherScatterCombine(N, DAG);
}
return SDValue();
@@ -26084,6 +28497,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
case MVT::f64:
case MVT::i64:
return std::make_pair(0U, &X86::FR64RegClass);
+ // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
// Vector types.
case MVT::v16i8:
case MVT::v8i16:
@@ -26168,17 +28582,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (Class == &X86::GR8RegClass || Class == &X86::GR16RegClass ||
Class == &X86::GR32RegClass || Class == &X86::GR64RegClass) {
unsigned Size = VT.getSizeInBits();
- MVT::SimpleValueType SimpleTy = Size == 1 || Size == 8 ? MVT::i8
- : Size == 16 ? MVT::i16
- : Size == 32 ? MVT::i32
- : Size == 64 ? MVT::i64
- : MVT::Other;
- unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, SimpleTy);
+ if (Size == 1) Size = 8;
+ unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
if (DestReg > 0) {
Res.first = DestReg;
- Res.second = SimpleTy == MVT::i8 ? &X86::GR8RegClass
- : SimpleTy == MVT::i16 ? &X86::GR16RegClass
- : SimpleTy == MVT::i32 ? &X86::GR32RegClass
+ Res.second = Size == 8 ? &X86::GR8RegClass
+ : Size == 16 ? &X86::GR16RegClass
+ : Size == 32 ? &X86::GR32RegClass
: &X86::GR64RegClass;
assert(Res.second->contains(Res.first) && "Register in register class");
} else {
@@ -26196,6 +28606,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// target independent register mapper will just pick the first match it can
// find, ignoring the required type.
+ // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
if (VT == MVT::f32 || VT == MVT::i32)
Res.second = &X86::FR32RegClass;
else if (VT == MVT::f64 || VT == MVT::i64)
@@ -26244,6 +28655,15 @@ int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
return -1;
}
-bool X86TargetLowering::isTargetFTOL() const {
- return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
+bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+ // Integer division on x86 is expensive. However, when aggressively optimizing
+ // for code size, we prefer to use a div instruction, as it is usually smaller
+ // than the alternative sequence.
+ // The exception to this is vector division. Since x86 doesn't have vector
+ // integer division, leaving the division as-is is a loss even in terms of
+ // size, because it will have to be scalarized, while the alternative code
+ // sequence can be performed in vector form.
+ bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::MinSize);
+ return OptSize && !VT.isVector();
}
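
For illustration (a hedged sketch, not part of this patch), the practical effect of this hook when a function is marked minsize: a scalar division such as the first function below is kept as a single hardware div rather than expanded into a multiply-by-magic-constant sequence, while a divide that ends up in vector form (for example after vectorizing the loop below) is still expanded, since x86 has no vector integer divide and keeping it would force scalarization:

#include <cstdint>

// Kept as one 'div' under minsize: smaller than the magic-number expansion.
uint32_t scalarDiv(uint32_t x) { return x / 7; }

// If vectorized, the divide is still expanded even under minsize: there is
// no x86 vector integer divide, so the multiply-based sequence can stay in
// vector form while a div cannot.
void vectorDiv(uint32_t *v, int n) {
  for (int i = 0; i < n; ++i)
    v[i] /= 7;
}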
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 723d5304495c..a29dc9af54f6 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -126,6 +126,9 @@ namespace llvm {
/// 1 is the number of bytes of stack to pop.
RET_FLAG,
+ /// Return from interrupt. Operand 0 is the number of bytes to pop.
+ IRET,
+
/// Repeat fill, corresponds to X86::REP_STOSx.
REP_STOS,
@@ -182,6 +185,8 @@ namespace llvm {
/// Compute Sum of Absolute Differences.
PSADBW,
+ /// Compute Double Block Packed Sum-Absolute-Differences
+ DBPSADBW,
/// Bitwise Logical AND NOT of Packed FP values.
ANDNP,
@@ -211,6 +216,8 @@ namespace llvm {
// FP vector get exponent
FGETEXP_RND,
+ // Extract Normalized Mantissas
+ VGETMANT,
// FP Scale
SCALEF,
// Integer add/sub with unsigned saturation.
@@ -236,6 +243,9 @@ namespace llvm {
// Integer absolute value
ABS,
+ // Detect Conflicts Within a Vector
+ CONFLICT,
+
/// Floating point max and min.
FMAX, FMIN,
@@ -282,9 +292,8 @@ namespace llvm {
// Vector integer truncate.
VTRUNC,
-
- // Vector integer truncate with mask.
- VTRUNCM,
+ // Vector integer truncate with unsigned/signed saturation.
+ VTRUNCUS, VTRUNCS,
// Vector FP extend.
VFPEXT,
@@ -295,6 +304,9 @@ namespace llvm {
// Vector signed/unsigned integer to double.
CVTDQ2PD, CVTUDQ2PD,
+ // Convert a vector to mask, set bits based on MSB.
+ CVT2MASK,
+
// 128-bit vector logical left / right shift
VSHLDQ, VSRLDQ,
@@ -349,6 +361,7 @@ namespace llvm {
// OR/AND test for masks
KORTEST,
+ KTEST,
// Several flavors of instructions with vector shuffle behaviors.
PACKSS,
@@ -382,12 +395,24 @@ namespace llvm {
VPERMIV3,
VPERMI,
VPERM2X128,
- //Fix Up Special Packed Float32/64 values
+ // Bitwise ternary logic
+ VPTERNLOG,
+ // Fix Up Special Packed Float32/64 values
VFIXUPIMM,
- //Range Restriction Calculation For Packed Pairs of Float32/64 values
+ // Range Restriction Calculation For Packed Pairs of Float32/64 values
VRANGE,
+ // Reduce - Perform Reduction Transformation on scalar/packed FP
+ VREDUCE,
+ // RndScale - Round FP Values To Include A Given Number Of Fraction Bits
+ VRNDSCALE,
+ // VFPCLASS - Tests types of FP values, for packed types.
+ VFPCLASS,
+ // VFPCLASSS - Tests types of FP values, for scalar types.
+ VFPCLASSS,
// Broadcast scalar to vector
VBROADCAST,
+ // Broadcast mask to vector
+ VBROADCASTM,
// Broadcast subvector to vector
SUBV_BROADCAST,
// Insert/Extract vector element
@@ -397,13 +422,21 @@ namespace llvm {
/// SSE4A Extraction and Insertion.
EXTRQI, INSERTQI,
+ // XOP variable/immediate rotations
+ VPROT, VPROTI,
+ // XOP arithmetic/logical shifts
+ VPSHA, VPSHL,
+ // XOP signed/unsigned integer comparisons
+ VPCOM, VPCOMU,
+
// Vector multiply packed unsigned doubleword integers
PMULUDQ,
// Vector multiply packed signed doubleword integers
PMULDQ,
// Vector Multiply Packed UnsignedIntegers with Round and Scale
MULHRS,
-
+ // Multiply and Add Packed Integers
+ VPMADDUBSW, VPMADDWD,
// FMA nodes
FMADD,
FNMADD,
@@ -418,7 +451,6 @@ namespace llvm {
FNMSUB_RND,
FMADDSUB_RND,
FMSUBADD_RND,
- RNDSCALE,
// Compress and expand
COMPRESS,
@@ -443,9 +475,6 @@ namespace llvm {
// falls back to heap allocation if not.
SEG_ALLOCA,
- // Windows's _ftol2 runtime routine to do fptoui.
- WIN_FTOL,
-
// Memory barrier
MEMBARRIER,
MFENCE,
@@ -580,15 +609,6 @@ namespace llvm {
bool isCalleePop(CallingConv::ID CallingConv,
bool is64Bit, bool IsVarArg, bool TailCallOpt);
- /// AVX512 static rounding constants. These need to match the values in
- /// avx512fintrin.h.
- enum STATIC_ROUNDING {
- TO_NEAREST_INT = 0,
- TO_NEG_INF = 1,
- TO_POS_INF = 2,
- TO_ZERO = 3,
- CUR_DIRECTION = 4
- };
}
//===--------------------------------------------------------------------===//
@@ -850,16 +870,7 @@ namespace llvm {
/// register, not on the X87 floating point stack.
bool isScalarFPTypeInSSEReg(EVT VT) const {
return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
- (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1
- }
-
- /// Return true if the target uses the MSVC _ftol2 routine for fptoui.
- bool isTargetFTOL() const;
-
- /// Return true if the MSVC _ftol2 routine should be used for fptoui to the
- /// given type.
- bool isIntegerTypeFTOL(EVT VT) const {
- return isTargetFTOL() && VT == MVT::i64;
+ (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1
}
/// \brief Returns true if it is beneficial to convert a load of a constant
@@ -879,6 +890,16 @@ namespace llvm {
unsigned getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const override;
+ /// If a physical register, this returns the register that receives the
+ /// exception address on entry to an EH pad.
+ unsigned
+ getExceptionPointerRegister(const Constant *PersonalityFn) const override;
+
+ /// If a physical register, this returns the register that receives the
+ /// exception typeid on entry to a landing pad.
+ unsigned
+ getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
+
/// This method returns a target specific FastISel object,
/// or null if the target does not support "fast" ISel.
FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
@@ -890,6 +911,11 @@ namespace llvm {
bool getStackCookieLocation(unsigned &AddressSpace,
unsigned &Offset) const override;
+ /// Return true if the target stores SafeStack pointer at a fixed offset in
+ /// some non-standard address space, and populates the address space and
+ /// offset as appropriate.
+ Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
+
SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
SelectionDAG &DAG) const;
@@ -899,6 +925,8 @@ namespace llvm {
/// \brief Customize the preferred legalization strategy for certain types.
LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;
+ bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+
protected:
std::pair<const TargetRegisterClass *, uint8_t>
findRepresentativeClass(const TargetRegisterInfo *TRI,
@@ -908,7 +936,6 @@ namespace llvm {
/// Keep a pointer to the X86Subtarget around so that we can
/// make the right decision when generating code for different targets.
const X86Subtarget *Subtarget;
- const DataLayout *TD;
/// Select between SSE or x87 floating point ops.
/// When SSE is available, use it for f32 operations.
@@ -955,7 +982,6 @@ namespace llvm {
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
SelectionDAG& DAG) const;
- bool IsCalleePop(bool isVarArg, CallingConv::ID CallConv) const;
SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
SDValue Chain, bool IsTailCall, bool Is64Bit,
int FPDiff, SDLoc dl) const;
@@ -969,7 +995,6 @@ namespace llvm {
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const;
@@ -994,9 +1019,9 @@ namespace llvm {
SDValue LowerToBT(SDValue And, ISD::CondCode CC,
SDLoc dl, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerMEMSET(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
@@ -1042,27 +1067,16 @@ namespace llvm {
const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
- bool shouldExpandAtomicLoadInIR(LoadInst *SI) const override;
+ TargetLoweringBase::AtomicExpansionKind
+ shouldExpandAtomicLoadInIR(LoadInst *SI) const override;
bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
- TargetLoweringBase::AtomicRMWExpansionKind
+ TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
LoadInst *
lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
- bool needsCmpXchgNb(const Type *MemType) const;
-
- /// Utility function to emit atomic-load-arith operations (and, or, xor,
- /// nand, max, min, umax, umin). It takes the corresponding instruction to
- /// expand, the associated machine basic block, and the associated X86
- /// opcodes for reg/reg.
- MachineBasicBlock *EmitAtomicLoadArith(MachineInstr *MI,
- MachineBasicBlock *MBB) const;
-
- /// Utility function to emit atomic-load-arith operations (and, or, xor,
- /// nand, add, sub, swap) for 64-bit operands on 32-bit target.
- MachineBasicBlock *EmitAtomicLoadArith6432(MachineInstr *MI,
- MachineBasicBlock *MBB) const;
+ bool needsCmpXchgNb(Type *MemType) const;
// Utility function to emit the low-level va_arg code for X86-64.
MachineBasicBlock *EmitVAARG64WithCustomInserter(
@@ -1077,18 +1091,24 @@ namespace llvm {
MachineBasicBlock *EmitLoweredSelect(MachineInstr *I,
MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr *I,
+ MachineBasicBlock *BB) const;
+
MachineBasicBlock *EmitLoweredWinAlloca(MachineInstr *MI,
MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitLoweredCatchRet(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredCatchPad(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+
MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr *MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitLoweredTLSCall(MachineInstr *MI,
MachineBasicBlock *BB) const;
- MachineBasicBlock *emitLoweredTLSAddr(MachineInstr *MI,
- MachineBasicBlock *BB) const;
-
MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr *MI,
MachineBasicBlock *MBB) const;
@@ -1121,7 +1141,7 @@ namespace llvm {
unsigned &RefinementSteps) const override;
/// Reassociate floating point divisions into multiply by reciprocal.
- bool combineRepeatedFPDivisors(unsigned NumUsers) const override;
+ unsigned combineRepeatedFPDivisors() const override;
};
namespace X86 {
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index faa91500b181..8bf2925a75db 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -79,7 +79,7 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
!if (!eq (TypeVariantName, "i"),
!if (!eq (Size, 128), "v2i64",
!if (!eq (Size, 256), "v4i64",
- !if (!eq (Size, 512),
+ !if (!eq (Size, 512),
!if (!eq (EltSize, 64), "v8i64", "v16i32"),
VTName))), VTName));
@@ -145,6 +145,8 @@ def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">;
// We map scalar types to the smallest (128-bit) vector type
// with the appropriate element type. This allows to use the same masking logic.
+def i32x_info : X86VectorVTInfo<1, i32, GR32, "si">;
+def i64x_info : X86VectorVTInfo<1, i64, GR64, "sq">;
def f32x_info : X86VectorVTInfo<1, f32, VR128X, "ss">;
def f64x_info : X86VectorVTInfo<1, f64, VR128X, "sd">;
@@ -274,6 +276,22 @@ multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _,
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
(vselect _.KRCWM:$mask, RHS, _.RC:$src1)>;
+// Similar to AVX512_maskable_3src but in this case the input VT for the tied
+// operand differs from the output VT. This requires a bitconvert on
+// the preserved vector going into the vselect.
+multiclass AVX512_maskable_3src_cast<bits<8> O, Format F, X86VectorVTInfo OutVT,
+ X86VectorVTInfo InVT,
+ dag Outs, dag NonTiedIns, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS> :
+ AVX512_maskable_common<O, F, OutVT, Outs,
+ !con((ins InVT.RC:$src1), NonTiedIns),
+ !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns),
+ !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
+ (vselect InVT.KRCWM:$mask, RHS,
+ (bitconvert InVT.RC:$src1))>;
+
multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag NonTiedIns, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
@@ -471,84 +489,123 @@ def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>;
//===----------------------------------------------------------------------===//
// AVX-512 - VECTOR INSERT
//
-
-multiclass vinsert_for_size_no_alt<int Opcode,
- X86VectorVTInfo From, X86VectorVTInfo To,
- PatFrag vinsert_insert,
- SDNodeXForm INSERT_get_vinsert_imm> {
+multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To,
+ PatFrag vinsert_insert> {
let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
- def rr : AVX512AIi8<Opcode, MRMSrcReg, (outs VR512:$dst),
- (ins VR512:$src1, From.RC:$src2, u8imm:$src3),
- "vinsert" # From.EltTypeName # "x" # From.NumElts #
- "\t{$src3, $src2, $src1, $dst|"
- "$dst, $src1, $src2, $src3}",
- [(set To.RC:$dst, (vinsert_insert:$src3 (To.VT VR512:$src1),
- (From.VT From.RC:$src2),
- (iPTR imm)))]>,
- EVEX_4V, EVEX_V512;
+ defm rr : AVX512_maskable<Opcode, MRMSrcReg, To, (outs To.RC:$dst),
+ (ins To.RC:$src1, From.RC:$src2, i32u8imm:$src3),
+ "vinsert" # From.EltTypeName # "x" # From.NumElts,
+ "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (vinsert_insert:$src3 (To.VT To.RC:$src1),
+ (From.VT From.RC:$src2),
+ (iPTR imm))>, AVX512AIi8Base, EVEX_4V;
- let mayLoad = 1 in
- def rm : AVX512AIi8<Opcode, MRMSrcMem, (outs VR512:$dst),
- (ins VR512:$src1, From.MemOp:$src2, u8imm:$src3),
- "vinsert" # From.EltTypeName # "x" # From.NumElts #
- "\t{$src3, $src2, $src1, $dst|"
- "$dst, $src1, $src2, $src3}",
- []>,
- EVEX_4V, EVEX_V512, EVEX_CD8<From.EltSize, From.CD8TupleForm>;
- }
-}
-
-multiclass vinsert_for_size<int Opcode,
- X86VectorVTInfo From, X86VectorVTInfo To,
- X86VectorVTInfo AltFrom, X86VectorVTInfo AltTo,
- PatFrag vinsert_insert,
- SDNodeXForm INSERT_get_vinsert_imm> :
- vinsert_for_size_no_alt<Opcode, From, To,
- vinsert_insert, INSERT_get_vinsert_imm> {
- // Codegen pattern with the alternative types, e.g. v2i64 -> v8i64 for
- // vinserti32x4. Only add this if 64x2 and friends are not supported
- // natively via AVX512DQ.
- let Predicates = [NoDQI] in
+ let mayLoad = 1 in
+ defm rm : AVX512_maskable<Opcode, MRMSrcMem, To, (outs To.RC:$dst),
+ (ins To.RC:$src1, From.MemOp:$src2, i32u8imm:$src3),
+ "vinsert" # From.EltTypeName # "x" # From.NumElts,
+ "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (vinsert_insert:$src3 (To.VT To.RC:$src1),
+ (From.VT (bitconvert (From.LdFrag addr:$src2))),
+ (iPTR imm))>, AVX512AIi8Base, EVEX_4V,
+ EVEX_CD8<From.EltSize, From.CD8TupleForm>;
+ }
+}
+
+multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From,
+ X86VectorVTInfo To, PatFrag vinsert_insert,
+ SDNodeXForm INSERT_get_vinsert_imm , list<Predicate> p> {
+ let Predicates = p in {
def : Pat<(vinsert_insert:$ins
- (AltTo.VT VR512:$src1), (AltFrom.VT From.RC:$src2), (iPTR imm)),
- (AltTo.VT (!cast<Instruction>(NAME # From.EltSize # "x4rr")
- VR512:$src1, From.RC:$src2,
- (INSERT_get_vinsert_imm VR512:$ins)))>;
+ (To.VT To.RC:$src1), (From.VT From.RC:$src2), (iPTR imm)),
+ (To.VT (!cast<Instruction>(InstrStr#"rr")
+ To.RC:$src1, From.RC:$src2,
+ (INSERT_get_vinsert_imm To.RC:$ins)))>;
+
+ def : Pat<(vinsert_insert:$ins
+ (To.VT To.RC:$src1),
+ (From.VT (bitconvert (From.LdFrag addr:$src2))),
+ (iPTR imm)),
+ (To.VT (!cast<Instruction>(InstrStr#"rm")
+ To.RC:$src1, addr:$src2,
+ (INSERT_get_vinsert_imm To.RC:$ins)))>;
+ }
}
multiclass vinsert_for_type<ValueType EltVT32, int Opcode128,
ValueType EltVT64, int Opcode256> {
- defm NAME # "32x4" : vinsert_for_size<Opcode128,
+
+ let Predicates = [HasVLX] in
+ defm NAME # "32x4Z256" : vinsert_for_size<Opcode128,
+ X86VectorVTInfo< 4, EltVT32, VR128X>,
+ X86VectorVTInfo< 8, EltVT32, VR256X>,
+ vinsert128_insert>, EVEX_V256;
+
+ defm NAME # "32x4Z" : vinsert_for_size<Opcode128,
X86VectorVTInfo< 4, EltVT32, VR128X>,
X86VectorVTInfo<16, EltVT32, VR512>,
- X86VectorVTInfo< 2, EltVT64, VR128X>,
+ vinsert128_insert>, EVEX_V512;
+
+ defm NAME # "64x4Z" : vinsert_for_size<Opcode256,
+ X86VectorVTInfo< 4, EltVT64, VR256X>,
X86VectorVTInfo< 8, EltVT64, VR512>,
- vinsert128_insert,
- INSERT_get_vinsert128_imm>;
- let Predicates = [HasDQI] in
- defm NAME # "64x2" : vinsert_for_size_no_alt<Opcode128,
+ vinsert256_insert>, VEX_W, EVEX_V512;
+
+ let Predicates = [HasVLX, HasDQI] in
+ defm NAME # "64x2Z256" : vinsert_for_size<Opcode128,
+ X86VectorVTInfo< 2, EltVT64, VR128X>,
+ X86VectorVTInfo< 4, EltVT64, VR256X>,
+ vinsert128_insert>, VEX_W, EVEX_V256;
+
+ let Predicates = [HasDQI] in {
+ defm NAME # "64x2Z" : vinsert_for_size<Opcode128,
X86VectorVTInfo< 2, EltVT64, VR128X>,
X86VectorVTInfo< 8, EltVT64, VR512>,
- vinsert128_insert,
- INSERT_get_vinsert128_imm>, VEX_W;
- defm NAME # "64x4" : vinsert_for_size<Opcode256,
- X86VectorVTInfo< 4, EltVT64, VR256X>,
- X86VectorVTInfo< 8, EltVT64, VR512>,
- X86VectorVTInfo< 8, EltVT32, VR256>,
- X86VectorVTInfo<16, EltVT32, VR512>,
- vinsert256_insert,
- INSERT_get_vinsert256_imm>, VEX_W;
- let Predicates = [HasDQI] in
- defm NAME # "32x8" : vinsert_for_size_no_alt<Opcode256,
- X86VectorVTInfo< 8, EltVT32, VR256X>,
- X86VectorVTInfo<16, EltVT32, VR512>,
- vinsert256_insert,
- INSERT_get_vinsert256_imm>;
+ vinsert128_insert>, VEX_W, EVEX_V512;
+
+ defm NAME # "32x8Z" : vinsert_for_size<Opcode256,
+ X86VectorVTInfo< 8, EltVT32, VR256X>,
+ X86VectorVTInfo<16, EltVT32, VR512>,
+ vinsert256_insert>, EVEX_V512;
+ }
}
defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a>;
defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a>;
+// Codegen patterns with the alternative types.
+// Only add this if 64x2 and its friends are not supported natively via AVX512DQ.
+defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v2f64x_info, v4f64x_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v2i64x_info, v4i64x_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>;
+
+defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v2f64x_info, v8f64_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v2i64x_info, v8i64_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>;
+
+defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v8f32x_info, v16f32_info,
+ vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>;
+defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v8i32x_info, v16i32_info,
+ vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>;
+
+// Codegen pattern with the alternative types insert VEC128 into VEC256
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
+// Codegen pattern with the alternative types insert VEC128 into VEC512
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v8i16x_info, v32i16_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v16i8x_info, v64i8_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
+// Codegen pattern with the alternative types insert VEC256 into VEC512
+defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v16i16x_info, v32i16_info,
+ vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
+defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info,
+ vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
+
// vinsertps - insert f32 to XMM
def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2, u8imm:$src3),
@@ -566,90 +623,158 @@ def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
// AVX-512 VECTOR EXTRACT
//---
+multiclass vextract_for_size_first_position_lowering<X86VectorVTInfo From,
+ X86VectorVTInfo To> {
+ // A subvector extract from the first vector position is
+ // a subregister copy that needs no instruction.
+ def NAME # To.NumElts:
+ Pat<(To.VT (extract_subvector (From.VT From.RC:$src),(iPTR 0))),
+ (To.VT (EXTRACT_SUBREG (From.VT From.RC:$src), To.SubRegIdx))>;
+}
+
multiclass vextract_for_size<int Opcode,
- X86VectorVTInfo From, X86VectorVTInfo To,
- X86VectorVTInfo AltFrom, X86VectorVTInfo AltTo,
- PatFrag vextract_extract,
- SDNodeXForm EXTRACT_get_vextract_imm> {
+ X86VectorVTInfo From, X86VectorVTInfo To,
+ PatFrag vextract_extract> :
+ vextract_for_size_first_position_lowering<From, To> {
+
let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
+ // use AVX512_maskable_in_asm (AVX512_maskable can't be used due to
+ // vextract_extract), we interesting only in patterns without mask,
+ // intrinsics pattern match generated bellow.
defm rr : AVX512_maskable_in_asm<Opcode, MRMDestReg, To, (outs To.RC:$dst),
- (ins VR512:$src1, u8imm:$idx),
- "vextract" # To.EltTypeName # "x4",
+ (ins From.RC:$src1, i32u8imm:$idx),
+ "vextract" # To.EltTypeName # "x" # To.NumElts,
"$idx, $src1", "$src1, $idx",
- [(set To.RC:$dst, (vextract_extract:$idx (From.VT VR512:$src1),
+ [(set To.RC:$dst, (vextract_extract:$idx (From.VT From.RC:$src1),
(iPTR imm)))]>,
- AVX512AIi8Base, EVEX, EVEX_V512;
- let mayStore = 1 in
- def rm : AVX512AIi8<Opcode, MRMDestMem, (outs),
- (ins To.MemOp:$dst, VR512:$src1, u8imm:$src2),
- "vextract" # To.EltTypeName # "x4\t{$src2, $src1, $dst|"
- "$dst, $src1, $src2}",
- []>, EVEX, EVEX_V512, EVEX_CD8<To.EltSize, CD8VT4>;
- }
-
- // Codegen pattern with the alternative types, e.g. v8i64 -> v2i64 for
- // vextracti32x4
- def : Pat<(vextract_extract:$ext (AltFrom.VT VR512:$src1), (iPTR imm)),
- (AltTo.VT (!cast<Instruction>(NAME # To.EltSize # "x4rr")
- VR512:$src1,
- (EXTRACT_get_vextract_imm To.RC:$ext)))>;
-
- // A 128/256-bit subvector extract from the first 512-bit vector position is
- // a subregister copy that needs no instruction.
- def : Pat<(To.VT (extract_subvector (From.VT VR512:$src), (iPTR 0))),
- (To.VT
- (EXTRACT_SUBREG (From.VT VR512:$src), To.SubRegIdx))>;
-
- // And for the alternative types.
- def : Pat<(AltTo.VT (extract_subvector (AltFrom.VT VR512:$src), (iPTR 0))),
- (AltTo.VT
- (EXTRACT_SUBREG (AltFrom.VT VR512:$src), AltTo.SubRegIdx))>;
+ AVX512AIi8Base, EVEX;
+ let mayStore = 1 in {
+ def rm : AVX512AIi8<Opcode, MRMDestMem, (outs),
+ (ins To.MemOp:$dst, From.RC:$src1, i32u8imm:$src2),
+ "vextract" # To.EltTypeName # "x" # To.NumElts #
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, EVEX;
+
+ def rmk : AVX512AIi8<Opcode, MRMDestMem, (outs),
+ (ins To.MemOp:$dst, To.KRCWM:$mask,
+ From.RC:$src1, i32u8imm:$src2),
+ "vextract" # To.EltTypeName # "x" # To.NumElts #
+ "\t{$src2, $src1, $dst {${mask}}|"
+ "$dst {${mask}}, $src1, $src2}",
+ []>, EVEX_K, EVEX;
+ }//mayStore = 1
+ }
// Intrinsic call with masking.
def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName #
- "x4_512")
- VR512:$src1, (iPTR imm:$idx), To.RC:$src0, GR8:$mask),
- (!cast<Instruction>(NAME # To.EltSize # "x4rrk") To.RC:$src0,
- (v4i1 (COPY_TO_REGCLASS GR8:$mask, VK4WM)),
- VR512:$src1, imm:$idx)>;
+ "x" # To.NumElts # "_" # From.Size)
+ From.RC:$src1, (iPTR imm:$idx), To.RC:$src0, To.MRC:$mask),
+ (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
+ From.ZSuffix # "rrk")
+ To.RC:$src0,
+ (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM),
+ From.RC:$src1, imm:$idx)>;
// Intrinsic call with zero-masking.
def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName #
- "x4_512")
- VR512:$src1, (iPTR imm:$idx), To.ImmAllZerosV, GR8:$mask),
- (!cast<Instruction>(NAME # To.EltSize # "x4rrkz")
- (v4i1 (COPY_TO_REGCLASS GR8:$mask, VK4WM)),
- VR512:$src1, imm:$idx)>;
+ "x" # To.NumElts # "_" # From.Size)
+ From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, To.MRC:$mask),
+ (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
+ From.ZSuffix # "rrkz")
+ (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM),
+ From.RC:$src1, imm:$idx)>;
// Intrinsic call without masking.
def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName #
- "x4_512")
- VR512:$src1, (iPTR imm:$idx), To.ImmAllZerosV, (i8 -1)),
- (!cast<Instruction>(NAME # To.EltSize # "x4rr")
- VR512:$src1, imm:$idx)>;
+ "x" # To.NumElts # "_" # From.Size)
+ From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, (i8 -1)),
+ (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
+ From.ZSuffix # "rr")
+ From.RC:$src1, imm:$idx)>;
+}
+
+// Codegen pattern for the alternative types
+multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From,
+ X86VectorVTInfo To, PatFrag vextract_extract,
+ SDNodeXForm EXTRACT_get_vextract_imm, list<Predicate> p> :
+ vextract_for_size_first_position_lowering<From, To> {
+
+ let Predicates = p in
+ def : Pat<(vextract_extract:$ext (From.VT From.RC:$src1), (iPTR imm)),
+ (To.VT (!cast<Instruction>(InstrStr#"rr")
+ From.RC:$src1,
+ (EXTRACT_get_vextract_imm To.RC:$ext)))>;
}
-multiclass vextract_for_type<ValueType EltVT32, int Opcode32,
- ValueType EltVT64, int Opcode64> {
- defm NAME # "32x4" : vextract_for_size<Opcode32,
+multiclass vextract_for_type<ValueType EltVT32, int Opcode128,
+ ValueType EltVT64, int Opcode256> {
+ defm NAME # "32x4Z" : vextract_for_size<Opcode128,
X86VectorVTInfo<16, EltVT32, VR512>,
X86VectorVTInfo< 4, EltVT32, VR128X>,
+ vextract128_extract>,
+ EVEX_V512, EVEX_CD8<32, CD8VT4>;
+ defm NAME # "64x4Z" : vextract_for_size<Opcode256,
X86VectorVTInfo< 8, EltVT64, VR512>,
+ X86VectorVTInfo< 4, EltVT64, VR256X>,
+ vextract256_extract>,
+ VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>;
+ let Predicates = [HasVLX] in
+ defm NAME # "32x4Z256" : vextract_for_size<Opcode128,
+ X86VectorVTInfo< 8, EltVT32, VR256X>,
+ X86VectorVTInfo< 4, EltVT32, VR128X>,
+ vextract128_extract>,
+ EVEX_V256, EVEX_CD8<32, CD8VT4>;
+ let Predicates = [HasVLX, HasDQI] in
+ defm NAME # "64x2Z256" : vextract_for_size<Opcode128,
+ X86VectorVTInfo< 4, EltVT64, VR256X>,
X86VectorVTInfo< 2, EltVT64, VR128X>,
- vextract128_extract,
- EXTRACT_get_vextract128_imm>;
- defm NAME # "64x4" : vextract_for_size<Opcode64,
+ vextract128_extract>,
+ VEX_W, EVEX_V256, EVEX_CD8<64, CD8VT2>;
+ let Predicates = [HasDQI] in {
+ defm NAME # "64x2Z" : vextract_for_size<Opcode128,
X86VectorVTInfo< 8, EltVT64, VR512>,
- X86VectorVTInfo< 4, EltVT64, VR256X>,
+ X86VectorVTInfo< 2, EltVT64, VR128X>,
+ vextract128_extract>,
+ VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>;
+ defm NAME # "32x8Z" : vextract_for_size<Opcode256,
X86VectorVTInfo<16, EltVT32, VR512>,
- X86VectorVTInfo< 8, EltVT32, VR256>,
- vextract256_extract,
- EXTRACT_get_vextract256_imm>, VEX_W;
+ X86VectorVTInfo< 8, EltVT32, VR256X>,
+ vextract256_extract>,
+ EVEX_V512, EVEX_CD8<32, CD8VT8>;
+ }
}
defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b>;
defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b>;
+// extract_subvector codegen patterns with the alternative types.
+// Only add this if 64x2 and its friends are not supported natively via AVX512DQ.
+defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v8f64_info, v2f64x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>;
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v8i64_info, v2i64x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>;
+
+defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v16f32_info, v8f32x_info,
+ vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>;
+defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v16i32_info, v8i32x_info,
+ vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>;
+
+defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>;
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>;
+
+// Codegen pattern with the alternative types extract VEC128 from VEC512
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
+// Codegen pattern with the alternative types extract VEC256 from VEC512
+defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info,
+ vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
+defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info,
+ vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
+
// A 128-bit subvector insert to the first 512-bit vector position
// is a subregister copy that needs no instruction.
def : Pat<(insert_subvector undef, (v2i64 VR128X:$src), (iPTR 0)),
@@ -677,6 +802,10 @@ def : Pat<(insert_subvector undef, (v8i32 VR256X:$src), (iPTR 0)),
(INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)),
(INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(insert_subvector undef, (v16i16 VR256X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(insert_subvector undef, (v32i8 VR256X:$src), (iPTR 0)),
+ (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
// vextractps - extract 32 bits from XMM
def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst),
@@ -694,50 +823,49 @@ def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs),
//===---------------------------------------------------------------------===//
// AVX-512 BROADCAST
//---
-multiclass avx512_fp_broadcast<bits<8> opc, SDNode OpNode, RegisterClass SrcRC,
- ValueType svt, X86VectorVTInfo _> {
- defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins SrcRC:$src), "vbroadcast"## !subst("p", "s", _.Suffix),
- "$src", "$src", (_.VT (OpNode (svt SrcRC:$src)))>,
- T8PD, EVEX;
- let mayLoad = 1 in {
- defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.ScalarMemOp:$src),
- "vbroadcast"##!subst("p", "s", _.Suffix), "$src", "$src",
- (_.VT (OpNode (_.ScalarLdFrag addr:$src)))>,
- T8PD, EVEX;
- }
+multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> {
+
+ defm r : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.RC:$src), OpcodeStr, "$src", "$src",
+ (DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))>,
+ T8PD, EVEX;
+ let mayLoad = 1 in
+ defm m : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
+ (DestInfo.VT (X86VBroadcast
+ (SrcInfo.ScalarLdFrag addr:$src)))>,
+ T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>;
}
-multiclass avx512_fp_broadcast_vl<bits<8> opc, SDNode OpNode,
- AVX512VLVectorVTInfo _> {
- defm Z : avx512_fp_broadcast<opc, OpNode, VR128X, _.info128.VT, _.info512>,
+multiclass avx512_fp_broadcast_vl<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _> {
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>,
EVEX_V512;
let Predicates = [HasVLX] in {
- defm Z256 : avx512_fp_broadcast<opc, OpNode, VR128X, _.info128.VT, _.info256>,
- EVEX_V256;
+ defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>,
+ EVEX_V256;
}
}
let ExeDomain = SSEPackedSingle in {
- defm VBROADCASTSS : avx512_fp_broadcast_vl<0x18, X86VBroadcast,
- avx512vl_f32_info>, EVEX_CD8<32, CD8VT1>;
+ defm VBROADCASTSS : avx512_fp_broadcast_vl<0x18, "vbroadcastss",
+ avx512vl_f32_info>;
let Predicates = [HasVLX] in {
- defm VBROADCASTSSZ128 : avx512_fp_broadcast<0x18, X86VBroadcast, VR128X,
- v4f32, v4f32x_info>, EVEX_V128,
- EVEX_CD8<32, CD8VT1>;
+ defm VBROADCASTSSZ128 : avx512_broadcast_rm<0x18, "vbroadcastss",
+ v4f32x_info, v4f32x_info>, EVEX_V128;
}
}
let ExeDomain = SSEPackedDouble in {
- defm VBROADCASTSD : avx512_fp_broadcast_vl<0x19, X86VBroadcast,
- avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VT1>;
+ defm VBROADCASTSD : avx512_fp_broadcast_vl<0x19, "vbroadcastsd",
+ avx512vl_f64_info>, VEX_W;
}
// avx512_broadcast_pat introduces patterns for broadcast with a scalar argument.
-// Later, we can canonize broadcast instructions before ISel phase and
+// Later, we can canonize broadcast instructions before ISel phase and
// eliminate additional patterns on ISel.
// SrcRC_v and SrcRC_s are RegisterClasses for vector and scalar
// representations of source
@@ -834,70 +962,50 @@ def : Pat<(v8i64 (int_x86_avx512_mask_pbroadcast_q_gpr_512 (i64 GR64:$src),
(bc_v8i64 (v16i32 immAllZerosV)), (i8 GR8:$mask))),
(VPBROADCASTQrZrkz (COPY_TO_REGCLASS GR8:$mask, VK8WM), GR64:$src)>;
-multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr,
- X86MemOperand x86memop, PatFrag ld_frag,
- RegisterClass DstRC, ValueType OpVT, ValueType SrcVT,
- RegisterClass KRC> {
- def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins VR128X:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set DstRC:$dst,
- (OpVT (X86VBroadcast (SrcVT VR128X:$src))))]>, EVEX;
- def rrk : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask,
- VR128X:$src),
- !strconcat(OpcodeStr,
- "\t{$src, ${dst} {${mask}} |${dst} {${mask}}, $src}"),
- []>, EVEX, EVEX_K;
- def rrkz : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask,
- VR128X:$src),
- !strconcat(OpcodeStr,
- "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
- []>, EVEX, EVEX_KZ;
- let mayLoad = 1 in {
- def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set DstRC:$dst,
- (OpVT (X86VBroadcast (ld_frag addr:$src))))]>, EVEX;
- def rmk : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask,
- x86memop:$src),
- !strconcat(OpcodeStr,
- "\t{$src, ${dst} {${mask}}|${dst} {${mask}} , $src}"),
- []>, EVEX, EVEX_K;
- def rmkz : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask,
- x86memop:$src),
- !strconcat(OpcodeStr,
- "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
- [(set DstRC:$dst, (OpVT (vselect KRC:$mask,
- (X86VBroadcast (ld_frag addr:$src)),
- (OpVT (bitconvert (v16i32 immAllZerosV))))))]>, EVEX, EVEX_KZ;
- }
-}
-
-defm VPBROADCASTDZ : avx512_int_broadcast_rm<0x58, "vpbroadcastd", i32mem,
- loadi32, VR512, v16i32, v4i32, VK16WM>,
- EVEX_V512, EVEX_CD8<32, CD8VT1>;
-defm VPBROADCASTQZ : avx512_int_broadcast_rm<0x59, "vpbroadcastq", i64mem,
- loadi64, VR512, v8i64, v2i64, VK8WM>, EVEX_V512, VEX_W,
- EVEX_CD8<64, CD8VT1>;
+// Provide aliases for broadcast from the same register class that
+// automatically does the extract.
+multiclass avx512_int_broadcast_rm_lowering<X86VectorVTInfo DestInfo,
+ X86VectorVTInfo SrcInfo> {
+ def : Pat<(DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))),
+ (!cast<Instruction>(NAME#DestInfo.ZSuffix#"r")
+ (EXTRACT_SUBREG (SrcInfo.VT SrcInfo.RC:$src), sub_xmm))>;
+}
+
+multiclass avx512_int_broadcast_rm_vl<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _, Predicate prd> {
+ let Predicates = [prd] in {
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>,
+ avx512_int_broadcast_rm_lowering<_.info512, _.info256>,
+ EVEX_V512;
+ // Defined separately to avoid redefinition.
+ defm Z_Alt : avx512_int_broadcast_rm_lowering<_.info512, _.info512>;
+ }
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>,
+ avx512_int_broadcast_rm_lowering<_.info256, _.info256>,
+ EVEX_V256;
+ defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, _.info128, _.info128>,
+ EVEX_V128;
+ }
+}
+
+defm VPBROADCASTB : avx512_int_broadcast_rm_vl<0x78, "vpbroadcastb",
+ avx512vl_i8_info, HasBWI>;
+defm VPBROADCASTW : avx512_int_broadcast_rm_vl<0x79, "vpbroadcastw",
+ avx512vl_i16_info, HasBWI>;
+defm VPBROADCASTD : avx512_int_broadcast_rm_vl<0x58, "vpbroadcastd",
+ avx512vl_i32_info, HasAVX512>;
+defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq",
+ avx512vl_i64_info, HasAVX512>, VEX_W;
multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _Dst, X86VectorVTInfo _Src> {
- let mayLoad = 1 in {
- def rm : AVX5128I<opc, MRMSrcMem, (outs _Dst.RC:$dst), (ins _Src.MemOp:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set _Dst.RC:$dst,
- (_Dst.VT (X86SubVBroadcast
- (_Src.VT (bitconvert (_Src.LdFrag addr:$src))))))]>, EVEX;
- def rmk : AVX5128I<opc, MRMSrcMem, (outs _Dst.RC:$dst), (ins _Dst.KRCWM:$mask,
- _Src.MemOp:$src),
- !strconcat(OpcodeStr,
- "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"),
- []>, EVEX, EVEX_K;
- def rmkz : AVX5128I<opc, MRMSrcMem, (outs _Dst.RC:$dst), (ins _Dst.KRCWM:$mask,
- _Src.MemOp:$src),
- !strconcat(OpcodeStr,
- "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
- []>, EVEX, EVEX_KZ;
- }
+ let mayLoad = 1 in
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
+ (_Dst.VT (X86SubVBroadcast
+ (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>,
+ AVX5128IBase, EVEX;
}
defm VBROADCASTI32X4 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
@@ -944,10 +1052,45 @@ defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf32x8",
EVEX_V512, EVEX_CD8<32, CD8VT8>;
}
-def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_512 (v4i32 VR128X:$src))),
- (VPBROADCASTDZrr VR128X:$src)>;
-def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_512 (v2i64 VR128X:$src))),
- (VPBROADCASTQZrr VR128X:$src)>;
+multiclass avx512_broadcast_32x2<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _Dst, X86VectorVTInfo _Src,
+ SDNode OpNode = X86SubVBroadcast> {
+
+ defm r : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.RC:$src), OpcodeStr, "$src", "$src",
+ (_Dst.VT (OpNode (_Src.VT _Src.RC:$src)))>,
+ T8PD, EVEX;
+ let mayLoad = 1 in
+ defm m : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
+ (_Dst.VT (OpNode
+ (_Src.VT (scalar_to_vector(loadi64 addr:$src)))))>,
+ T8PD, EVEX, EVEX_CD8<_Src.EltSize, CD8VT2>;
+}
+
+multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _> {
+ let Predicates = [HasDQI] in
+ defm Z : avx512_broadcast_32x2<opc, OpcodeStr, _.info512, _.info128>,
+ EVEX_V512;
+ let Predicates = [HasDQI, HasVLX] in
+ defm Z256 : avx512_broadcast_32x2<opc, OpcodeStr, _.info256, _.info128>,
+ EVEX_V256;
+}
+
+multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _> :
+ avx512_common_broadcast_32x2<opc, OpcodeStr, _> {
+
+ let Predicates = [HasDQI, HasVLX] in
+ defm Z128 : avx512_broadcast_32x2<opc, OpcodeStr, _.info128, _.info128,
+ X86SubV32x2Broadcast>, EVEX_V128;
+}
+
+defm VPBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2",
+ avx512vl_i32_info>;
+defm VPBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2",
+ avx512vl_f32_info>;
def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))),
(VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>;
@@ -959,21 +1102,6 @@ def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))),
def : Pat<(v8f64 (X86VBroadcast (v4f64 VR256X:$src))),
(VBROADCASTSDZr (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm))>;
-def : Pat<(v16i32 (X86VBroadcast (v16i32 VR512:$src))),
- (VPBROADCASTDZrr (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>;
-def : Pat<(v16i32 (X86VBroadcast (v8i32 VR256X:$src))),
- (VPBROADCASTDZrr (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm))>;
-
-def : Pat<(v8i64 (X86VBroadcast (v8i64 VR512:$src))),
- (VPBROADCASTQZrr (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>;
-def : Pat<(v8i64 (X86VBroadcast (v4i64 VR256X:$src))),
- (VPBROADCASTQZrr (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm))>;
-
-def : Pat<(v16f32 (int_x86_avx512_vbroadcast_ss_ps_512 (v4f32 VR128X:$src))),
- (VBROADCASTSSZr VR128X:$src)>;
-def : Pat<(v8f64 (int_x86_avx512_vbroadcast_sd_pd_512 (v2f64 VR128X:$src))),
- (VBROADCASTSDZr VR128X:$src)>;
-
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
def : Pat<(v16f32 (X86VBroadcast FR32X:$src)),
@@ -985,170 +1113,178 @@ def : Pat<(v8f64 (X86VBroadcast FR64X:$src)),
//===----------------------------------------------------------------------===//
// AVX-512 BROADCAST MASK TO VECTOR REGISTER
//---
-
-multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr,
- RegisterClass KRC> {
-let Predicates = [HasCDI] in
-def Zrr : AVX512XS8I<opc, MRMSrcReg, (outs VR512:$dst), (ins KRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- []>, EVEX, EVEX_V512;
-
-let Predicates = [HasCDI, HasVLX] in {
-def Z128rr : AVX512XS8I<opc, MRMSrcReg, (outs VR128:$dst), (ins KRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- []>, EVEX, EVEX_V128;
-def Z256rr : AVX512XS8I<opc, MRMSrcReg, (outs VR256:$dst), (ins KRC:$src),
+multiclass avx512_mask_broadcastm<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _, RegisterClass KRC> {
+ def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.RC:$dst), (ins KRC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- []>, EVEX, EVEX_V256;
+ [(set _.RC:$dst, (_.VT (X86VBroadcastm KRC:$src)))]>, EVEX;
}
+
+multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo, RegisterClass KRC> {
+ let Predicates = [HasCDI] in
+ defm Z : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info512, KRC>, EVEX_V512;
+ let Predicates = [HasCDI, HasVLX] in {
+ defm Z256 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info256, KRC>, EVEX_V256;
+ defm Z128 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info128, KRC>, EVEX_V128;
+ }
}
-let Predicates = [HasCDI] in {
defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d",
- VK16>;
+ avx512vl_i32_info, VK16>;
defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q",
- VK8>, VEX_W;
-}
+ avx512vl_i64_info, VK8>, VEX_W;
//===----------------------------------------------------------------------===//
-// AVX-512 - VPERM
-//
-// -- immediate form --
-multiclass avx512_perm_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _> {
- let ExeDomain = _.ExeDomain in {
- def ri : AVX512AIi8<opc, MRMSrcReg, (outs _.RC:$dst),
- (ins _.RC:$src1, u8imm:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.RC:$dst,
- (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>,
- EVEX;
- def mi : AVX512AIi8<opc, MRMSrcMem, (outs _.RC:$dst),
- (ins _.MemOp:$src1, u8imm:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.RC:$dst,
- (_.VT (OpNode (_.LdFrag addr:$src1),
- (i8 imm:$src2))))]>,
- EVEX, EVEX_CD8<_.EltSize, CD8VF>;
+// -- VPERMI2 - 3 source operands form --
+multiclass avx512_perm_i<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
+let Constraints = "$src1 = $dst" in {
+ defm rr: AVX512_maskable_3src_cast<opc, MRMSrcReg, _, IdxVT, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (X86VPermi2X IdxVT.RC:$src1, _.RC:$src2, _.RC:$src3))>, EVEX_4V,
+ AVX5128IBase;
+
+ let mayLoad = 1 in
+ defm rm: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (X86VPermi2X IdxVT.RC:$src1, _.RC:$src2,
+ (_.VT (bitconvert (_.LdFrag addr:$src3)))))>,
+ EVEX_4V, AVX5128IBase;
+ }
}
+multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
+ let mayLoad = 1, Constraints = "$src1 = $dst" in
+ defm rmb: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3),
+ OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
+ !strconcat("$src2, ${src3}", _.BroadcastStr ),
+ (_.VT (X86VPermi2X IdxVT.RC:$src1,
+ _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>,
+ AVX5128IBase, EVEX_4V, EVEX_B;
}
-multiclass avx512_permil<bits<8> OpcImm, bits<8> OpcVar, X86VectorVTInfo _,
- X86VectorVTInfo Ctrl> :
- avx512_perm_imm<OpcImm, "vpermil" # _.Suffix, X86VPermilpi, _> {
- let ExeDomain = _.ExeDomain in {
- def rr : AVX5128I<OpcVar, MRMSrcReg, (outs _.RC:$dst),
- (ins _.RC:$src1, _.RC:$src2),
- !strconcat("vpermil" # _.Suffix,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.RC:$dst,
- (_.VT (X86VPermilpv _.RC:$src1,
- (Ctrl.VT Ctrl.RC:$src2))))]>,
- EVEX_4V;
- def rm : AVX5128I<OpcVar, MRMSrcMem, (outs _.RC:$dst),
- (ins _.RC:$src1, Ctrl.MemOp:$src2),
- !strconcat("vpermil" # _.Suffix,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.RC:$dst,
- (_.VT (X86VPermilpv _.RC:$src1,
- (Ctrl.VT (Ctrl.LdFrag addr:$src2)))))]>,
- EVEX_4V;
- }
-}
-defm VPERMILPSZ : avx512_permil<0x04, 0x0C, v16f32_info, v16i32_info>,
- EVEX_V512;
-defm VPERMILPDZ : avx512_permil<0x05, 0x0D, v8f64_info, v8i64_info>,
- EVEX_V512, VEX_W;
-
-def : Pat<(v16i32 (X86VPermilpi VR512:$src1, (i8 imm:$imm))),
- (VPERMILPSZri VR512:$src1, imm:$imm)>;
-def : Pat<(v8i64 (X86VPermilpi VR512:$src1, (i8 imm:$imm))),
- (VPERMILPDZri VR512:$src1, imm:$imm)>;
-
-// -- VPERM2I - 3 source operands form --
-multiclass avx512_perm_3src<bits<8> opc, string OpcodeStr,
- SDNode OpNode, X86VectorVTInfo _> {
+multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo,
+ AVX512VLVectorVTInfo ShuffleMask> {
+ defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512,
+ ShuffleMask.info512>,
+ avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info512,
+ ShuffleMask.info512>, EVEX_V512;
+ let Predicates = [HasVLX] in {
+ defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128,
+ ShuffleMask.info128>,
+ avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info128,
+ ShuffleMask.info128>, EVEX_V128;
+ defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256,
+ ShuffleMask.info256>,
+ avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info256,
+ ShuffleMask.info256>, EVEX_V256;
+ }
+}
+
+multiclass avx512_perm_i_sizes_w<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo,
+ AVX512VLVectorVTInfo Idx> {
+ let Predicates = [HasBWI] in
+ defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512,
+ Idx.info512>, EVEX_V512;
+ let Predicates = [HasBWI, HasVLX] in {
+ defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128,
+ Idx.info128>, EVEX_V128;
+ defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256,
+ Idx.info256>, EVEX_V256;
+ }
+}
+
+defm VPERMI2D : avx512_perm_i_sizes<0x76, "vpermi2d",
+ avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VPERMI2Q : avx512_perm_i_sizes<0x76, "vpermi2q",
+ avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPERMI2W : avx512_perm_i_sizes_w<0x75, "vpermi2w",
+ avx512vl_i16_info, avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>;
+defm VPERMI2PS : avx512_perm_i_sizes<0x77, "vpermi2ps",
+ avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd",
+ avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// VPERMT2
+multiclass avx512_perm_t<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
let Constraints = "$src1 = $dst" in {
defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src2, _.RC:$src3),
+ (ins IdxVT.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>, EVEX_4V,
+ (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3))>, EVEX_4V,
AVX5128IBase;
let mayLoad = 1 in
defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src2, _.MemOp:$src3),
+ (ins IdxVT.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2,
- (_.VT (bitconvert (_.LdFrag addr:$src3)))))>,
+ (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2,
+ (bitconvert (_.LdFrag addr:$src3))))>,
EVEX_4V, AVX5128IBase;
}
}
-multiclass avx512_perm_3src_mb<bits<8> opc, string OpcodeStr,
- SDNode OpNode, X86VectorVTInfo _> {
+multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
let mayLoad = 1, Constraints = "$src1 = $dst" in
defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src2, _.ScalarMemOp:$src3),
+ (ins IdxVT.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
- (_.VT (OpNode _.RC:$src1,
- _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>,
+ (_.VT (X86VPermt2 _.RC:$src1,
+ IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>,
AVX5128IBase, EVEX_4V, EVEX_B;
}
-multiclass avx512_perm_3src_sizes<bits<8> opc, string OpcodeStr,
- SDNode OpNode, AVX512VLVectorVTInfo VTInfo> {
- let Predicates = [HasAVX512] in
- defm NAME: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info512>,
- avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512;
+multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo,
+ AVX512VLVectorVTInfo ShuffleMask> {
+ defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512,
+ ShuffleMask.info512>,
+ avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info512,
+ ShuffleMask.info512>, EVEX_V512;
let Predicates = [HasVLX] in {
- defm NAME#128: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info128>,
- avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info128>,
- EVEX_V128;
- defm NAME#256: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info256>,
- avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info256>,
- EVEX_V256;
+ defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128,
+ ShuffleMask.info128>,
+ avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info128,
+ ShuffleMask.info128>, EVEX_V128;
+ defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256,
+ ShuffleMask.info256>,
+ avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info256,
+ ShuffleMask.info256>, EVEX_V256;
}
}
-multiclass avx512_perm_3src_sizes_w<bits<8> opc, string OpcodeStr,
- SDNode OpNode, AVX512VLVectorVTInfo VTInfo> {
+
+multiclass avx512_perm_t_sizes_w<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo VTInfo,
+ AVX512VLVectorVTInfo Idx> {
let Predicates = [HasBWI] in
- defm NAME: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info512>,
- avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info512>,
- EVEX_V512;
+ defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512,
+ Idx.info512>, EVEX_V512;
let Predicates = [HasBWI, HasVLX] in {
- defm NAME#128: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info128>,
- avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info128>,
- EVEX_V128;
- defm NAME#256: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info256>,
- avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info256>,
- EVEX_V256;
- }
-}
-defm VPERMI2D : avx512_perm_3src_sizes<0x76, "vpermi2d", X86VPermiv3,
- avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
-defm VPERMI2Q : avx512_perm_3src_sizes<0x76, "vpermi2q", X86VPermiv3,
- avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPERMI2PS : avx512_perm_3src_sizes<0x77, "vpermi2ps", X86VPermiv3,
- avx512vl_f32_info>, EVEX_CD8<32, CD8VF>;
-defm VPERMI2PD : avx512_perm_3src_sizes<0x77, "vpermi2pd", X86VPermiv3,
- avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
-
-defm VPERMT2D : avx512_perm_3src_sizes<0x7E, "vpermt2d", X86VPermv3,
- avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
-defm VPERMT2Q : avx512_perm_3src_sizes<0x7E, "vpermt2q", X86VPermv3,
- avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPERMT2PS : avx512_perm_3src_sizes<0x7F, "vpermt2ps", X86VPermv3,
- avx512vl_f32_info>, EVEX_CD8<32, CD8VF>;
-defm VPERMT2PD : avx512_perm_3src_sizes<0x7F, "vpermt2pd", X86VPermv3,
- avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
-
-defm VPERMT2W : avx512_perm_3src_sizes_w<0x7D, "vpermt2w", X86VPermv3,
- avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>;
-defm VPERMI2W : avx512_perm_3src_sizes_w<0x75, "vpermi2w", X86VPermiv3,
- avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>;
+ defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128,
+ Idx.info128>, EVEX_V128;
+ defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256,
+ Idx.info256>, EVEX_V256;
+ }
+}
+
+defm VPERMT2D : avx512_perm_t_sizes<0x7E, "vpermt2d",
+ avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VPERMT2Q : avx512_perm_t_sizes<0x7E, "vpermt2q",
+ avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPERMT2W : avx512_perm_t_sizes_w<0x7D, "vpermt2w",
+ avx512vl_i16_info, avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>;
+defm VPERMT2PS : avx512_perm_t_sizes<0x7F, "vpermt2ps",
+ avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd",
+ avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
//===----------------------------------------------------------------------===//
// AVX-512 - BLEND using mask
@@ -1265,41 +1401,85 @@ def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1),
//===----------------------------------------------------------------------===//
// avx512_cmp_scalar - AVX512 CMPSS and CMPSD
-multiclass avx512_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
- SDNode OpNode, ValueType VT,
- PatFrag ld_frag, string Suffix> {
- def rr : AVX512Ii8<0xC2, MRMSrcReg,
- (outs VK1:$dst), (ins RC:$src1, RC:$src2, AVXCC:$cc),
- !strconcat("vcmp${cc}", Suffix,
+
+multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>{
+
+ defm rr_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
+ "vcmp${cc}"#_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ imm:$cc)>, EVEX_4V;
+ let mayLoad = 1 in
+ defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc),
+ "vcmp${cc}"#_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
+ imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
+
+ defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
+ "vcmp${cc}"#_.Suffix,
+ "{sae}, $src2, $src1", "$src1, $src2,{sae}",
+ (OpNodeRnd (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ imm:$cc,
+ (i32 FROUND_NO_EXC))>, EVEX_4V, EVEX_B;
+ // Accept explicit immediate argument form instead of comparison code.
+ let isAsmParserOnly = 1, hasSideEffects = 0 in {
+ defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
+ (outs VK1:$dst),
+ (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, $src2, $src1", "$src1, $src2, $cc">, EVEX_4V;
+ defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc, $src2, $src1", "$src1, $src2, $cc">,
+ EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
+
+ defm rrb_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
+ (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ "vcmp"#_.Suffix,
+ "$cc,{sae}, $src2, $src1","$src1, $src2,{sae}, $cc">,
+ EVEX_4V, EVEX_B;
+ }// let isAsmParserOnly = 1, hasSideEffects = 0
+
+ let isCodeGenOnly = 1 in {
+ def rr : AVX512Ii8<0xC2, MRMSrcReg,
+ (outs _.KRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, AVXCC:$cc),
+ !strconcat("vcmp${cc}", _.Suffix,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VK1:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))],
+ [(set _.KRC:$dst, (OpNode _.FRC:$src1,
+ _.FRC:$src2,
+ imm:$cc))],
IIC_SSE_ALU_F32S_RR>, EVEX_4V;
- def rm : AVX512Ii8<0xC2, MRMSrcMem,
- (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, AVXCC:$cc),
- !strconcat("vcmp${cc}", Suffix,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VK1:$dst, (OpNode (VT RC:$src1),
- (ld_frag addr:$src2), imm:$cc))], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
- let isAsmParserOnly = 1, hasSideEffects = 0 in {
- def rri_alt : AVX512Ii8<0xC2, MRMSrcReg,
- (outs VK1:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc),
- !strconcat("vcmp", Suffix,
- "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
- [], IIC_SSE_ALU_F32S_RR>, EVEX_4V;
let mayLoad = 1 in
- def rmi_alt : AVX512Ii8<0xC2, MRMSrcMem,
- (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc),
- !strconcat("vcmp", Suffix,
- "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
- [], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+ def rm : AVX512Ii8<0xC2, MRMSrcMem,
+ (outs _.KRC:$dst),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
+ !strconcat("vcmp${cc}", _.Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.KRC:$dst, (OpNode _.FRC:$src1,
+ (_.ScalarLdFrag addr:$src2),
+ imm:$cc))],
+ IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
}
}
let Predicates = [HasAVX512] in {
-defm VCMPSSZ : avx512_cmp_scalar<FR32X, f32mem, X86cmpms, f32, loadf32, "ss">,
- XS;
-defm VCMPSDZ : avx512_cmp_scalar<FR64X, f64mem, X86cmpms, f64, loadf64, "sd">,
- XD, VEX_W;
+ defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsRnd>,
+ AVX512XSIi8Base;
+ defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsRnd>,
+ AVX512XDIi8Base, VEX_W;
}
multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -1700,6 +1880,128 @@ def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
(v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
imm:$cc), VK8)>;
+// ----------------------------------------------------------------
+// FPClass
+// Handle the fpclass instruction: mask = op(reg_scalar, imm)
+//                                 mask = op(mem_scalar, imm)
+multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, Predicate prd> {
+ let Predicates = [prd] in {
+ def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
+ (ins _.RC:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst | $dst, $src1, $src2}",
+ [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1),
+ (i32 imm:$src2)))], NoItinerary>;
+ def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix#
+ "\t{$src2, $src1, $dst {${mask}} | $dst {${mask}}, $src1, $src2}",
+ [(set _.KRC:$dst,(or _.KRCWM:$mask,
+ (OpNode (_.VT _.RC:$src1),
+ (i32 imm:$src2))))], NoItinerary>, EVEX_K;
+ let mayLoad = 1, AddedComplexity = 20 in {
+ def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.MemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##
+ "\t{$src2, $src1, $dst | $dst, $src1, $src2}",
+ [(set _.KRC:$dst,
+ (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i32 imm:$src2)))], NoItinerary>;
+ def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##
+ "\t{$src2, $src1, $dst {${mask}} | $dst {${mask}}, $src1, $src2}",
+ [(set _.KRC:$dst,(or _.KRCWM:$mask,
+ (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i32 imm:$src2))))], NoItinerary>, EVEX_K;
+ }
+ }
+}
+
+// Handle the fpclass instruction: mask = fpclass(reg_vec, reg_vec, imm)
+//                                        fpclass(reg_vec, mem_vec, imm)
+//                                        fpclass(reg_vec, broadcast(eltVt), imm)
+multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, string mem, string broadcast>{
+ def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
+ (ins _.RC:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst | $dst, $src1, $src2}",
+ [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1),
+ (i32 imm:$src2)))], NoItinerary>;
+ def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix#
+ "\t{$src2, $src1, $dst {${mask}}| $dst {${mask}}, $src1, $src2}",
+ [(set _.KRC:$dst,(or _.KRCWM:$mask,
+ (OpNode (_.VT _.RC:$src1),
+ (i32 imm:$src2))))], NoItinerary>, EVEX_K;
+ let mayLoad = 1 in {
+ def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.MemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##mem#
+ "\t{$src2, $src1, $dst | $dst, $src1, $src2}",
+ [(set _.KRC:$dst,(OpNode
+ (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i32 imm:$src2)))], NoItinerary>;
+ def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##mem#
+ "\t{$src2, $src1, $dst {${mask}} | $dst {${mask}}, $src1, $src2}",
+ [(set _.KRC:$dst, (or _.KRCWM:$mask, (OpNode
+ (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i32 imm:$src2))))], NoItinerary>, EVEX_K;
+ def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.ScalarMemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"##
+ _.BroadcastStr##", $dst | $dst, ${src1}"
+ ##_.BroadcastStr##", $src2}",
+ [(set _.KRC:$dst,(OpNode
+ (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src1))),
+ (i32 imm:$src2)))], NoItinerary>,EVEX_B;
+ def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"##
+ _.BroadcastStr##", $dst {${mask}} | $dst {${mask}}, ${src1}"##
+ _.BroadcastStr##", $src2}",
+ [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode
+ (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src1))),
+ (i32 imm:$src2))))], NoItinerary>,
+ EVEX_B, EVEX_K;
+ }
+}
+
+multiclass avx512_vector_fpclass_all<string OpcodeStr,
+ AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd,
+ string broadcast>{
+ let Predicates = [prd] in {
+ defm Z : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info512, "{z}",
+ broadcast>, EVEX_V512;
+ }
+ let Predicates = [prd, HasVLX] in {
+ defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info128, "{x}",
+ broadcast>, EVEX_V128;
+ defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info256, "{y}",
+ broadcast>, EVEX_V256;
+ }
+}
+
+multiclass avx512_fp_fpclass_all<string OpcodeStr, bits<8> opcVec,
+ bits<8> opcScalar, SDNode VecOpNode, SDNode ScalarOpNode, Predicate prd>{
+ defm PS : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f32_info, opcVec,
+ VecOpNode, prd, "{l}">, EVEX_CD8<32, CD8VF>;
+ defm PD : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f64_info, opcVec,
+ VecOpNode, prd, "{q}">, EVEX_CD8<64, CD8VF>, VEX_W;
+ defm SS : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode,
+ f32x_info, prd>, EVEX_CD8<32, CD8VT1>;
+ defm SD : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode,
+ f64x_info, prd>, EVEX_CD8<64, CD8VT1>, VEX_W;
+}
+
+defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass,
+ X86Vfpclasss, HasDQI>, AVX512AIi8Base, EVEX;
+
//-----------------------------------------------------------------
// Mask register copy, including
// - copy between mask registers
@@ -1786,6 +2088,11 @@ let Predicates = [HasDQI] in {
(KMOVBmk addr:$dst, VK8:$src)>;
def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
(KMOVBkm addr:$src)>;
+
+ def : Pat<(store VK4:$src, addr:$dst),
+ (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK4:$src, VK8))>;
+ def : Pat<(store VK2:$src, addr:$dst),
+ (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK2:$src, VK8))>;
}
let Predicates = [HasAVX512, NoDQI] in {
def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
@@ -1837,10 +2144,15 @@ let Predicates = [HasAVX512] in {
(AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>;
def : Pat<(i32 (anyext VK1:$src)),
(KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16))>;
+
def : Pat<(i8 (zext VK1:$src)),
(EXTRACT_SUBREG
(AND32ri (KMOVWrk
(COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_8bit)>;
+ def : Pat<(i8 (anyext VK1:$src)),
+ (EXTRACT_SUBREG
+ (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_8bit)>;
+
def : Pat<(i64 (zext VK1:$src)),
(AND64ri8 (SUBREG_TO_REG (i64 0),
(KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit), (i64 1))>;
@@ -1848,17 +2160,19 @@ let Predicates = [HasAVX512] in {
(EXTRACT_SUBREG
(AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)),
sub_16bit)>;
- def : Pat<(v16i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK16)>;
- def : Pat<(v8i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK8)>;
-}
-let Predicates = [HasBWI] in {
- def : Pat<(v32i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK32)>;
- def : Pat<(v64i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK64)>;
}
+def : Pat<(v16i1 (scalar_to_vector VK1:$src)),
+ (COPY_TO_REGCLASS VK1:$src, VK16)>;
+def : Pat<(v8i1 (scalar_to_vector VK1:$src)),
+ (COPY_TO_REGCLASS VK1:$src, VK8)>;
+def : Pat<(v4i1 (scalar_to_vector VK1:$src)),
+ (COPY_TO_REGCLASS VK1:$src, VK4)>;
+def : Pat<(v2i1 (scalar_to_vector VK1:$src)),
+ (COPY_TO_REGCLASS VK1:$src, VK2)>;
+def : Pat<(v32i1 (scalar_to_vector VK1:$src)),
+ (COPY_TO_REGCLASS VK1:$src, VK32)>;
+def : Pat<(v64i1 (scalar_to_vector VK1:$src)),
+ (COPY_TO_REGCLASS VK1:$src, VK64)>;
// With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
@@ -1955,11 +2269,12 @@ multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr,
}
multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode, bit IsCommutable> {
+ SDPatternOperator OpNode, bit IsCommutable,
+ Predicate prdW = HasAVX512> {
defm B : avx512_mask_binop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode,
HasDQI, IsCommutable>, VEX_4V, VEX_L, PD;
defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode,
- HasAVX512, IsCommutable>, VEX_4V, VEX_L, PS;
+ prdW, IsCommutable>, VEX_4V, VEX_L, PS;
defm D : avx512_mask_binop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode,
HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PD;
defm Q : avx512_mask_binop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode,
@@ -1974,6 +2289,7 @@ defm KOR : avx512_mask_binop_all<0x45, "kor", or, 1>;
defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", xnor, 1>;
defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, 1>;
defm KANDN : avx512_mask_binop_all<0x42, "kandn", andn, 0>;
+defm KADD : avx512_mask_binop_all<0x4A, "kadd", add, 1, HasDQI>;
multiclass avx512_mask_binop_int<string IntName, string InstName> {
let Predicates = [HasAVX512] in
@@ -2047,59 +2363,48 @@ def : Pat<(xor (xor VK1:$src1, VK1:$src2), (i1 1)),
(COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>;
// Mask unpacking
-multiclass avx512_mask_unpck<bits<8> opc, string OpcodeStr,
- RegisterClass KRC> {
- let Predicates = [HasAVX512] in
- def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
-}
+multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT,
+ RegisterClass KRCSrc, Predicate prd> {
+ let Predicates = [prd] in {
+ def rr : I<0x4b, MRMSrcReg, (outs KRC:$dst),
+ (ins KRC:$src1, KRC:$src2),
+ "kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ VEX_4V, VEX_L;
-multiclass avx512_mask_unpck_bw<bits<8> opc, string OpcodeStr> {
- defm BW : avx512_mask_unpck<opc, !strconcat(OpcodeStr, "bw"), VK16>,
- VEX_4V, VEX_L, PD;
+ def : Pat<(VT (concat_vectors KRCSrc:$src1, KRCSrc:$src2)),
+ (!cast<Instruction>(NAME##rr)
+ (COPY_TO_REGCLASS KRCSrc:$src2, KRC),
+ (COPY_TO_REGCLASS KRCSrc:$src1, KRC))>;
+ }
}
-defm KUNPCK : avx512_mask_unpck_bw<0x4b, "kunpck">;
-def : Pat<(v16i1 (concat_vectors (v8i1 VK8:$src1), (v8i1 VK8:$src2))),
- (KUNPCKBWrr (COPY_TO_REGCLASS VK8:$src2, VK16),
- (COPY_TO_REGCLASS VK8:$src1, VK16))>;
-
-
-multiclass avx512_mask_unpck_int<string IntName, string InstName> {
- let Predicates = [HasAVX512] in
- def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_bw")
- (i16 GR16:$src1), (i16 GR16:$src2)),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"BWrr")
- (v16i1 (COPY_TO_REGCLASS GR16:$src1, VK16)),
- (v16i1 (COPY_TO_REGCLASS GR16:$src2, VK16))), GR16)>;
-}
-defm : avx512_mask_unpck_int<"kunpck", "KUNPCK">;
+defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, HasAVX512>, PD;
+defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, HasBWI>, PS;
+defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, HasBWI>, PS, VEX_W;
// Mask bit testing
multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
- SDNode OpNode> {
- let Predicates = [HasAVX512], Defs = [EFLAGS] in
+ SDNode OpNode, Predicate prd> {
+ let Predicates = [prd], Defs = [EFLAGS] in
def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>;
}
-multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode> {
- defm W : avx512_mask_testop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
- VEX, PS;
- let Predicates = [HasDQI] in
- defm B : avx512_mask_testop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode>,
- VEX, PD;
- let Predicates = [HasBWI] in {
- defm Q : avx512_mask_testop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode>,
- VEX, PS, VEX_W;
- defm D : avx512_mask_testop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode>,
- VEX, PD, VEX_W;
- }
+multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ Predicate prdW = HasAVX512> {
+ defm B : avx512_mask_testop<opc, OpcodeStr#"b", VK8, OpNode, HasDQI>,
+ VEX, PD;
+ defm W : avx512_mask_testop<opc, OpcodeStr#"w", VK16, OpNode, prdW>,
+ VEX, PS;
+ defm Q : avx512_mask_testop<opc, OpcodeStr#"q", VK64, OpNode, HasBWI>,
+ VEX, PS, VEX_W;
+ defm D : avx512_mask_testop<opc, OpcodeStr#"d", VK32, OpNode, HasBWI>,
+ VEX, PD, VEX_W;
}
defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>;
+defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest, HasDQI>;
// Mask shift
multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
@@ -2124,7 +2429,7 @@ multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr,
let Predicates = [HasDQI] in
defm D : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "d"), VK32, OpNode>,
VEX, TAPD;
- }
+ }
}
defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86vshli>;
@@ -2167,24 +2472,52 @@ def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))),
def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))),
(v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>;
+def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 0))),
+ (v16i1 (COPY_TO_REGCLASS VK32:$src, VK16))>;
+
+def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 16))),
+ (v16i1 (COPY_TO_REGCLASS (KSHIFTRDri VK32:$src, (i8 16)), VK16))>;
+
def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 0))),
(v32i1 (COPY_TO_REGCLASS VK64:$src, VK32))>;
def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 32))),
(v32i1 (COPY_TO_REGCLASS (KSHIFTRQri VK64:$src, (i8 32)), VK32))>;
-let Predicates = [HasVLX] in {
- def : Pat<(v8i1 (insert_subvector undef, (v4i1 VK4:$src), (iPTR 0))),
- (v8i1 (COPY_TO_REGCLASS VK4:$src, VK8))>;
- def : Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))),
- (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>;
- def : Pat<(v4i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))),
- (v4i1 (COPY_TO_REGCLASS VK2:$src, VK4))>;
- def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))),
- (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>;
- def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))),
- (v2i1 (COPY_TO_REGCLASS VK8:$src, VK2))>;
-}
+def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))),
+ (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>;
+
+def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))),
+ (v2i1 (COPY_TO_REGCLASS VK8:$src, VK2))>;
+
+def : Pat<(v4i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))),
+ (v4i1 (COPY_TO_REGCLASS VK2:$src, VK4))>;
+
+def : Pat<(v8i1 (insert_subvector undef, (v4i1 VK4:$src), (iPTR 0))),
+ (v8i1 (COPY_TO_REGCLASS VK4:$src, VK8))>;
+def : Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))),
+ (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>;
+
+def : Pat<(v32i1 (insert_subvector undef, VK2:$src, (iPTR 0))),
+ (v32i1 (COPY_TO_REGCLASS VK2:$src, VK32))>;
+def : Pat<(v32i1 (insert_subvector undef, VK4:$src, (iPTR 0))),
+ (v32i1 (COPY_TO_REGCLASS VK4:$src, VK32))>;
+def : Pat<(v32i1 (insert_subvector undef, VK8:$src, (iPTR 0))),
+ (v32i1 (COPY_TO_REGCLASS VK8:$src, VK32))>;
+def : Pat<(v32i1 (insert_subvector undef, VK16:$src, (iPTR 0))),
+ (v32i1 (COPY_TO_REGCLASS VK16:$src, VK32))>;
+
+def : Pat<(v64i1 (insert_subvector undef, VK2:$src, (iPTR 0))),
+ (v64i1 (COPY_TO_REGCLASS VK2:$src, VK64))>;
+def : Pat<(v64i1 (insert_subvector undef, VK4:$src, (iPTR 0))),
+ (v64i1 (COPY_TO_REGCLASS VK4:$src, VK64))>;
+def : Pat<(v64i1 (insert_subvector undef, VK8:$src, (iPTR 0))),
+ (v64i1 (COPY_TO_REGCLASS VK8:$src, VK64))>;
+def : Pat<(v64i1 (insert_subvector undef, VK16:$src, (iPTR 0))),
+ (v64i1 (COPY_TO_REGCLASS VK16:$src, VK64))>;
+def : Pat<(v64i1 (insert_subvector undef, VK32:$src, (iPTR 0))),
+ (v64i1 (COPY_TO_REGCLASS VK32:$src, VK64))>;
+
def : Pat<(v8i1 (X86vshli VK8:$src, (i8 imm:$imm))),
(v8i1 (COPY_TO_REGCLASS
@@ -2304,23 +2637,21 @@ multiclass avx512_load_vl<bits<8> opc, string OpcodeStr,
multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
PatFrag st_frag, PatFrag mstore> {
- let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
- def rr_alt : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src),
- OpcodeStr # "\t{$src, $dst|$dst, $src}", [],
- _.ExeDomain>, EVEX;
- let Constraints = "$src1 = $dst" in
- def rrk_alt : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),
- (ins _.RC:$src1, _.KRCWM:$mask, _.RC:$src2),
- OpcodeStr #
- "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}",
+
+ def rr_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src),
+ OpcodeStr # ".s\t{$src, $dst|$dst, $src}",
+ [], _.ExeDomain>, EVEX;
+ def rrk_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src),
+ OpcodeStr # ".s\t{$src, ${dst} {${mask}}|"#
+ "${dst} {${mask}}, $src}",
[], _.ExeDomain>, EVEX, EVEX_K;
- def rrkz_alt : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),
+ def rrkz_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src),
- OpcodeStr #
- "\t{$src, ${dst} {${mask}} {z}|" #
+ OpcodeStr # ".s\t{$src, ${dst} {${mask}} {z}|" #
"${dst} {${mask}} {z}, $src}",
[], _.ExeDomain>, EVEX, EVEX_KZ;
- }
+
let mayStore = 1 in {
def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
@@ -2425,22 +2756,6 @@ def: Pat<(int_x86_avx512_mask_store_pd_512 addr:$ptr, (v8f64 VR512:$src),
(VMOVAPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)),
VR512:$src)>;
-let Predicates = [HasAVX512, NoVLX] in {
-def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src)),
- (VMOVUPSZmrk addr:$ptr,
- (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)),
- (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>;
-
-def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, undef)),
- (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmkz
- (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>;
-
-def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src0))),
- (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmk
- (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src0, sub_ymm),
- (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>;
-}
-
defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info,
HasAVX512>,
avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info,
@@ -2502,17 +2817,6 @@ def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV),
(v16i32 VR512:$src))),
(VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>;
}
-// NoVLX patterns
-let Predicates = [HasAVX512, NoVLX] in {
-def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)),
- (VMOVDQU32Zmrk addr:$ptr,
- (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)),
- (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>;
-
-def: Pat<(v8i32 (masked_load addr:$ptr, VK8WM:$mask, undef)),
- (v8i32 (EXTRACT_SUBREG (v16i32 (VMOVDQU32Zrmkz
- (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>;
-}
// Move Int Doubleword to Packed Double Int
//
@@ -2520,32 +2824,37 @@ def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src
"vmovd\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
(v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
- EVEX, VEX_LIG;
+ EVEX;
def VMOVDI2PDIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
(v4i32 (scalar_to_vector (loadi32 addr:$src))))],
- IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
def VMOV64toPQIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
(v2i64 (scalar_to_vector GR64:$src)))],
- IIC_SSE_MOVDQ>, EVEX, VEX_W, VEX_LIG;
+ IIC_SSE_MOVDQ>, EVEX, VEX_W;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
+def VMOV64toPQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst),
+ (ins i64mem:$src),
+ "vmovq\t{$src, $dst|$dst, $src}", []>,
+ EVEX, VEX_W, EVEX_CD8<64, CD8VT1>;
let isCodeGenOnly = 1 in {
-def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src),
"vmovq\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (bitconvert GR64:$src))],
+ [(set FR64X:$dst, (bitconvert GR64:$src))],
IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
-def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (bitconvert FR64:$src))],
+ [(set GR64:$dst, (bitconvert FR64X:$src))],
IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
-}
-def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
- [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
+ [(store (i64 (bitconvert FR64X:$src)), addr:$dst)],
IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>,
EVEX_CD8<64, CD8VT1>;
+}
// Move Int Doubleword to Single Scalar
//
@@ -2553,27 +2862,27 @@ let isCodeGenOnly = 1 in {
def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(set FR32X:$dst, (bitconvert GR32:$src))],
- IIC_SSE_MOVDQ>, EVEX, VEX_LIG;
+ IIC_SSE_MOVDQ>, EVEX;
def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))],
- IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
}
// Move doubleword from xmm register to r/m32
//
def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
"vmovd\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (vector_extract (v4i32 VR128X:$src),
+ [(set GR32:$dst, (extractelt (v4i32 VR128X:$src),
(iPTR 0)))], IIC_SSE_MOVD_ToGP>,
- EVEX, VEX_LIG;
+ EVEX;
def VMOVPDI2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
(ins i32mem:$dst, VR128X:$src),
"vmovd\t{$src, $dst|$dst, $src}",
- [(store (i32 (vector_extract (v4i32 VR128X:$src),
+ [(store (i32 (extractelt (v4i32 VR128X:$src),
(iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
- EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ EVEX, EVEX_CD8<32, CD8VT1>;
// Move quadword from xmm1 register to r/m64
//
@@ -2581,16 +2890,28 @@ def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
(iPTR 0)))],
- IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_LIG, VEX_W,
+ IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W,
Requires<[HasAVX512, In64BitMode]>;
-def VMOVPQIto64Zmr : I<0xD6, MRMDestMem, (outs),
- (ins i64mem:$dst, VR128X:$src),
- "vmovq\t{$src, $dst|$dst, $src}",
- [(store (extractelt (v2i64 VR128X:$src), (iPTR 0)),
- addr:$dst)], IIC_SSE_MOVDQ>,
- EVEX, PD, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>,
- Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
+def VMOVPQIto64Zmr : I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128X:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [], IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W,
+ Requires<[HasAVX512, In64BitMode]>;
+
+def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs),
+ (ins i64mem:$dst, VR128X:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(store (extractelt (v2i64 VR128X:$src), (iPTR 0)),
+ addr:$dst)], IIC_SSE_MOVDQ>,
+ EVEX, PD, VEX_W, EVEX_CD8<64, CD8VT1>,
+ Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>;
+
+let hasSideEffects = 0 in
+def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst),
+ (ins VR128X:$src),
+ "vmovq.s\t{$src, $dst|$dst, $src}",[]>,
+ EVEX, VEX_W;
// Move Scalar Single to Double Int
//
@@ -2599,92 +2920,95 @@ def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst),
(ins FR32X:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (bitconvert FR32X:$src))],
- IIC_SSE_MOVD_ToGP>, EVEX, VEX_LIG;
+ IIC_SSE_MOVD_ToGP>, EVEX;
def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
(ins i32mem:$dst, FR32X:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(store (i32 (bitconvert FR32X:$src)), addr:$dst)],
- IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
}
// Move Quadword Int to Packed Quadword Int
//
-def VMOVQI2PQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst),
+def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
(ins i64mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
(v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
- EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+ EVEX, VEX_W, EVEX_CD8<8, CD8VT8>;
//===----------------------------------------------------------------------===//
// AVX-512 MOVSS, MOVSD
//===----------------------------------------------------------------------===//
-multiclass avx512_move_scalar <string asm, RegisterClass RC,
- SDNode OpNode, ValueType vt,
- X86MemOperand x86memop, PatFrag mem_pat> {
- let hasSideEffects = 0 in {
- def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2),
- !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128X:$dst, (vt (OpNode VR128X:$src1,
- (scalar_to_vector RC:$src2))))],
- IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG;
- let Constraints = "$src1 = $dst" in
- def rrk : SI<0x10, MRMSrcReg, (outs VR128X:$dst),
- (ins VR128X:$src1, VK1WM:$mask, RC:$src2, RC:$src3),
- !strconcat(asm,
- "\t{$src3, $src2, $dst {${mask}}|$dst {${mask}}, $src2, $src3}"),
- [], IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG, EVEX_K;
- def rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
- !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (mem_pat addr:$src))], IIC_SSE_MOV_S_RM>,
- EVEX, VEX_LIG;
+multiclass avx512_move_scalar <string asm, SDNode OpNode,
+ X86VectorVTInfo _> {
+ defm rr_Int : AVX512_maskable_scalar<0x10, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2),
+ asm, "$src2, $src1","$src1, $src2",
+ (_.VT (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2))),
+ IIC_SSE_MOV_S_RR>, EVEX_4V;
+ let Constraints = "$src1 = $dst" , mayLoad = 1 in
+ defm rm_Int : AVX512_maskable_3src_scalar<0x10, MRMSrcMem, _,
+ (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src),
+ asm,"$src","$src",
+ (_.VT (OpNode (_.VT _.RC:$src1),
+ (_.VT (scalar_to_vector
+ (_.ScalarLdFrag addr:$src)))))>, EVEX;
+ let isCodeGenOnly = 1 in {
+ def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.FRC:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1,
+ (scalar_to_vector _.FRC:$src2))))],
+ _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V;
+ let mayLoad = 1 in
+ def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))],
+ _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX;
+ }
let mayStore = 1 in {
- def mr: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
- !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
- EVEX, VEX_LIG;
- def mrk: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, VK1WM:$mask, RC:$src),
- !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
- [], IIC_SSE_MOV_S_MR>,
- EVEX, VEX_LIG, EVEX_K;
+ def mr: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.FRC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(store _.FRC:$src, addr:$dst)], _.ExeDomain, IIC_SSE_MOV_S_MR>,
+ EVEX;
+ def mrk: AVX512PI<0x11, MRMDestMem, (outs),
+ (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src),
+ !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
+ [], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX, EVEX_K;
} // mayStore
- } //hasSideEffects = 0
}
-let ExeDomain = SSEPackedSingle in
-defm VMOVSSZ : avx512_move_scalar<"movss", FR32X, X86Movss, v4f32, f32mem,
- loadf32>, XS, EVEX_CD8<32, CD8VT1>;
+defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>,
+ VEX_LIG, XS, EVEX_CD8<32, CD8VT1>;
-let ExeDomain = SSEPackedDouble in
-defm VMOVSDZ : avx512_move_scalar<"movsd", FR64X, X86Movsd, v2f64, f64mem,
- loadf64>, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>,
+ VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
def : Pat<(f32 (X86select VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))),
- (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X),
- VK1WM:$mask, (f32 (IMPLICIT_DEF)), FR32X:$src1), FR32X)>;
+ (COPY_TO_REGCLASS (VMOVSSZrr_Intk (COPY_TO_REGCLASS FR32X:$src2, VR128X),
+ VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),(COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>;
def : Pat<(f64 (X86select VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))),
- (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X),
- VK1WM:$mask, (f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>;
+ (COPY_TO_REGCLASS (VMOVSDZrr_Intk (COPY_TO_REGCLASS FR64X:$src2, VR128X),
+ VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>;
def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask),
(VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)),
(COPY_TO_REGCLASS VR128X:$src, FR32X))>;
-// For the disassembler
-let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
- def VMOVSSZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst),
- (ins VR128X:$src1, FR32X:$src2),
- "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
- IIC_SSE_MOV_S_RR>,
- XS, EVEX_4V, VEX_LIG;
- def VMOVSDZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst),
- (ins VR128X:$src1, FR64X:$src2),
- "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
- IIC_SSE_MOV_S_RR>,
- XD, EVEX_4V, VEX_LIG, VEX_W;
-}
+defm VMOVSSZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f32x_info,
+ (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2),
+ "vmovss.s", "$src2, $src1", "$src1, $src2", []>,
+ XS, EVEX_4V, VEX_LIG;
+
+defm VMOVSSDrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f64x_info,
+ (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2),
+ "vmovsd.s", "$src2, $src1", "$src1, $src2", []>,
+ XD, EVEX_4V, VEX_LIG, VEX_W;
let Predicates = [HasAVX512] in {
let AddedComplexity = 15 in {
@@ -2768,10 +3092,10 @@ let Predicates = [HasAVX512] in {
(EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>;
// Extract and store.
- def : Pat<(store (f32 (vector_extract (v4f32 VR128X:$src), (iPTR 0))),
+ def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))),
addr:$dst),
(VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>;
- def : Pat<(store (f64 (vector_extract (v2f64 VR128X:$src), (iPTR 0))),
+ def : Pat<(store (f64 (extractelt (v2f64 VR128X:$src), (iPTR 0))),
addr:$dst),
(VMOVSDZmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128X:$src), FR64X))>;
@@ -2835,7 +3159,7 @@ def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
(v2i64 VR128X:$src))))],
IIC_SSE_MOVQ_RR>, EVEX, VEX_W;
-let AddedComplexity = 20 in
+let AddedComplexity = 20 , isCodeGenOnly = 1 in
def VMOVZPQILo2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
(ins i128mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
@@ -2964,7 +3288,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, OpndItins itins,
bit IsCommutable = 0> {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2)),
itins.rr, IsCommutable>,
@@ -2972,7 +3296,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
let mayLoad = 1 in
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1,
(bitconvert (_.LdFrag addr:$src2)))),
@@ -2986,7 +3310,7 @@ multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
avx512_binop_rm<opc, OpcodeStr, OpNode, _, itins, IsCommutable> {
let mayLoad = 1 in
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
(_.VT (OpNode _.RC:$src1,
@@ -3058,20 +3382,20 @@ multiclass avx512_binop_rm_vl_b<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass avx512_binop_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
SDNode OpNode, OpndItins itins, Predicate prd,
bit IsCommutable = 0> {
- defm Q : avx512_binop_rm_vl_q<opc_q, OpcodeStr, OpNode, itins, prd,
+ defm Q : avx512_binop_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, itins, prd,
IsCommutable>;
- defm D : avx512_binop_rm_vl_d<opc_d, OpcodeStr, OpNode, itins, prd,
+ defm D : avx512_binop_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, itins, prd,
IsCommutable>;
}
multiclass avx512_binop_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr,
SDNode OpNode, OpndItins itins, Predicate prd,
bit IsCommutable = 0> {
- defm W : avx512_binop_rm_vl_w<opc_w, OpcodeStr, OpNode, itins, prd,
+ defm W : avx512_binop_rm_vl_w<opc_w, OpcodeStr#"w", OpNode, itins, prd,
IsCommutable>;
- defm B : avx512_binop_rm_vl_b<opc_b, OpcodeStr, OpNode, itins, prd,
+ defm B : avx512_binop_rm_vl_b<opc_b, OpcodeStr#"b", OpNode, itins, prd,
IsCommutable>;
}
@@ -3086,15 +3410,15 @@ multiclass avx512_binop_rm_vl_all<bits<8> opc_b, bits<8> opc_w,
}
multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins,
- SDNode OpNode,X86VectorVTInfo _Src,
+ SDNode OpNode,X86VectorVTInfo _Src,
X86VectorVTInfo _Dst, bit IsCommutable = 0> {
- defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst),
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr,
- "$src2, $src1","$src1, $src2",
- (_Dst.VT (OpNode
- (_Src.VT _Src.RC:$src1),
+ "$src2, $src1","$src1, $src2",
+ (_Dst.VT (OpNode
+ (_Src.VT _Src.RC:$src1),
(_Src.VT _Src.RC:$src2))),
- itins.rr, IsCommutable>,
+ itins.rr, IsCommutable>,
AVX512BIBase, EVEX_4V;
let mayLoad = 1 in {
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
@@ -3106,12 +3430,12 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins,
AVX512BIBase, EVEX_4V;
defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
- (ins _Src.RC:$src1, _Dst.ScalarMemOp:$src2),
+ (ins _Src.RC:$src1, _Dst.ScalarMemOp:$src2),
OpcodeStr,
"${src2}"##_Dst.BroadcastStr##", $src1",
"$src1, ${src2}"##_Dst.BroadcastStr,
- (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
- (_Dst.VT (X86VBroadcast
+ (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
+ (_Dst.VT (X86VBroadcast
(_Dst.ScalarLdFrag addr:$src2)))))),
itins.rm>,
AVX512BIBase, EVEX_4V, EVEX_B;
@@ -3127,24 +3451,24 @@ defm VPADDS : avx512_binop_rm_vl_bw<0xEC, 0xED, "vpadds", X86adds,
defm VPSUBS : avx512_binop_rm_vl_bw<0xE8, 0xE9, "vpsubs", X86subs,
SSE_INTALU_ITINS_P, HasBWI, 0>;
defm VPADDUS : avx512_binop_rm_vl_bw<0xDC, 0xDD, "vpaddus", X86addus,
- SSE_INTALU_ITINS_P, HasBWI, 1>;
+ SSE_INTALU_ITINS_P, HasBWI, 1>;
defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", X86subus,
- SSE_INTALU_ITINS_P, HasBWI, 0>;
-defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmull", mul,
- SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
-defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmull", mul,
- SSE_INTALU_ITINS_P, HasBWI, 1>;
-defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmull", mul,
- SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD;
-defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulh", mulhs, SSE_INTALU_ITINS_P,
- HasBWI, 1>;
-defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhu", mulhu, SSE_INTMUL_ITINS_P,
+ SSE_INTALU_ITINS_P, HasBWI, 0>;
+defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul,
+ SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmullw", mul,
+ SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmullq", mul,
+ SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD;
+defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulhw", mulhs, SSE_INTALU_ITINS_P,
HasBWI, 1>;
-defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrs", X86mulhrs, SSE_INTMUL_ITINS_P,
- HasBWI, 1>, T8PD;
+defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SSE_INTMUL_ITINS_P,
+ HasBWI, 1>;
+defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs, SSE_INTMUL_ITINS_P,
+ HasBWI, 1>, T8PD;
defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg,
- SSE_INTALU_ITINS_P, HasBWI, 1>;
-
+ SSE_INTALU_ITINS_P, HasBWI, 1>;
+
multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, OpndItins itins,
SDNode OpNode, bit IsCommutable = 0> {
@@ -3159,7 +3483,7 @@ multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, OpndItins itins,
v4i32x_info, v2i64x_info, IsCommutable>,
EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_W;
}
-}
+}
defm VPMULDQ : avx512_binop_all<0x28, "vpmuldq", SSE_INTALU_ITINS_P,
X86pmuldq, 1>,T8PD;
@@ -3170,25 +3494,25 @@ multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _Src, X86VectorVTInfo _Dst> {
let mayLoad = 1 in {
defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
- (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2),
+ (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2),
OpcodeStr,
"${src2}"##_Src.BroadcastStr##", $src1",
"$src1, ${src2}"##_Src.BroadcastStr,
- (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
- (_Src.VT (X86VBroadcast
+ (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
+ (_Src.VT (X86VBroadcast
(_Src.ScalarLdFrag addr:$src2))))))>,
EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>;
}
}
-multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr,
- SDNode OpNode,X86VectorVTInfo _Src,
+multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode,X86VectorVTInfo _Src,
X86VectorVTInfo _Dst> {
- defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst),
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr,
- "$src2, $src1","$src1, $src2",
- (_Dst.VT (OpNode
- (_Src.VT _Src.RC:$src1),
+ "$src2, $src1","$src1, $src2",
+ (_Dst.VT (OpNode
+ (_Src.VT _Src.RC:$src1),
(_Src.VT _Src.RC:$src2)))>,
EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V;
let mayLoad = 1 in {
@@ -3229,102 +3553,59 @@ multiclass avx512_packs_all_i16_i8<bits<8> opc, string OpcodeStr,
v16i8x_info>, EVEX_V128;
}
}
+
+multiclass avx512_vpmadd<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, AVX512VLVectorVTInfo _Src,
+ AVX512VLVectorVTInfo _Dst> {
+ defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info512,
+ _Dst.info512>, EVEX_V512;
+ let Predicates = [HasVLX] in {
+ defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info256,
+ _Dst.info256>, EVEX_V256;
+ defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info128,
+ _Dst.info128>, EVEX_V128;
+ }
+}
+
let Predicates = [HasBWI] in {
defm VPACKSSDW : avx512_packs_all_i32_i16<0x6B, "vpackssdw", X86Packss>, PD;
defm VPACKUSDW : avx512_packs_all_i32_i16<0x2b, "vpackusdw", X86Packus>, T8PD;
defm VPACKSSWB : avx512_packs_all_i16_i8 <0x63, "vpacksswb", X86Packss>, AVX512BIBase, VEX_W;
defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512BIBase, VEX_W;
+
+ defm VPMADDUBSW : avx512_vpmadd<0x04, "vpmaddubsw", X86vpmaddubsw,
+ avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8PD;
+ defm VPMADDWD : avx512_vpmadd<0xF5, "vpmaddwd", X86vpmaddwd,
+ avx512vl_i16_info, avx512vl_i32_info>, AVX512BIBase;
}
-defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxs", smax,
+defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxsb", smax,
SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
-defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxs", smax,
+defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxsw", smax,
SSE_INTALU_ITINS_P, HasBWI, 1>;
defm VPMAXS : avx512_binop_rm_vl_dq<0x3D, 0x3D, "vpmaxs", smax,
SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
-defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxu", umax,
+defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxub", umax,
SSE_INTALU_ITINS_P, HasBWI, 1>;
-defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxu", umax,
+defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxuw", umax,
SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
defm VPMAXU : avx512_binop_rm_vl_dq<0x3F, 0x3F, "vpmaxu", umax,
SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
-defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpmins", smin,
+defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpminsb", smin,
SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
-defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpmins", smin,
+defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpminsw", smin,
SSE_INTALU_ITINS_P, HasBWI, 1>;
defm VPMINS : avx512_binop_rm_vl_dq<0x39, 0x39, "vpmins", smin,
SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
-defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminu", umin,
+defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminub", umin,
SSE_INTALU_ITINS_P, HasBWI, 1>;
-defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminu", umin,
+defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminuw", umin,
SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", umin,
SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
-
-//===----------------------------------------------------------------------===//
-// AVX-512 - Unpack Instructions
-//===----------------------------------------------------------------------===//
-
-multiclass avx512_unpack_fp<bits<8> opc, SDNode OpNode, ValueType vt,
- PatFrag mem_frag, RegisterClass RC,
- X86MemOperand x86memop, string asm,
- Domain d> {
- def rr : AVX512PI<opc, MRMSrcReg,
- (outs RC:$dst), (ins RC:$src1, RC:$src2),
- asm, [(set RC:$dst,
- (vt (OpNode RC:$src1, RC:$src2)))],
- d>, EVEX_4V;
- def rm : AVX512PI<opc, MRMSrcMem,
- (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
- asm, [(set RC:$dst,
- (vt (OpNode RC:$src1,
- (bitconvert (mem_frag addr:$src2)))))],
- d>, EVEX_4V;
-}
-
-defm VUNPCKHPSZ: avx512_unpack_fp<0x15, X86Unpckh, v16f32, loadv8f64,
- VR512, f512mem, "vunpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VUNPCKHPDZ: avx512_unpack_fp<0x15, X86Unpckh, v8f64, loadv8f64,
- VR512, f512mem, "vunpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VUNPCKLPSZ: avx512_unpack_fp<0x14, X86Unpckl, v16f32, loadv8f64,
- VR512, f512mem, "vunpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VUNPCKLPDZ: avx512_unpack_fp<0x14, X86Unpckl, v8f64, loadv8f64,
- VR512, f512mem, "vunpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-
-multiclass avx512_unpack_int<bits<8> opc, string OpcodeStr, SDNode OpNode,
- ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
- X86MemOperand x86memop> {
- def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, RC:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))],
- IIC_SSE_UNPCK>, EVEX_4V;
- def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, x86memop:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1),
- (bitconvert (memop_frag addr:$src2)))))],
- IIC_SSE_UNPCK>, EVEX_4V;
-}
-defm VPUNPCKLDQZ : avx512_unpack_int<0x62, "vpunpckldq", X86Unpckl, v16i32,
- VR512, loadv16i32, i512mem>, EVEX_V512,
- EVEX_CD8<32, CD8VF>;
-defm VPUNPCKLQDQZ : avx512_unpack_int<0x6C, "vpunpcklqdq", X86Unpckl, v8i64,
- VR512, loadv8i64, i512mem>, EVEX_V512,
- VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPUNPCKHDQZ : avx512_unpack_int<0x6A, "vpunpckhdq", X86Unpckh, v16i32,
- VR512, loadv16i32, i512mem>, EVEX_V512,
- EVEX_CD8<32, CD8VF>;
-defm VPUNPCKHQDQZ : avx512_unpack_int<0x6D, "vpunpckhqdq", X86Unpckh, v8i64,
- VR512, loadv8i64, i512mem>, EVEX_V512,
- VEX_W, EVEX_CD8<64, CD8VF>;
//===----------------------------------------------------------------------===//
// AVX-512 Logical Instructions
//===----------------------------------------------------------------------===//
@@ -3362,12 +3643,12 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
let isCodeGenOnly = 1, isCommutable = IsCommutable,
Predicates = [HasAVX512] in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
- (ins _.FRC:$src1, _.FRC:$src2),
+ (ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))],
itins.rr>;
def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
- (ins _.FRC:$src1, _.ScalarMemOp:$src2),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1,
(_.ScalarLdFrag addr:$src2)))], itins.rr>;
@@ -3375,7 +3656,7 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
}
multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
- SDNode VecNode, OpndItins itins, bit IsCommutable> {
+ SDNode VecNode, OpndItins itins, bit IsCommutable = 0> {
defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
@@ -3470,7 +3751,7 @@ multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
EVEX_4V, EVEX_B;
}
-multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
bit IsCommutable = 0> {
defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info,
IsCommutable>, EVEX_V512, PS,
@@ -3514,7 +3795,7 @@ defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, 1>,
avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd>;
defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, 1>,
avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd>;
-defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>,
+defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>,
avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd>;
defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv>,
avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd>;
@@ -3550,13 +3831,34 @@ multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
}//let mayLoad = 1
}
-multiclass avx512_fp_scalef_all<bits<8> opc, string OpcodeStr, SDNode OpNode> {
- defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v16f32_info>,
+multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>;
+ let mayLoad = 1 in {
+ defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>;
+ }//let mayLoad = 1
+}
+
+multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr, SDNode OpNode> {
+ defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v16f32_info>,
avx512_fp_round_packed<opc, OpcodeStr, OpNode, v16f32_info>,
EVEX_V512, EVEX_CD8<32, CD8VF>;
- defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f64_info>,
+ defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f64_info>,
avx512_fp_round_packed<opc, OpcodeStr, OpNode, v8f64_info>,
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+ defm SSZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNode, f32x_info>,
+ avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info, OpNode, SSE_ALU_ITINS_S.s>,
+ EVEX_4V,EVEX_CD8<32, CD8VT1>;
+ defm SDZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNode, f64x_info>,
+ avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info, OpNode, SSE_ALU_ITINS_S.d>,
+ EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v4f32x_info>,
@@ -3569,7 +3871,7 @@ multiclass avx512_fp_scalef_all<bits<8> opc, string OpcodeStr, SDNode OpNode> {
EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
}
}
-defm VSCALEF : avx512_fp_scalef_all<0x2C, "vscalef", X86scalef>, T8PD;
+defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", X86scalef>, T8PD;
//===----------------------------------------------------------------------===//
// AVX-512 VPTESTM instructions
@@ -3586,7 +3888,7 @@ multiclass avx512_vptest<bits<8> opc, string OpcodeStr, SDNode OpNode,
defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (OpNode (_.VT _.RC:$src1),
+ (OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2))))>,
EVEX_4V,
EVEX_CD8<_.EltSize, CD8VF>;
@@ -3748,12 +4050,12 @@ multiclass avx512_shift_rmi_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM,
VTInfo.info256>, EVEX_V256;
defm Z128: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
VTInfo.info128>,
- avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
+ avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
VTInfo.info128>, EVEX_V128;
}
}
-multiclass avx512_shift_rmi_w<bits<8> opcw,
+multiclass avx512_shift_rmi_w<bits<8> opcw,
Format ImmFormR, Format ImmFormM,
string OpcodeStr, SDNode OpNode> {
let Predicates = [HasBWI] in
@@ -3846,6 +4148,27 @@ multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr,
avx512vl_i64_info>, VEX_W;
}
+// Use the 512-bit version to implement the 128/256-bit forms when VLX is not available (NoVLX).
+multiclass avx512_var_shift_w_lowering<AVX512VLVectorVTInfo _, SDNode OpNode> {
+ let Predicates = [HasBWI, NoVLX] in {
+ def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1),
+ (_.info256.VT _.info256.RC:$src2))),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(NAME#"WZrr")
+ (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
+ (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
+ sub_ymm)>;
+
+ def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1),
+ (_.info128.VT _.info128.RC:$src2))),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(NAME#"WZrr")
+ (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
+ (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
+ sub_xmm)>;
+ }
+}
+
multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr,
SDNode OpNode> {
let Predicates = [HasBWI] in
@@ -3861,11 +4184,14 @@ multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr,
}
defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>,
- avx512_var_shift_w<0x12, "vpsllvw", shl>;
+ avx512_var_shift_w<0x12, "vpsllvw", shl>,
+ avx512_var_shift_w_lowering<avx512vl_i16_info, shl>;
defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>,
- avx512_var_shift_w<0x11, "vpsravw", sra>;
+ avx512_var_shift_w<0x11, "vpsravw", sra>,
+ avx512_var_shift_w_lowering<avx512vl_i16_info, sra>;
defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>,
- avx512_var_shift_w<0x10, "vpsrlvw", srl>;
+ avx512_var_shift_w<0x10, "vpsrlvw", srl>,
+ avx512_var_shift_w_lowering<avx512vl_i16_info, srl>;
defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr>;
defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl>;
@@ -3916,19 +4242,77 @@ defm VPERMQ : avx512_vpermi_dq_sizes<0x00, MRMSrcReg, MRMSrcMem, "vpermq",
defm VPERMPD : avx512_vpermi_dq_sizes<0x01, MRMSrcReg, MRMSrcMem, "vpermpd",
X86VPermi, avx512vl_f64_info>,
EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W;
+//===----------------------------------------------------------------------===//
+// AVX-512 - VPERMIL
+//===----------------------------------------------------------------------===//
+multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, X86VectorVTInfo Ctrl> {
+ defm rr: AVX512_maskable<OpcVar, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, Ctrl.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1,
+ (Ctrl.VT Ctrl.RC:$src2)))>,
+ T8PD, EVEX_4V;
+ let mayLoad = 1 in {
+ defm rm: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, Ctrl.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode
+ _.RC:$src1,
+ (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2)))))>,
+ T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+ defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ "${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr,
+ (_.VT (OpNode
+ _.RC:$src1,
+ (Ctrl.VT (X86VBroadcast
+ (Ctrl.ScalarLdFrag addr:$src2)))))>,
+ T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+ }//let mayLoad = 1
+}
+
+multiclass avx512_permil_vec_common<string OpcodeStr, bits<8> OpcVar,
+ AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info512,
+ Ctrl.info512>, EVEX_V512;
+ }
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z128 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info128,
+ Ctrl.info128>, EVEX_V128;
+ defm Z256 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info256,
+ Ctrl.info256>, EVEX_V256;
+ }
+}
+
+multiclass avx512_permil<string OpcodeStr, bits<8> OpcImm, bits<8> OpcVar,
+ AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{
+
+ defm NAME: avx512_permil_vec_common<OpcodeStr, OpcVar, _, Ctrl>;
+ defm NAME: avx512_shift_rmi_sizes<OpcImm, MRMSrcReg, MRMSrcMem, OpcodeStr,
+ X86VPermilpi, _>,
+ EVEX, AVX512AIi8Base, EVEX_CD8<_.info128.EltSize, CD8VF>;
+}
+
+defm VPERMILPS : avx512_permil<"vpermilps", 0x04, 0x0C, avx512vl_f32_info,
+ avx512vl_i32_info>;
+defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D, avx512vl_f64_info,
+ avx512vl_i64_info>, VEX_W;
//===----------------------------------------------------------------------===//
// AVX-512 - VPSHUFD, VPSHUFLW, VPSHUFHW
//===----------------------------------------------------------------------===//
defm VPSHUFD : avx512_shift_rmi_sizes<0x70, MRMSrcReg, MRMSrcMem, "vpshufd",
- X86PShufd, avx512vl_i32_info>,
+ X86PShufd, avx512vl_i32_info>,
EVEX, AVX512BIi8Base, EVEX_CD8<32, CD8VF>;
defm VPSHUFH : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshufhw",
- X86PShufhw>, EVEX, AVX512XSIi8Base, VEX_W;
+ X86PShufhw>, EVEX, AVX512XSIi8Base;
defm VPSHUFL : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshuflw",
- X86PShuflw>, EVEX, AVX512XDIi8Base, VEX_W;
-
+ X86PShuflw>, EVEX, AVX512XDIi8Base;
+
multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode> {
let Predicates = [HasBWI] in
defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, v64i8_info>, EVEX_V512;
@@ -3942,55 +4326,6 @@ multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode> {
defm VPSHUFB: avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb>;
//===----------------------------------------------------------------------===//
-// AVX-512 - MOVDDUP
-//===----------------------------------------------------------------------===//
-
-multiclass avx512_movddup<string OpcodeStr, RegisterClass RC, ValueType VT,
- X86MemOperand x86memop, PatFrag memop_frag> {
-def rr : AVX512PDI<0x12, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (VT (X86Movddup RC:$src)))]>, EVEX;
-def rm : AVX512PDI<0x12, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst,
- (VT (X86Movddup (memop_frag addr:$src))))]>, EVEX;
-}
-
-defm VMOVDDUPZ : avx512_movddup<"vmovddup", VR512, v8f64, f512mem, loadv8f64>,
- VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
-def : Pat<(X86Movddup (v8f64 (scalar_to_vector (loadf64 addr:$src)))),
- (VMOVDDUPZrm addr:$src)>;
-
-//===---------------------------------------------------------------------===//
-// Replicate Single FP - MOVSHDUP and MOVSLDUP
-//===---------------------------------------------------------------------===//
-multiclass avx512_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
- ValueType vt, RegisterClass RC, PatFrag mem_frag,
- X86MemOperand x86memop> {
- def rr : AVX512XSI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (vt (OpNode RC:$src)))]>, EVEX;
- let mayLoad = 1 in
- def rm : AVX512XSI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>, EVEX;
-}
-
-defm VMOVSHDUPZ : avx512_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
- v16f32, VR512, loadv16f32, f512mem>, EVEX_V512,
- EVEX_CD8<32, CD8VF>;
-defm VMOVSLDUPZ : avx512_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
- v16f32, VR512, loadv16f32, f512mem>, EVEX_V512,
- EVEX_CD8<32, CD8VF>;
-
-def : Pat<(v16i32 (X86Movshdup VR512:$src)), (VMOVSHDUPZrr VR512:$src)>;
-def : Pat<(v16i32 (X86Movshdup (loadv16i32 addr:$src))),
- (VMOVSHDUPZrm addr:$src)>;
-def : Pat<(v16i32 (X86Movsldup VR512:$src)), (VMOVSLDUPZrr VR512:$src)>;
-def : Pat<(v16i32 (X86Movsldup (loadv16i32 addr:$src))),
- (VMOVSLDUPZrm addr:$src)>;
-
-//===----------------------------------------------------------------------===//
// Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//
def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst),
@@ -4017,6 +4352,115 @@ let Predicates = [HasAVX512] in {
}
//===----------------------------------------------------------------------===//
+// VMOVHPS/VMOVHPD and VMOVLPS/VMOVLPD Instructions
+// All patterns were taken from the SSE implementation.
+//===----------------------------------------------------------------------===//
+multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ let mayLoad = 1 in
+ def rm : AVX512<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.RC:$src1, f64mem:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.RC:$dst,
+ (OpNode _.RC:$src1,
+ (_.VT (bitconvert
+ (v2f64 (scalar_to_vector (loadf64 addr:$src2)))))))],
+ IIC_SSE_MOV_LH>, EVEX_4V;
+}
+
+defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", X86Movlhps,
+ v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS;
+defm VMOVHPDZ128 : avx512_mov_hilo_packed<0x16, "vmovhpd", X86Movlhpd,
+ v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W;
+defm VMOVLPSZ128 : avx512_mov_hilo_packed<0x12, "vmovlps", X86Movlps,
+ v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS;
+defm VMOVLPDZ128 : avx512_mov_hilo_packed<0x12, "vmovlpd", X86Movlpd,
+ v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W;
+
+let Predicates = [HasAVX512] in {
+ // VMOVHPS patterns
+ def : Pat<(X86Movlhps VR128X:$src1,
+ (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
+ (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>;
+ def : Pat<(X86Movlhps VR128X:$src1,
+ (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+ (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>;
+ // VMOVHPD patterns
+ def : Pat<(v2f64 (X86Unpckl VR128X:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)))),
+ (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
+ def : Pat<(v2f64 (X86Unpckl VR128X:$src1,
+ (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
+ (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
+ // VMOVLPS patterns
+ def : Pat<(v4f32 (X86Movlps VR128X:$src1, (load addr:$src2))),
+ (VMOVLPSZ128rm VR128X:$src1, addr:$src2)>;
+ def : Pat<(v4i32 (X86Movlps VR128X:$src1, (load addr:$src2))),
+ (VMOVLPSZ128rm VR128X:$src1, addr:$src2)>;
+ // VMOVLPD patterns
+ def : Pat<(v2f64 (X86Movlpd VR128X:$src1, (load addr:$src2))),
+ (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
+ def : Pat<(v2i64 (X86Movlpd VR128X:$src1, (load addr:$src2))),
+ (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
+ def : Pat<(v2f64 (X86Movsd VR128X:$src1,
+ (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+ (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
+}
+
+let mayStore = 1 in {
+def VMOVHPSZ128mr : AVX512PSI<0x17, MRMDestMem, (outs),
+ (ins f64mem:$dst, VR128X:$src),
+ "vmovhps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract
+ (X86Unpckh (bc_v2f64 (v4f32 VR128X:$src)),
+ (bc_v2f64 (v4f32 VR128X:$src))),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>,
+ EVEX, EVEX_CD8<32, CD8VT2>;
+def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs),
+ (ins f64mem:$dst, VR128X:$src),
+ "vmovhpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract
+ (v2f64 (X86Unpckh VR128X:$src, VR128X:$src)),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>,
+ EVEX, EVEX_CD8<64, CD8VT1>, VEX_W;
+def VMOVLPSZ128mr : AVX512PSI<0x13, MRMDestMem, (outs),
+ (ins f64mem:$dst, VR128X:$src),
+ "vmovlps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128X:$src)),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOV_LH>,
+ EVEX, EVEX_CD8<32, CD8VT2>;
+def VMOVLPDZ128mr : AVX512PDI<0x13, MRMDestMem, (outs),
+ (ins f64mem:$dst, VR128X:$src),
+ "vmovlpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract (v2f64 VR128X:$src),
+ (iPTR 0))), addr:$dst)],
+ IIC_SSE_MOV_LH>,
+ EVEX, EVEX_CD8<64, CD8VT1>, VEX_W;
+}
+let Predicates = [HasAVX512] in {
+ // VMOVHPD patterns
+ def : Pat<(store (f64 (vector_extract
+ (v2f64 (X86VPermilpi VR128X:$src, (i8 1))),
+ (iPTR 0))), addr:$dst),
+ (VMOVHPDZ128mr addr:$dst, VR128X:$src)>;
+ // VMOVLPS patterns
+ def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128X:$src2)),
+ addr:$src1),
+ (VMOVLPSZ128mr addr:$src1, VR128X:$src2)>;
+ def : Pat<(store (v4i32 (X86Movlps
+ (bc_v4i32 (loadv2i64 addr:$src1)), VR128X:$src2)), addr:$src1),
+ (VMOVLPSZ128mr addr:$src1, VR128X:$src2)>;
+ // VMOVLPD patterns
+ def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128X:$src2)),
+ addr:$src1),
+ (VMOVLPDZ128mr addr:$src1, VR128X:$src2)>;
+ def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128X:$src2)),
+ addr:$src1),
+ (VMOVLPDZ128mr addr:$src1, VR128X:$src2)>;
+}
+//===----------------------------------------------------------------------===//
// FMA - Fused Multiply Operations
//
@@ -4034,7 +4478,7 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.LdFrag addr:$src3)))>,
- AVX512FMA3Base;
+ AVX512FMA3Base;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
@@ -4435,50 +4879,55 @@ def : Pat<(f64 (uint_to_fp GR64:$src)),
//===----------------------------------------------------------------------===//
// AVX-512 Scalar convert from float/double to integer
//===----------------------------------------------------------------------===//
-multiclass avx512_cvt_s_int<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
- Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
- string asm> {
-let hasSideEffects = 0 in {
- def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
- !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG,
- Requires<[HasAVX512]>;
- let mayLoad = 1 in
- def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
- !strconcat(asm,"\t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG,
- Requires<[HasAVX512]>;
-} // hasSideEffects = 0
+multiclass avx512_cvt_s_int_round<bits<8> opc, RegisterClass SrcRC,
+ RegisterClass DstRC, Intrinsic Int,
+ Operand memop, ComplexPattern mem_cpat, string asm> {
+ let hasSideEffects = 0, Predicates = [HasAVX512] in {
+ def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG;
+ def rb : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc),
+ !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), []>,
+ EVEX, VEX_LIG, EVEX_B, EVEX_RC;
+ let mayLoad = 1 in
+ def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG;
+ } // hasSideEffects = 0, Predicates = [HasAVX512]
}
-let Predicates = [HasAVX512] in {
+
// Convert float/double to signed/unsigned int 32/64
-defm VCVTSS2SIZ: avx512_cvt_s_int<0x2D, VR128X, GR32, int_x86_sse_cvtss2si,
+defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, VR128X, GR32, int_x86_sse_cvtss2si,
ssmem, sse_load_f32, "cvtss2si">,
XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTSS2SI64Z: avx512_cvt_s_int<0x2D, VR128X, GR64, int_x86_sse_cvtss2si64,
+defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64,
+ int_x86_sse_cvtss2si64,
ssmem, sse_load_f32, "cvtss2si">,
XS, VEX_W, EVEX_CD8<32, CD8VT1>;
-defm VCVTSS2USIZ: avx512_cvt_s_int<0x79, VR128X, GR32, int_x86_avx512_cvtss2usi,
+defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32,
+ int_x86_avx512_cvtss2usi,
ssmem, sse_load_f32, "cvtss2usi">,
XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTSS2USI64Z: avx512_cvt_s_int<0x79, VR128X, GR64,
+defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, VR128X, GR64,
int_x86_avx512_cvtss2usi64, ssmem,
sse_load_f32, "cvtss2usi">, XS, VEX_W,
EVEX_CD8<32, CD8VT1>;
-defm VCVTSD2SIZ: avx512_cvt_s_int<0x2D, VR128X, GR32, int_x86_sse2_cvtsd2si,
+defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, VR128X, GR32, int_x86_sse2_cvtsd2si,
sdmem, sse_load_f64, "cvtsd2si">,
XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTSD2SI64Z: avx512_cvt_s_int<0x2D, VR128X, GR64, int_x86_sse2_cvtsd2si64,
+defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64,
+ int_x86_sse2_cvtsd2si64,
sdmem, sse_load_f64, "cvtsd2si">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VCVTSD2USIZ: avx512_cvt_s_int<0x79, VR128X, GR32, int_x86_avx512_cvtsd2usi,
+defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32,
+ int_x86_avx512_cvtsd2usi,
sdmem, sse_load_f64, "cvtsd2usi">,
XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTSD2USI64Z: avx512_cvt_s_int<0x79, VR128X, GR64,
+defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, VR128X, GR64,
int_x86_avx512_cvtsd2usi64, sdmem,
sse_load_f64, "cvtsd2usi">, XD, VEX_W,
EVEX_CD8<64, CD8VT1>;
-let isCodeGenOnly = 1 in {
+let isCodeGenOnly = 1 , Predicates = [HasAVX512] in {
defm Int_VCVTSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
SSE_CVT_Scalar, 0>, XS, EVEX_4V;
@@ -4495,121 +4944,170 @@ let isCodeGenOnly = 1 in {
defm Int_VCVTUSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
int_x86_avx512_cvtusi2sd, i32mem, loadi32, "cvtusi2sd{l}",
SSE_CVT_Scalar, 0>, XD, EVEX_4V;
-} // isCodeGenOnly = 1
+} // isCodeGenOnly = 1, Predicates = [HasAVX512]
// Convert float/double to signed/unsigned int 32/64 with truncation
-let isCodeGenOnly = 1 in {
- defm Int_VCVTTSS2SIZ : avx512_cvt_s_int<0x2C, VR128X, GR32, int_x86_sse_cvttss2si,
- ssmem, sse_load_f32, "cvttss2si">,
- XS, EVEX_CD8<32, CD8VT1>;
- defm Int_VCVTTSS2SI64Z : avx512_cvt_s_int<0x2C, VR128X, GR64,
- int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
- "cvttss2si">, XS, VEX_W,
- EVEX_CD8<32, CD8VT1>;
- defm Int_VCVTTSD2SIZ : avx512_cvt_s_int<0x2C, VR128X, GR32, int_x86_sse2_cvttsd2si,
- sdmem, sse_load_f64, "cvttsd2si">, XD,
- EVEX_CD8<64, CD8VT1>;
- defm Int_VCVTTSD2SI64Z : avx512_cvt_s_int<0x2C, VR128X, GR64,
- int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
- "cvttsd2si">, XD, VEX_W,
- EVEX_CD8<64, CD8VT1>;
- defm Int_VCVTTSS2USIZ : avx512_cvt_s_int<0x78, VR128X, GR32,
- int_x86_avx512_cvttss2usi, ssmem, sse_load_f32,
- "cvttss2usi">, XS, EVEX_CD8<32, CD8VT1>;
- defm Int_VCVTTSS2USI64Z : avx512_cvt_s_int<0x78, VR128X, GR64,
- int_x86_avx512_cvttss2usi64, ssmem,
- sse_load_f32, "cvttss2usi">, XS, VEX_W,
- EVEX_CD8<32, CD8VT1>;
- defm Int_VCVTTSD2USIZ : avx512_cvt_s_int<0x78, VR128X, GR32,
- int_x86_avx512_cvttsd2usi,
- sdmem, sse_load_f64, "cvttsd2usi">, XD,
- EVEX_CD8<64, CD8VT1>;
- defm Int_VCVTTSD2USI64Z : avx512_cvt_s_int<0x78, VR128X, GR64,
- int_x86_avx512_cvttsd2usi64, sdmem,
- sse_load_f64, "cvttsd2usi">, XD, VEX_W,
- EVEX_CD8<64, CD8VT1>;
-} // isCodeGenOnly = 1
-
-multiclass avx512_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
- SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
- string asm> {
- def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC,
+ X86VectorVTInfo _DstRC, SDNode OpNode,
+ SDNode OpNodeRnd>{
+let Predicates = [HasAVX512] in {
+ def rr : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [(set DstRC:$dst, (OpNode SrcRC:$src))]>, EVEX;
- def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
+ [(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))]>, EVEX;
+ def rb : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
+ !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
+ []>, EVEX, EVEX_B;
+ def rm : SI<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.MemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>, EVEX;
-}
-
-defm VCVTTSS2SIZ : avx512_cvt_s<0x2C, FR32X, GR32, fp_to_sint, f32mem,
- loadf32, "cvttss2si">, XS,
- EVEX_CD8<32, CD8VT1>;
-defm VCVTTSS2USIZ : avx512_cvt_s<0x78, FR32X, GR32, fp_to_uint, f32mem,
- loadf32, "cvttss2usi">, XS,
- EVEX_CD8<32, CD8VT1>;
-defm VCVTTSS2SI64Z : avx512_cvt_s<0x2C, FR32X, GR64, fp_to_sint, f32mem,
- loadf32, "cvttss2si">, XS, VEX_W,
- EVEX_CD8<32, CD8VT1>;
-defm VCVTTSS2USI64Z : avx512_cvt_s<0x78, FR32X, GR64, fp_to_uint, f32mem,
- loadf32, "cvttss2usi">, XS, VEX_W,
- EVEX_CD8<32, CD8VT1>;
-defm VCVTTSD2SIZ : avx512_cvt_s<0x2C, FR64X, GR32, fp_to_sint, f64mem,
- loadf64, "cvttsd2si">, XD,
- EVEX_CD8<64, CD8VT1>;
-defm VCVTTSD2USIZ : avx512_cvt_s<0x78, FR64X, GR32, fp_to_uint, f64mem,
- loadf64, "cvttsd2usi">, XD,
- EVEX_CD8<64, CD8VT1>;
-defm VCVTTSD2SI64Z : avx512_cvt_s<0x2C, FR64X, GR64, fp_to_sint, f64mem,
- loadf64, "cvttsd2si">, XD, VEX_W,
- EVEX_CD8<64, CD8VT1>;
-defm VCVTTSD2USI64Z : avx512_cvt_s<0x78, FR64X, GR64, fp_to_uint, f64mem,
- loadf64, "cvttsd2usi">, XD, VEX_W,
- EVEX_CD8<64, CD8VT1>;
+ [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>,
+ EVEX;
+
+ let isCodeGenOnly = 1,hasSideEffects = 0 in {
+ def rr_Int : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set _DstRC.RC:$dst, (OpNodeRnd _SrcRC.RC:$src,
+ (i32 FROUND_CURRENT)))]>, EVEX, VEX_LIG;
+ def rb_Int : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
+ !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
+ [(set _DstRC.RC:$dst, (OpNodeRnd _SrcRC.RC:$src,
+ (i32 FROUND_NO_EXC)))]>,
+ EVEX,VEX_LIG , EVEX_B;
+ let mayLoad = 1 in
+ def rm_Int : SI<opc, MRMSrcMem, (outs _DstRC.RC:$dst),
+ (ins _SrcRC.MemOp:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ []>, EVEX, VEX_LIG;
+
+ } // isCodeGenOnly = 1, hasSideEffects = 0
+} //HasAVX512
+}
+
+
+defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i32x_info,
+ fp_to_sint,X86cvttss2IntRnd>,
+ XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i64x_info,
+ fp_to_sint,X86cvttss2IntRnd>,
+ VEX_W, XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i32x_info,
+ fp_to_sint,X86cvttsd2IntRnd>,
+ XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i64x_info,
+ fp_to_sint,X86cvttsd2IntRnd>,
+ VEX_W, XD, EVEX_CD8<64, CD8VT1>;
+
+defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i32x_info,
+ fp_to_uint,X86cvttss2UIntRnd>,
+ XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i64x_info,
+ fp_to_uint,X86cvttss2UIntRnd>,
+ XS,VEX_W, EVEX_CD8<32, CD8VT1>;
+defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i32x_info,
+ fp_to_uint,X86cvttsd2UIntRnd>,
+ XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i64x_info,
+ fp_to_uint,X86cvttsd2UIntRnd>,
+ XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+let Predicates = [HasAVX512] in {
+ def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))),
+ (VCVTTSS2SIZrr_Int (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+ def : Pat<(i64 (int_x86_sse_cvttss2si64 (v4f32 VR128X:$src))),
+ (VCVTTSS2SI64Zrr_Int (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+ def : Pat<(i32 (int_x86_sse2_cvttsd2si (v2f64 VR128X:$src))),
+ (VCVTTSD2SIZrr_Int (COPY_TO_REGCLASS VR128X:$src, FR64X))>;
+ def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (v2f64 VR128X:$src))),
+ (VCVTTSD2SI64Zrr_Int (COPY_TO_REGCLASS VR128X:$src, FR64X))>;
+
} // HasAVX512
//===----------------------------------------------------------------------===//
// AVX-512 Convert from float to double and back
//===----------------------------------------------------------------------===//
-let hasSideEffects = 0 in {
-def VCVTSS2SDZrr : AVX512XSI<0x5A, MRMSrcReg, (outs FR64X:$dst),
- (ins FR32X:$src1, FR32X:$src2),
- "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
-let mayLoad = 1 in
-def VCVTSS2SDZrm : AVX512XSI<0x5A, MRMSrcMem, (outs FR64X:$dst),
- (ins FR32X:$src1, f32mem:$src2),
- "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>,
- EVEX_CD8<32, CD8VT1>;
-
-// Convert scalar double to scalar single
-def VCVTSD2SSZrr : AVX512XDI<0x5A, MRMSrcReg, (outs FR32X:$dst),
- (ins FR64X:$src1, FR64X:$src2),
- "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, EVEX_4V, VEX_LIG, VEX_W, Sched<[WriteCvtF2F]>;
-let mayLoad = 1 in
-def VCVTSD2SSZrm : AVX512XDI<0x5A, MRMSrcMem, (outs FR32X:$dst),
- (ins FR64X:$src1, f64mem:$src2),
- "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, EVEX_4V, VEX_LIG, VEX_W,
- Sched<[WriteCvtF2FLd, ReadAfterLd]>, EVEX_CD8<64, CD8VT1>;
-}
-
-def : Pat<(f64 (fextend FR32X:$src)), (VCVTSS2SDZrr FR32X:$src, FR32X:$src)>,
- Requires<[HasAVX512]>;
-def : Pat<(fextend (loadf32 addr:$src)),
- (VCVTSS2SDZrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512]>;
-
-def : Pat<(extloadf32 addr:$src),
- (VCVTSS2SDZrm (f32 (IMPLICIT_DEF)), addr:$src)>,
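+// Scalar fp-to-fp convert (ss<->sd): masked register/register and register/memory
+// forms; the SAE and rounding-control variants are defined in the multiclasses below.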
+multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNode> {
+ defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode (_Src.VT _Src.RC:$src1),
+ (_Src.VT _Src.RC:$src2)))>,
+ EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
+ defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode (_Src.VT _Src.RC:$src1),
+ (_Src.VT (scalar_to_vector
+ (_Src.ScalarLdFrag addr:$src2)))))>,
+ EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+
+// Scalar Conversion with SAE - suppress all exceptions
+multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNodeRnd> {
+ defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr,
+ "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+ (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src1),
+ (_Src.VT _Src.RC:$src2),
+ (i32 FROUND_NO_EXC)))>,
+ EVEX_4V, VEX_LIG, EVEX_B;
+}
+
+// Scalar Conversion with rounding control (RC)
+multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNodeRnd> {
+ defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _Src.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr,
+ "$rc, $src2, $src1", "$src1, $src2, $rc",
+ (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src1),
+ (_Src.VT _Src.RC:$src2), (i32 imm:$rc)))>,
+ EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>,
+ EVEX_B, EVEX_RC;
+}
+multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd, X86VectorVTInfo _src,
+ X86VectorVTInfo _dst> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode>,
+ avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src,
+ OpNodeRnd>, VEX_W, EVEX_CD8<64, CD8VT1>,
+ EVEX_V512, XD;
+ }
+}
+
+multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd, X86VectorVTInfo _src,
+ X86VectorVTInfo _dst> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode>,
+ avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>,
+ EVEX_CD8<32, CD8VT1>, XS, EVEX_V512;
+ }
+}
+defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", X86fround,
+ X86froundRnd, f64x_info, f32x_info>;
+defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpext,
+ X86fpextRnd,f32x_info, f64x_info >;
+
+def : Pat<(f64 (fextend FR32X:$src)),
+ (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X),
+ (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>,
+ Requires<[HasAVX512]>;
+def : Pat<(f64 (fextend (loadf32 addr:$src))),
+ (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
+ Requires<[HasAVX512]>;
+
+def : Pat<(f64 (extloadf32 addr:$src)),
+ (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
Requires<[HasAVX512, OptForSize]>;
-def : Pat<(extloadf32 addr:$src),
- (VCVTSS2SDZrr (f32 (IMPLICIT_DEF)), (VMOVSSZrm addr:$src))>,
- Requires<[HasAVX512, OptForSpeed]>;
+def : Pat<(f64 (extloadf32 addr:$src)),
+ (COPY_TO_REGCLASS (VCVTSS2SDZrr (v4f32 (IMPLICIT_DEF)),
+ (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)), VR128X)>,
+ Requires<[HasAVX512, OptForSpeed]>;
-def : Pat<(f32 (fround FR64X:$src)), (VCVTSD2SSZrr FR64X:$src, FR64X:$src)>,
+def : Pat<(f32 (fround FR64X:$src)),
+ (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X),
+ (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>,
Requires<[HasAVX512]>;
-
//===----------------------------------------------------------------------===//
// AVX-512 Vector convert from signed/unsigned integer to float/double
// and from float/double to signed/unsigned integer
@@ -4992,7 +5490,7 @@ defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp,
defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp,
X86VUlongToFpRnd>, VEX_W, XD, EVEX_CD8<64, CD8VF>;
-let Predicates = [NoVLX] in {
+let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))),
(EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
(v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
@@ -5024,40 +5522,102 @@ let Predicates = [HasAVX512] in {
//===----------------------------------------------------------------------===//
// Half precision conversion instructions
//===----------------------------------------------------------------------===//
-multiclass avx512_cvtph2ps<RegisterClass destRC, RegisterClass srcRC,
- X86MemOperand x86memop> {
- def rr : AVX5128I<0x13, MRMSrcReg, (outs destRC:$dst), (ins srcRC:$src),
- "vcvtph2ps\t{$src, $dst|$dst, $src}",
- []>, EVEX;
- let hasSideEffects = 0, mayLoad = 1 in
- def rm : AVX5128I<0x13, MRMSrcMem, (outs destRC:$dst), (ins x86memop:$src),
- "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, EVEX;
-}
-
-multiclass avx512_cvtps2ph<RegisterClass destRC, RegisterClass srcRC,
- X86MemOperand x86memop> {
- def rr : AVX512AIi8<0x1D, MRMDestReg, (outs destRC:$dst),
- (ins srcRC:$src1, i32u8imm:$src2),
- "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, EVEX;
- let hasSideEffects = 0, mayStore = 1 in
- def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
- (ins x86memop:$dst, srcRC:$src1, i32u8imm:$src2),
- "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX;
+multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
+ X86MemOperand x86memop, PatFrag ld_frag> {
+ defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src),
+ "vcvtph2ps", "$src", "$src",
+ (X86cvtph2ps (_src.VT _src.RC:$src),
+ (i32 FROUND_CURRENT))>, T8PD;
+ let hasSideEffects = 0, mayLoad = 1 in {
+ defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src),
+ "vcvtph2ps", "$src", "$src",
+ (X86cvtph2ps (_src.VT (bitconvert (ld_frag addr:$src))),
+ (i32 FROUND_CURRENT))>, T8PD;
+ }
}
-defm VCVTPH2PSZ : avx512_cvtph2ps<VR512, VR256X, f256mem>, EVEX_V512,
- EVEX_CD8<32, CD8VH>;
-defm VCVTPS2PHZ : avx512_cvtps2ph<VR256X, VR512, f256mem>, EVEX_V512,
- EVEX_CD8<32, CD8VH>;
+multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
+ defm rb : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src),
+ "vcvtph2ps", "{sae}, $src", "$src, {sae}",
+ (X86cvtph2ps (_src.VT _src.RC:$src),
+ (i32 FROUND_NO_EXC))>, T8PD, EVEX_B;
-def : Pat<(v16i16 (int_x86_avx512_mask_vcvtps2ph_512 (v16f32 VR512:$src),
- imm:$rc, (bc_v16i16(v8i32 immAllZerosV)), (i16 -1))),
- (VCVTPS2PHZrr VR512:$src, imm:$rc)>;
+}
-def : Pat<(v16f32 (int_x86_avx512_mask_vcvtph2ps_512 (v16i16 VR256X:$src),
- (bc_v16f32(v16i32 immAllZerosV)), (i16 -1), (i32 FROUND_CURRENT))),
- (VCVTPH2PSZrr VR256X:$src)>;
+let Predicates = [HasAVX512] in {
+ defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64>,
+ avx512_cvtph2ps_sae<v16f32_info, v16i16x_info>,
+ EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
+ let Predicates = [HasVLX] in {
+ defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
+ loadv2i64>,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
+ defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem,
+ loadv2i64>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
+ }
+}
+
+multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
+ X86MemOperand x86memop> {
+ defm rr : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst),
+ (ins _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph", "$src2, $src1", "$src1, $src2",
+ (X86cvtps2ph (_src.VT _src.RC:$src1),
+ (i32 imm:$src2),
+ (i32 FROUND_CURRENT))>, AVX512AIi8Base;
+ let hasSideEffects = 0, mayStore = 1 in {
+ def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
+ (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(store (_dest.VT (X86cvtps2ph (_src.VT _src.RC:$src1),
+ (i32 imm:$src2), (i32 FROUND_CURRENT) )),
+ addr:$dst)]>;
+ def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs),
+ (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+ []>, EVEX_K;
+ }
+}
+multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
+ defm rb : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst),
+ (ins _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph", "$src2, {sae}, $src1", "$src1, $src2, {sae}",
+ (X86cvtps2ph (_src.VT _src.RC:$src1),
+ (i32 imm:$src2),
+ (i32 FROUND_NO_EXC))>, EVEX_B, AVX512AIi8Base;
+}
+let Predicates = [HasAVX512] in {
+ defm VCVTPS2PHZ : avx512_cvtps2ph<v16i16x_info, v16f32_info, f256mem>,
+ avx512_cvtps2ph_sae<v16i16x_info, v16f32_info>,
+ EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
+ let Predicates = [HasVLX] in {
+ defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem>,
+ EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
+ defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f128mem>,
+ EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
+ }
+}
+
+// Unordered/Ordered scalar fp compare with SAE and set EFLAGS
+multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _, SDNode OpNode,
+ string OpcodeStr> {
+ def rb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2),
+ !strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"),
+ [(set EFLAGS, (OpNode (_.VT _.RC:$src1), _.RC:$src2,
+ (i32 FROUND_NO_EXC)))],
+ IIC_SSE_COMIS_RR>, EVEX, EVEX_B, VEX_LIG, EVEX_V128,
+ Sched<[WriteFAdd]>;
+}
+
+let Defs = [EFLAGS], Predicates = [HasAVX512] in {
+ defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, X86ucomiSae, "vucomiss">,
+ AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
+ defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, X86ucomiSae, "vucomisd">,
+ AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
+ defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, X86comiSae, "vcomiss">,
+ AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
+ defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, X86comiSae, "vcomisd">,
+ AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
+}
let Defs = [EFLAGS], Predicates = [HasAVX512] in {
defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32,
@@ -5067,10 +5627,10 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in {
"ucomisd">, PD, EVEX,
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
let Pattern = []<dag> in {
- defm VCOMISSZ : sse12_ord_cmp<0x2F, VR128X, undef, v4f32, f128mem, load,
+ defm VCOMISSZ : sse12_ord_cmp<0x2F, FR32X, undef, f32, f32mem, loadf32,
"comiss">, PS, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
- defm VCOMISDZ : sse12_ord_cmp<0x2F, VR128X, undef, v2f64, f128mem, load,
+ defm VCOMISDZ : sse12_ord_cmp<0x2F, FR64X, undef, f64, f64mem, loadf64,
"comisd">, PD, EVEX,
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
}
@@ -5092,50 +5652,31 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in {
}
/// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd
-multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
- X86MemOperand x86memop> {
- let hasSideEffects = 0 in {
- def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, RC:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V;
+multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ let hasSideEffects = 0, AddedComplexity = 20 , Predicates = [HasAVX512] in {
+ defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, EVEX_4V;
let mayLoad = 1 in {
- def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, x86memop:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V;
+ defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))))>, EVEX_4V;
}
}
}
-defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", FR32X, f32mem>,
- EVEX_CD8<32, CD8VT1>;
-defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", FR64X, f64mem>,
- VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", FR32X, f32mem>,
- EVEX_CD8<32, CD8VT1>;
-defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", FR64X, f64mem>,
- VEX_W, EVEX_CD8<64, CD8VT1>;
-
-def : Pat <(v4f32 (int_x86_avx512_rcp14_ss (v4f32 VR128X:$src1),
- (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1))),
- (COPY_TO_REGCLASS (VRCP14SSrr (COPY_TO_REGCLASS VR128X:$src1, FR32X),
- (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>;
-
-def : Pat <(v2f64 (int_x86_avx512_rcp14_sd (v2f64 VR128X:$src1),
- (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1))),
- (COPY_TO_REGCLASS (VRCP14SDrr (COPY_TO_REGCLASS VR128X:$src1, FR64X),
- (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>;
-
-def : Pat <(v4f32 (int_x86_avx512_rsqrt14_ss (v4f32 VR128X:$src1),
- (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1))),
- (COPY_TO_REGCLASS (VRSQRT14SSrr (COPY_TO_REGCLASS VR128X:$src1, FR32X),
- (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>;
-
-def : Pat <(v2f64 (int_x86_avx512_rsqrt14_sd (v2f64 VR128X:$src1),
- (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1))),
- (COPY_TO_REGCLASS (VRSQRT14SDrr (COPY_TO_REGCLASS VR128X:$src1, FR64X),
- (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>;
+defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", X86frcp14s, f32x_info>,
+ EVEX_CD8<32, CD8VT1>, T8PD;
+defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", X86frcp14s, f64x_info>,
+ VEX_W, EVEX_CD8<64, CD8VT1>, T8PD;
+defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", X86frsqrt14s, f32x_info>,
+ EVEX_CD8<32, CD8VT1>, T8PD;
+defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86frsqrt14s, f64x_info>,
+ VEX_W, EVEX_CD8<64, CD8VT1>, T8PD;
/// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd
multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -5183,20 +5724,6 @@ multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> {
defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86frsqrt>;
defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>;
-def : Pat <(v16f32 (int_x86_avx512_rsqrt14_ps_512 (v16f32 VR512:$src),
- (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))),
- (VRSQRT14PSZr VR512:$src)>;
-def : Pat <(v8f64 (int_x86_avx512_rsqrt14_pd_512 (v8f64 VR512:$src),
- (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))),
- (VRSQRT14PDZr VR512:$src)>;
-
-def : Pat <(v16f32 (int_x86_avx512_rcp14_ps_512 (v16f32 VR512:$src),
- (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))),
- (VRCP14PSZr VR512:$src)>;
-def : Pat <(v8f64 (int_x86_avx512_rcp14_pd_512 (v8f64 VR512:$src),
- (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))),
- (VRCP14PDZr VR512:$src)>;
-
/// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
SDNode OpNode> {
@@ -5232,6 +5759,8 @@ let hasSideEffects = 0, Predicates = [HasERI] in {
defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s>, T8PD, EVEX_4V;
defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s>, T8PD, EVEX_4V;
}
+
+defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds>, T8PD, EVEX_4V;
/// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd
multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
@@ -5322,67 +5851,6 @@ multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
}
}
-multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,
- Intrinsic F32Int, Intrinsic F64Int,
- OpndItins itins_s, OpndItins itins_d> {
- def SSZr : SI<opc, MRMSrcReg, (outs FR32X:$dst),
- (ins FR32X:$src1, FR32X:$src2),
- !strconcat(OpcodeStr,
- "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [], itins_s.rr>, XS, EVEX_4V;
- let isCodeGenOnly = 1 in
- def SSZr_Int : SIi8<opc, MRMSrcReg, (outs VR128X:$dst),
- (ins VR128X:$src1, VR128X:$src2),
- !strconcat(OpcodeStr,
- "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128X:$dst,
- (F32Int VR128X:$src1, VR128X:$src2))],
- itins_s.rr>, XS, EVEX_4V;
- let mayLoad = 1 in {
- def SSZm : SI<opc, MRMSrcMem, (outs FR32X:$dst),
- (ins FR32X:$src1, f32mem:$src2),
- !strconcat(OpcodeStr,
- "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [], itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>;
- let isCodeGenOnly = 1 in
- def SSZm_Int : SIi8<opc, MRMSrcMem, (outs VR128X:$dst),
- (ins VR128X:$src1, ssmem:$src2),
- !strconcat(OpcodeStr,
- "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128X:$dst,
- (F32Int VR128X:$src1, sse_load_f32:$src2))],
- itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>;
- }
- def SDZr : SI<opc, MRMSrcReg, (outs FR64X:$dst),
- (ins FR64X:$src1, FR64X:$src2),
- !strconcat(OpcodeStr,
- "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
- XD, EVEX_4V, VEX_W;
- let isCodeGenOnly = 1 in
- def SDZr_Int : SIi8<opc, MRMSrcReg, (outs VR128X:$dst),
- (ins VR128X:$src1, VR128X:$src2),
- !strconcat(OpcodeStr,
- "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128X:$dst,
- (F64Int VR128X:$src1, VR128X:$src2))],
- itins_s.rr>, XD, EVEX_4V, VEX_W;
- let mayLoad = 1 in {
- def SDZm : SI<opc, MRMSrcMem, (outs FR64X:$dst),
- (ins FR64X:$src1, f64mem:$src2),
- !strconcat(OpcodeStr,
- "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
- XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>;
- let isCodeGenOnly = 1 in
- def SDZm_Int : SIi8<opc, MRMSrcMem, (outs VR128X:$dst),
- (ins VR128X:$src1, sdmem:$src2),
- !strconcat(OpcodeStr,
- "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128X:$dst,
- (F64Int VR128X:$src1, sse_load_f64:$src2))]>,
- XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>;
- }
-}
-
multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr,
SDNode OpNode> {
defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
@@ -5416,93 +5884,77 @@ multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
v8f64_info>, EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>;
}
+multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
+ string SUFF, SDNode OpNode, SDNode OpNodeRnd> {
+
+ defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (OpNodeRnd (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (i32 FROUND_CURRENT))>;
+ let mayLoad = 1 in
+ defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (OpNodeRnd (_.VT _.RC:$src1),
+ (_.VT (scalar_to_vector
+ (_.ScalarLdFrag addr:$src2))),
+ (i32 FROUND_CURRENT))>;
+
+ defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
+ "$rc, $src2, $src1", "$src1, $src2, $rc",
+ (OpNodeRnd (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (i32 imm:$rc))>,
+ EVEX_B, EVEX_RC;
+
+ let isCodeGenOnly = 1 in {
+ def r : I<opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>;
+
+ let mayLoad = 1 in
+ def m : I<opc, MRMSrcMem, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>;
+ }
+
+ def : Pat<(_.EltVT (OpNode _.FRC:$src)),
+ (!cast<Instruction>(NAME#SUFF#Zr)
+ (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>;
+
+ def : Pat<(_.EltVT (OpNode (load addr:$src))),
+ (!cast<Instruction>(NAME#SUFF#Zm)
+ (_.EltVT (IMPLICIT_DEF)), addr:$src)>, Requires<[OptForSize]>;
+}
+
+multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr> {
+ defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", f32x_info, "SS", fsqrt,
+ X86fsqrtRnds>, EVEX_CD8<32, CD8VT1>, EVEX_4V, XS;
+ defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", f64x_info, "SD", fsqrt,
+ X86fsqrtRnds>, EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W;
+}
+
defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>,
avx512_sqrt_packed_all_round<0x51, "vsqrt", X86fsqrtRnd>;
-defm VSQRT : avx512_sqrt_scalar<0x51, "sqrt",
- int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd,
- SSE_SQRTSS, SSE_SQRTSD>;
+defm VSQRT : avx512_sqrt_scalar_all<0x51, "vsqrt">, VEX_LIG;
let Predicates = [HasAVX512] in {
- def : Pat<(f32 (fsqrt FR32X:$src)),
- (VSQRTSSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
- def : Pat<(f32 (fsqrt (load addr:$src))),
- (VSQRTSSZm (f32 (IMPLICIT_DEF)), addr:$src)>,
- Requires<[OptForSize]>;
- def : Pat<(f64 (fsqrt FR64X:$src)),
- (VSQRTSDZr (f64 (IMPLICIT_DEF)), FR64X:$src)>;
- def : Pat<(f64 (fsqrt (load addr:$src))),
- (VSQRTSDZm (f64 (IMPLICIT_DEF)), addr:$src)>,
- Requires<[OptForSize]>;
-
def : Pat<(f32 (X86frsqrt FR32X:$src)),
- (VRSQRT14SSrr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
+ (COPY_TO_REGCLASS (VRSQRT14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>;
def : Pat<(f32 (X86frsqrt (load addr:$src))),
- (VRSQRT14SSrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ (COPY_TO_REGCLASS (VRSQRT14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
Requires<[OptForSize]>;
-
def : Pat<(f32 (X86frcp FR32X:$src)),
- (VRCP14SSrr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
+ (COPY_TO_REGCLASS (VRCP14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X )>;
def : Pat<(f32 (X86frcp (load addr:$src))),
- (VRCP14SSrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ (COPY_TO_REGCLASS (VRCP14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
Requires<[OptForSize]>;
-
- def : Pat<(int_x86_sse_sqrt_ss VR128X:$src),
- (COPY_TO_REGCLASS (VSQRTSSZr (f32 (IMPLICIT_DEF)),
- (COPY_TO_REGCLASS VR128X:$src, FR32)),
- VR128X)>;
- def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src),
- (VSQRTSSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
-
- def : Pat<(int_x86_sse2_sqrt_sd VR128X:$src),
- (COPY_TO_REGCLASS (VSQRTSDZr (f64 (IMPLICIT_DEF)),
- (COPY_TO_REGCLASS VR128X:$src, FR64)),
- VR128X)>;
- def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src),
- (VSQRTSDZm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>;
-}
-
-
-multiclass avx512_rndscale<bits<8> opc, string OpcodeStr,
- X86MemOperand x86memop, RegisterClass RC,
- PatFrag mem_frag, Domain d> {
-let ExeDomain = d in {
- // Intrinsic operation, reg.
- // Vector intrinsic operation, reg
- def r : AVX512AIi8<opc, MRMSrcReg,
- (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, EVEX;
-
- // Vector intrinsic operation, mem
- def m : AVX512AIi8<opc, MRMSrcMem,
- (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, EVEX;
-} // ExeDomain
}
-defm VRNDSCALEPSZ : avx512_rndscale<0x08, "vrndscaleps", f512mem, VR512,
- loadv16f32, SSEPackedSingle>, EVEX_V512,
- EVEX_CD8<32, CD8VF>;
-
-def : Pat<(v16f32 (int_x86_avx512_mask_rndscale_ps_512 (v16f32 VR512:$src1),
- imm:$src2, (v16f32 VR512:$src1), (i16 -1),
- FROUND_CURRENT)),
- (VRNDSCALEPSZr VR512:$src1, imm:$src2)>;
-
-
-defm VRNDSCALEPDZ : avx512_rndscale<0x09, "vrndscalepd", f512mem, VR512,
- loadv8f64, SSEPackedDouble>, EVEX_V512,
- VEX_W, EVEX_CD8<64, CD8VF>;
-
-def : Pat<(v8f64 (int_x86_avx512_mask_rndscale_pd_512 (v8f64 VR512:$src1),
- imm:$src2, (v8f64 VR512:$src1), (i8 -1),
- FROUND_CURRENT)),
- (VRNDSCALEPDZr VR512:$src1, imm:$src2)>;
-
multiclass
avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
@@ -5510,20 +5962,20 @@ avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
"$src3, $src2, $src1", "$src1, $src2, $src3",
- (_.VT (X86RndScale (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2),
(i32 imm:$src3), (i32 FROUND_CURRENT)))>;
defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
- "{sae}, $src3, $src2, $src1", "$src1, $src2, $src3, {sae}",
- (_.VT (X86RndScale (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ "$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3",
+ (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2),
(i32 imm:$src3), (i32 FROUND_NO_EXC)))>, EVEX_B;
let mayLoad = 1 in
defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), OpcodeStr,
"$src3, $src2, $src1", "$src1, $src2, $src3",
- (_.VT (X86RndScale (_.VT _.RC:$src1),
+ (_.VT (X86RndScales (_.VT _.RC:$src1),
(_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
(i32 imm:$src3), (i32 FROUND_CURRENT)))>;
}
@@ -5568,109 +6020,238 @@ defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", f32x_info>,
defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", f64x_info>, VEX_W,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VT1>;
-let Predicates = [HasAVX512] in {
-def : Pat<(v16f32 (ffloor VR512:$src)),
- (VRNDSCALEPSZr VR512:$src, (i32 0x1))>;
-def : Pat<(v16f32 (fnearbyint VR512:$src)),
- (VRNDSCALEPSZr VR512:$src, (i32 0xC))>;
-def : Pat<(v16f32 (fceil VR512:$src)),
- (VRNDSCALEPSZr VR512:$src, (i32 0x2))>;
-def : Pat<(v16f32 (frint VR512:$src)),
- (VRNDSCALEPSZr VR512:$src, (i32 0x4))>;
-def : Pat<(v16f32 (ftrunc VR512:$src)),
- (VRNDSCALEPSZr VR512:$src, (i32 0x3))>;
-
-def : Pat<(v8f64 (ffloor VR512:$src)),
- (VRNDSCALEPDZr VR512:$src, (i32 0x1))>;
-def : Pat<(v8f64 (fnearbyint VR512:$src)),
- (VRNDSCALEPDZr VR512:$src, (i32 0xC))>;
-def : Pat<(v8f64 (fceil VR512:$src)),
- (VRNDSCALEPDZr VR512:$src, (i32 0x2))>;
-def : Pat<(v8f64 (frint VR512:$src)),
- (VRNDSCALEPDZr VR512:$src, (i32 0x4))>;
-def : Pat<(v8f64 (ftrunc VR512:$src)),
- (VRNDSCALEPDZr VR512:$src, (i32 0x3))>;
-}
//-------------------------------------------------
// Integer truncate and extend operations
//-------------------------------------------------
-multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr,
- RegisterClass dstRC, RegisterClass srcRC,
- RegisterClass KRC, X86MemOperand x86memop> {
- def rr : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
- (ins srcRC:$src),
- !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
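+// Common truncate building block: the masked register-to-register forms plus
+// the (pattern-less) mr/mrk store forms; store selection is added separately
+// by the *_mr_lowering multiclasses below.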
+multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo SrcInfo, X86VectorVTInfo DestInfo,
+ X86MemOperand x86memop> {
+
+ defm rr : AVX512_maskable<opc, MRMDestReg, DestInfo, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.RC:$src1), OpcodeStr ,"$src1", "$src1",
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1)))>,
+ EVEX, T8XS;
+
+ // for intrinsic pattern matching
+ def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask,
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))),
+ undef)),
+ (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask ,
+ SrcInfo.RC:$src1)>;
+
+ def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask,
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))),
+ DestInfo.ImmAllZerosV)),
+ (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask ,
+ SrcInfo.RC:$src1)>;
+
+ def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask,
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))),
+ DestInfo.RC:$src0)),
+ (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrk) DestInfo.RC:$src0,
+ DestInfo.KRCWM:$mask ,
+ SrcInfo.RC:$src1)>;
+
+ let mayStore = 1 in {
+ def mr : AVX512XS8I<opc, MRMDestMem, (outs),
+ (ins x86memop:$dst, SrcInfo.RC:$src),
+ OpcodeStr # "\t{$src, $dst |$dst, $src}",
[]>, EVEX;
- def rrk : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
- (ins KRC:$mask, srcRC:$src),
- !strconcat(OpcodeStr,
- "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"),
+ def mrk : AVX512XS8I<opc, MRMDestMem, (outs),
+ (ins x86memop:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src),
+ OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}",
[]>, EVEX, EVEX_K;
+ } // mayStore = 1
+}
- def rrkz : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
- (ins KRC:$mask, srcRC:$src),
- !strconcat(OpcodeStr,
- "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
- []>, EVEX, EVEX_KZ;
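+// Attach store-selection patterns to the mr/mrk forms above: plain and masked
+// truncating-store fragments are lowered to the corresponding instructions.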
+multiclass avx512_trunc_mr_lowering<X86VectorVTInfo SrcInfo,
+ X86VectorVTInfo DestInfo,
+ PatFrag truncFrag, PatFrag mtruncFrag > {
- def mr : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, srcRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- []>, EVEX;
+ def : Pat<(truncFrag (SrcInfo.VT SrcInfo.RC:$src), addr:$dst),
+ (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mr)
+ addr:$dst, SrcInfo.RC:$src)>;
- def mrk : AVX512XS8I<opc, MRMDestMem, (outs),
- (ins x86memop:$dst, KRC:$mask, srcRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst {${mask}}|${dst} {${mask}}, $src}"),
- []>, EVEX, EVEX_K;
+ def : Pat<(mtruncFrag addr:$dst, SrcInfo.KRCWM:$mask,
+ (SrcInfo.VT SrcInfo.RC:$src)),
+ (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mrk)
+ addr:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src)>;
+}
+
+multiclass avx512_trunc_sat_mr_lowering<X86VectorVTInfo SrcInfo,
+ X86VectorVTInfo DestInfo, string sat > {
+
+ def: Pat<(!cast<Intrinsic>("int_x86_avx512_mask_pmov"#sat#"_"#SrcInfo.Suffix#
+ DestInfo.Suffix#"_mem_"#SrcInfo.Size)
+ addr:$ptr, (SrcInfo.VT SrcInfo.RC:$src), SrcInfo.MRC:$mask),
+ (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mrk) addr:$ptr,
+ (COPY_TO_REGCLASS SrcInfo.MRC:$mask, SrcInfo.KRCWM),
+ (SrcInfo.VT SrcInfo.RC:$src))>;
+ def: Pat<(!cast<Intrinsic>("int_x86_avx512_mask_pmov"#sat#"_"#SrcInfo.Suffix#
+ DestInfo.Suffix#"_mem_"#SrcInfo.Size)
+ addr:$ptr, (SrcInfo.VT SrcInfo.RC:$src), -1),
+ (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mr) addr:$ptr,
+ (SrcInfo.VT SrcInfo.RC:$src))>;
}
-defm VPMOVQB : avx512_trunc_sat<0x32, "vpmovqb", VR128X, VR512, VK8WM,
- i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>;
-defm VPMOVSQB : avx512_trunc_sat<0x22, "vpmovsqb", VR128X, VR512, VK8WM,
- i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>;
-defm VPMOVUSQB : avx512_trunc_sat<0x12, "vpmovusqb", VR128X, VR512, VK8WM,
- i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>;
-defm VPMOVQW : avx512_trunc_sat<0x34, "vpmovqw", VR128X, VR512, VK8WM,
- i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>;
-defm VPMOVSQW : avx512_trunc_sat<0x24, "vpmovsqw", VR128X, VR512, VK8WM,
- i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>;
-defm VPMOVUSQW : avx512_trunc_sat<0x14, "vpmovusqw", VR128X, VR512, VK8WM,
- i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>;
-defm VPMOVQD : avx512_trunc_sat<0x35, "vpmovqd", VR256X, VR512, VK8WM,
- i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>;
-defm VPMOVSQD : avx512_trunc_sat<0x25, "vpmovsqd", VR256X, VR512, VK8WM,
- i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>;
-defm VPMOVUSQD : avx512_trunc_sat<0x15, "vpmovusqd", VR256X, VR512, VK8WM,
- i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>;
-defm VPMOVDW : avx512_trunc_sat<0x33, "vpmovdw", VR256X, VR512, VK16WM,
- i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>;
-defm VPMOVSDW : avx512_trunc_sat<0x23, "vpmovsdw", VR256X, VR512, VK16WM,
- i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>;
-defm VPMOVUSDW : avx512_trunc_sat<0x13, "vpmovusdw", VR256X, VR512, VK16WM,
- i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>;
-defm VPMOVDB : avx512_trunc_sat<0x31, "vpmovdb", VR128X, VR512, VK16WM,
- i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>;
-defm VPMOVSDB : avx512_trunc_sat<0x21, "vpmovsdb", VR128X, VR512, VK16WM,
- i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>;
-defm VPMOVUSDB : avx512_trunc_sat<0x11, "vpmovusdb", VR128X, VR512, VK16WM,
- i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>;
-
-def : Pat<(v16i8 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQBrr VR512:$src)>;
-def : Pat<(v8i16 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQWrr VR512:$src)>;
-def : Pat<(v16i16 (X86vtrunc (v16i32 VR512:$src))), (VPMOVDWrr VR512:$src)>;
-def : Pat<(v16i8 (X86vtrunc (v16i32 VR512:$src))), (VPMOVDBrr VR512:$src)>;
-def : Pat<(v8i32 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQDrr VR512:$src)>;
-
-def : Pat<(v16i8 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))),
- (VPMOVDBrrkz VK16WM:$mask, VR512:$src)>;
-def : Pat<(v16i16 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))),
- (VPMOVDWrrkz VK16WM:$mask, VR512:$src)>;
-def : Pat<(v8i16 (X86vtruncm VK8WM:$mask, (v8i64 VR512:$src))),
- (VPMOVQWrrkz VK8WM:$mask, VR512:$src)>;
-def : Pat<(v8i32 (X86vtruncm VK8WM:$mask, (v8i64 VR512:$src))),
- (VPMOVQDrrkz VK8WM:$mask, VR512:$src)>;
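+// Instantiate the truncate forms for 128-, 256- and 512-bit sources; the
+// 128/256-bit variants additionally require VLX.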
+multiclass avx512_trunc<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128,
+ X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ,
+ X86MemOperand x86memopZ128, X86MemOperand x86memopZ256,
+ X86MemOperand x86memopZ, PatFrag truncFrag, PatFrag mtruncFrag,
+ Predicate prd = HasAVX512>{
+
+ let Predicates = [HasVLX, prd] in {
+ defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info128,
+ DestInfoZ128, x86memopZ128>,
+ avx512_trunc_mr_lowering<VTSrcInfo.info128, DestInfoZ128,
+ truncFrag, mtruncFrag>, EVEX_V128;
+
+ defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info256,
+ DestInfoZ256, x86memopZ256>,
+ avx512_trunc_mr_lowering<VTSrcInfo.info256, DestInfoZ256,
+ truncFrag, mtruncFrag>, EVEX_V256;
+ }
+ let Predicates = [prd] in
+ defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info512,
+ DestInfoZ, x86memopZ>,
+ avx512_trunc_mr_lowering<VTSrcInfo.info512, DestInfoZ,
+ truncFrag, mtruncFrag>, EVEX_V512;
+}
+
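+// Saturating variant: stores are matched through the mask_pmov* intrinsics
+// (avx512_trunc_sat_mr_lowering) instead of truncstore fragments.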
+multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128,
+ X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ,
+ X86MemOperand x86memopZ128, X86MemOperand x86memopZ256,
+ X86MemOperand x86memopZ, string sat, Predicate prd = HasAVX512>{
+
+ let Predicates = [HasVLX, prd] in {
+ defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info128,
+ DestInfoZ128, x86memopZ128>,
+ avx512_trunc_sat_mr_lowering<VTSrcInfo.info128, DestInfoZ128,
+ sat>, EVEX_V128;
+
+ defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info256,
+ DestInfoZ256, x86memopZ256>,
+ avx512_trunc_sat_mr_lowering<VTSrcInfo.info256, DestInfoZ256,
+ sat>, EVEX_V256;
+ }
+ let Predicates = [prd] in
+ defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info512,
+ DestInfoZ, x86memopZ>,
+ avx512_trunc_sat_mr_lowering<VTSrcInfo.info512, DestInfoZ,
+ sat>, EVEX_V512;
+}
+
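+// Per element-size wrappers (q->b/w/d, d->b/w, w->b). The memory operand
+// width matches the number of bytes each vector length actually stores.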
+multiclass avx512_trunc_qb<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info,
+ v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem,
+ truncstorevi8, masked_truncstorevi8>, EVEX_CD8<8, CD8VO>;
+}
+multiclass avx512_trunc_sat_qb<bits<8> opc, string sat, SDNode OpNode> {
+ defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qb", OpNode, avx512vl_i64_info,
+ v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem,
+ sat>, EVEX_CD8<8, CD8VO>;
+}
+
+multiclass avx512_trunc_qw<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info,
+ v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem,
+ truncstorevi16, masked_truncstorevi16>, EVEX_CD8<16, CD8VQ>;
+}
+multiclass avx512_trunc_sat_qw<bits<8> opc, string sat, SDNode OpNode> {
+ defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qw", OpNode, avx512vl_i64_info,
+ v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem,
+ sat>, EVEX_CD8<16, CD8VQ>;
+}
+
+multiclass avx512_trunc_qd<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info,
+ v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem,
+ truncstorevi32, masked_truncstorevi32>, EVEX_CD8<32, CD8VH>;
+}
+multiclass avx512_trunc_sat_qd<bits<8> opc, string sat, SDNode OpNode> {
+ defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qd", OpNode, avx512vl_i64_info,
+ v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem,
+ sat>, EVEX_CD8<32, CD8VH>;
+}
+
+multiclass avx512_trunc_db<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info,
+ v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem,
+ truncstorevi8, masked_truncstorevi8>, EVEX_CD8<8, CD8VQ>;
+}
+multiclass avx512_trunc_sat_db<bits<8> opc, string sat, SDNode OpNode> {
+ defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"db", OpNode, avx512vl_i32_info,
+ v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem,
+ sat>, EVEX_CD8<8, CD8VQ>;
+}
+
+multiclass avx512_trunc_dw<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info,
+ v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem,
+ truncstorevi16, masked_truncstorevi16>, EVEX_CD8<16, CD8VH>;
+}
+multiclass avx512_trunc_sat_dw<bits<8> opc, string sat, SDNode OpNode> {
+ defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"dw", OpNode, avx512vl_i32_info,
+ v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem,
+ sat>, EVEX_CD8<16, CD8VH>;
+}
+
+multiclass avx512_trunc_wb<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i16_info,
+ v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem,
+ truncstorevi8, masked_truncstorevi8,HasBWI>, EVEX_CD8<16, CD8VH>;
+}
+multiclass avx512_trunc_sat_wb<bits<8> opc, string sat, SDNode OpNode> {
+ defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"wb", OpNode, avx512vl_i16_info,
+ v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem,
+ sat, HasBWI>, EVEX_CD8<16, CD8VH>;
+}
+
+defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", X86vtrunc>;
+defm VPMOVSQB : avx512_trunc_sat_qb<0x22, "s", X86vtruncs>;
+defm VPMOVUSQB : avx512_trunc_sat_qb<0x12, "us", X86vtruncus>;
+
+defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", X86vtrunc>;
+defm VPMOVSQW : avx512_trunc_sat_qw<0x24, "s", X86vtruncs>;
+defm VPMOVUSQW : avx512_trunc_sat_qw<0x14, "us", X86vtruncus>;
+
+defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", X86vtrunc>;
+defm VPMOVSQD : avx512_trunc_sat_qd<0x25, "s", X86vtruncs>;
+defm VPMOVUSQD : avx512_trunc_sat_qd<0x15, "us", X86vtruncus>;
+
+defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", X86vtrunc>;
+defm VPMOVSDB : avx512_trunc_sat_db<0x21, "s", X86vtruncs>;
+defm VPMOVUSDB : avx512_trunc_sat_db<0x11, "us", X86vtruncus>;
+
+defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", X86vtrunc>;
+defm VPMOVSDW : avx512_trunc_sat_dw<0x23, "s", X86vtruncs>;
+defm VPMOVUSDW : avx512_trunc_sat_dw<0x13, "us", X86vtruncus>;
+
+defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", X86vtrunc>;
+defm VPMOVSWB : avx512_trunc_sat_wb<0x20, "s", X86vtruncs>;
+defm VPMOVUSWB : avx512_trunc_sat_wb<0x10, "us", X86vtruncus>;
+
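+// Without VLX, lower 256-bit truncates by widening the source to 512 bits,
+// truncating with the 512-bit instruction and extracting the low subregister.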
+let Predicates = [HasAVX512, NoVLX] in {
+def: Pat<(v8i16 (X86vtrunc (v8i32 VR256X:$src))),
+ (v8i16 (EXTRACT_SUBREG
+ (v16i16 (VPMOVDWZrr (v16i32 (SUBREG_TO_REG (i32 0),
+ VR256X:$src, sub_ymm)))), sub_xmm))>;
+def: Pat<(v4i32 (X86vtrunc (v4i64 VR256X:$src))),
+ (v4i32 (EXTRACT_SUBREG
+ (v8i32 (VPMOVQDZrr (v8i64 (SUBREG_TO_REG (i32 0),
+ VR256X:$src, sub_ymm)))), sub_xmm))>;
+}
+
+let Predicates = [HasBWI, NoVLX] in {
+def: Pat<(v16i8 (X86vtrunc (v16i16 VR256X:$src))),
+ (v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (SUBREG_TO_REG (i32 0),
+ VR256X:$src, sub_ymm))), sub_xmm))>;
+}
multiclass avx512_extend_common<bits<8> opc, string OpcodeStr,
X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo,
@@ -5985,163 +6566,11 @@ defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd
defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd",
VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
-//===----------------------------------------------------------------------===//
-// VSHUFPS - VSHUFPD Operations
-
-multiclass avx512_shufp<RegisterClass RC, X86MemOperand x86memop,
- ValueType vt, string OpcodeStr, PatFrag mem_frag,
- Domain d> {
- def rmi : AVX512PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, x86memop:$src2, u8imm:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
- (i8 imm:$src3))))], d, IIC_SSE_SHUFP>,
- EVEX_4V, Sched<[WriteShuffleLd, ReadAfterLd]>;
- def rri : AVX512PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, u8imm:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
- (i8 imm:$src3))))], d, IIC_SSE_SHUFP>,
- EVEX_4V, Sched<[WriteShuffle]>;
-}
-
-defm VSHUFPSZ : avx512_shufp<VR512, f512mem, v16f32, "vshufps", loadv16f32,
- SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VSHUFPDZ : avx512_shufp<VR512, f512mem, v8f64, "vshufpd", loadv8f64,
- SSEPackedDouble>, PD, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
-
-def : Pat<(v16i32 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))),
- (VSHUFPSZrri VR512:$src1, VR512:$src2, imm:$imm)>;
-def : Pat<(v16i32 (X86Shufp VR512:$src1,
- (loadv16i32 addr:$src2), (i8 imm:$imm))),
- (VSHUFPSZrmi VR512:$src1, addr:$src2, imm:$imm)>;
-
-def : Pat<(v8i64 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))),
- (VSHUFPDZrri VR512:$src1, VR512:$src2, imm:$imm)>;
-def : Pat<(v8i64 (X86Shufp VR512:$src1,
- (loadv8i64 addr:$src2), (i8 imm:$imm))),
- (VSHUFPDZrmi VR512:$src1, addr:$src2, imm:$imm)>;
// Helper fragments to match sext vXi1 to vXiY.
def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>;
def v8i1sextv8i64 : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>;
-multiclass avx512_conflict<bits<8> opc, string OpcodeStr,
- RegisterClass RC, RegisterClass KRC,
- X86MemOperand x86memop,
- X86MemOperand x86scalar_mop, string BrdcstStr> {
- let hasSideEffects = 0 in {
- def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src),
- !strconcat(OpcodeStr, "\t{$src, ${dst} |${dst}, $src}"),
- []>, EVEX;
- let mayLoad = 1 in
- def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
- (ins x86memop:$src),
- !strconcat(OpcodeStr, "\t{$src, ${dst}|${dst}, $src}"),
- []>, EVEX;
- let mayLoad = 1 in
- def rmb : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
- (ins x86scalar_mop:$src),
- !strconcat(OpcodeStr, "\t{${src}", BrdcstStr,
- ", ${dst}|${dst}, ${src}", BrdcstStr, "}"),
- []>, EVEX, EVEX_B;
- def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
- (ins KRC:$mask, RC:$src),
- !strconcat(OpcodeStr,
- "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
- []>, EVEX, EVEX_KZ;
- let mayLoad = 1 in
- def rmkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
- (ins KRC:$mask, x86memop:$src),
- !strconcat(OpcodeStr,
- "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
- []>, EVEX, EVEX_KZ;
- let mayLoad = 1 in
- def rmbkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
- (ins KRC:$mask, x86scalar_mop:$src),
- !strconcat(OpcodeStr, "\t{${src}", BrdcstStr,
- ", ${dst} {${mask}} {z}|${dst} {${mask}} {z}, ${src}",
- BrdcstStr, "}"),
- []>, EVEX, EVEX_KZ, EVEX_B;
-
- let Constraints = "$src1 = $dst" in {
- def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, KRC:$mask, RC:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
- []>, EVEX, EVEX_K;
- let mayLoad = 1 in
- def rmk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, KRC:$mask, x86memop:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
- []>, EVEX, EVEX_K;
- let mayLoad = 1 in
- def rmbk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, KRC:$mask, x86scalar_mop:$src2),
- !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr,
- ", ${dst} {${mask}}|${dst} {${mask}}, ${src2}", BrdcstStr, "}"),
- []>, EVEX, EVEX_K, EVEX_B;
- }
- }
-}
-
-let Predicates = [HasCDI] in {
-defm VPCONFLICTD : avx512_conflict<0xC4, "vpconflictd", VR512, VK16WM,
- i512mem, i32mem, "{1to16}">,
- EVEX_V512, EVEX_CD8<32, CD8VF>;
-
-
-defm VPCONFLICTQ : avx512_conflict<0xC4, "vpconflictq", VR512, VK8WM,
- i512mem, i64mem, "{1to8}">,
- EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-
-}
-
-def : Pat<(int_x86_avx512_mask_conflict_d_512 VR512:$src2, VR512:$src1,
- GR16:$mask),
- (VPCONFLICTDrrk VR512:$src1,
- (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), VR512:$src2)>;
-
-def : Pat<(int_x86_avx512_mask_conflict_q_512 VR512:$src2, VR512:$src1,
- GR8:$mask),
- (VPCONFLICTQrrk VR512:$src1,
- (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>;
-
-let Predicates = [HasCDI] in {
-defm VPLZCNTD : avx512_conflict<0x44, "vplzcntd", VR512, VK16WM,
- i512mem, i32mem, "{1to16}">,
- EVEX_V512, EVEX_CD8<32, CD8VF>;
-
-
-defm VPLZCNTQ : avx512_conflict<0x44, "vplzcntq", VR512, VK8WM,
- i512mem, i64mem, "{1to8}">,
- EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-
-}
-
-def : Pat<(int_x86_avx512_mask_lzcnt_d_512 VR512:$src2, VR512:$src1,
- GR16:$mask),
- (VPLZCNTDrrk VR512:$src1,
- (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), VR512:$src2)>;
-
-def : Pat<(int_x86_avx512_mask_lzcnt_q_512 VR512:$src2, VR512:$src1,
- GR8:$mask),
- (VPLZCNTQrrk VR512:$src1,
- (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>;
-
-def : Pat<(v16i32 (ctlz (loadv16i32 addr:$src))),
- (VPLZCNTDrm addr:$src)>;
-def : Pat<(v16i32 (ctlz (v16i32 VR512:$src))),
- (VPLZCNTDrr VR512:$src)>;
-def : Pat<(v8i64 (ctlz (loadv8i64 addr:$src))),
- (VPLZCNTQrm addr:$src)>;
-def : Pat<(v8i64 (ctlz (v8i64 VR512:$src))),
- (VPLZCNTQrr VR512:$src)>;
-
def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>;
@@ -6197,7 +6626,7 @@ defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">;
multiclass convert_vector_to_mask_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr > {
def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set _.KRC:$dst, (trunc (_.VT _.RC:$src)))]>, EVEX;
+ [(set _.KRC:$dst, (X86cvt2mask (_.VT _.RC:$src)))]>, EVEX;
}
multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr,
@@ -6230,7 +6659,7 @@ defm VPMOVQ2M : avx512_convert_vector_to_mask<0x39, "vpmovq2m",
multiclass compress_by_vec_width<bits<8> opc, X86VectorVTInfo _,
string OpcodeStr> {
defm rr : AVX512_maskable<opc, MRMDestReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
+ (ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
(_.VT (X86compress _.RC:$src1))>, AVX5128IBase;
let mayStore = 1 in {
@@ -6242,7 +6671,7 @@ multiclass compress_by_vec_width<bits<8> opc, X86VectorVTInfo _,
def mrk : AVX5128I<opc, MRMDestMem, (outs),
(ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}",
- [(store (_.VT (vselect _.KRCWM:$mask,
+ [(store (_.VT (vselect _.KRCWM:$mask,
(_.VT (X86compress _.RC:$src)), _.ImmAllZerosV)),
addr:$dst)]>,
EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>;
@@ -6272,7 +6701,7 @@ defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", avx512vl_f64_info
multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _,
string OpcodeStr> {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
+ (ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
(_.VT (X86expand _.RC:$src1))>, AVX5128IBase;
let mayLoad = 1 in
@@ -6302,6 +6731,62 @@ defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", avx512vl_f32_info>,
defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>,
EVEX, VEX_W;
+// Handle instructions of the form reg_vec1 = op(reg_vec, imm),
+//                                 reg_vec1 = op(mem_vec, imm),
+//                                 reg_vec1 = op(broadcast(eltVT), imm).
+// All instructions are created with FROUND_CURRENT.
+multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _>{
+ defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix, "$src2, $src1", "$src2, $src2",
+ (OpNode (_.VT _.RC:$src1),
+ (i32 imm:$src2),
+ (i32 FROUND_CURRENT))>;
+ let mayLoad = 1 in {
+ defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i32 imm:$src2),
+ (i32 FROUND_CURRENT))>;
+ defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr,
+ "${src1}"##_.BroadcastStr##", $src2",
+ (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))),
+ (i32 imm:$src2),
+ (i32 FROUND_CURRENT))>, EVEX_B;
+ }
+}
+
+// Handle the {sae} form: reg_vec1 = op(reg_vec2, imm), {sae}.
+multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, X86VectorVTInfo _>{
+ defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix, "$src2,{sae}, $src1",
+ "$src1, {sae}, $src2",
+ (OpNode (_.VT _.RC:$src1),
+ (i32 imm:$src2),
+ (i32 FROUND_NO_EXC))>, EVEX_B;
+}
+
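+// Only the 512-bit form gets the additional {sae} variant; the 128/256-bit
+// forms require VLX and use the default (current) rounding mode.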
+multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr,
+ AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
+ let Predicates = [prd] in {
+ defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
+ avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
+ EVEX_V512;
+ }
+ let Predicates = [prd, HasVLX] in {
+ defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info128>,
+ EVEX_V128;
+ defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info256>,
+ EVEX_V256;
+ }
+}
+
//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm)
// op(reg_vec2,mem_vec,imm)
// op(reg_vec2,broadcast(eltVt),imm)
@@ -6309,49 +6794,60 @@ defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>,
multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _>{
defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
+ (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i8 imm:$src3),
+ (i32 imm:$src3),
(i32 FROUND_CURRENT))>;
let mayLoad = 1 in {
defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
+ (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2))),
- (i8 imm:$src3),
+ (i32 imm:$src3),
(i32 FROUND_CURRENT))>;
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr##", $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- (i8 imm:$src3),
+ (i32 imm:$src3),
(i32 FROUND_CURRENT))>, EVEX_B;
}
}
//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm)
// op(reg_vec2,mem_vec,imm)
+multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo>{
+
+ defm rri : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.RC:$src1, SrcInfo.RC:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1),
+ (SrcInfo.VT SrcInfo.RC:$src2),
+ (i8 imm:$src3)))>;
+ let mayLoad = 1 in
+ defm rmi : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
+ (ins SrcInfo.RC:$src1, SrcInfo.MemOp:$src2, u8imm:$src3),
+ OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1),
+ (SrcInfo.VT (bitconvert
+ (SrcInfo.LdFrag addr:$src2))),
+ (i8 imm:$src3)))>;
+}
+
+//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm)
+// op(reg_vec2,mem_vec,imm)
// op(reg_vec2,broadcast(eltVt),imm)
multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _>{
- defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
- OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
- (OpNode (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- (i8 imm:$src3))>;
- let mayLoad = 1 in {
- defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
- OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
- (OpNode (_.VT _.RC:$src1),
- (_.VT (bitconvert (_.LdFrag addr:$src2))),
- (i8 imm:$src3))>;
+ X86VectorVTInfo _>:
+ avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, _, _>{
+
+ let mayLoad = 1 in
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
@@ -6359,7 +6855,6 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode (_.VT _.RC:$src1),
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
(i8 imm:$src3))>, EVEX_B;
- }
}
//handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm)
@@ -6369,20 +6864,20 @@ multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
defm rri : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
+ (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i8 imm:$src3),
+ (i32 imm:$src3),
(i32 FROUND_CURRENT))>;
let mayLoad = 1 in {
defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
+ (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (scalar_to_vector
(_.ScalarLdFrag addr:$src2))),
- (i8 imm:$src3),
+ (i32 imm:$src3),
(i32 FROUND_CURRENT))>;
let isAsmParserOnly = 1 in {
@@ -6398,18 +6893,25 @@ multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
SDNode OpNode, X86VectorVTInfo _>{
defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
+ (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
OpcodeStr, "$src3,{sae}, $src2, $src1",
"$src1, $src2,{sae}, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i8 imm:$src3),
+ (i32 imm:$src3),
(i32 FROUND_NO_EXC))>, EVEX_B;
}
//handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}
multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr,
SDNode OpNode, X86VectorVTInfo _> {
- defm NAME: avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _>;
+ defm NAME#rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
+ OpcodeStr, "$src3,{sae}, $src2, $src1",
+ "$src1, $src2,{sae}, $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (i32 imm:$src3),
+ (i32 FROUND_NO_EXC))>, EVEX_B;
}
multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr,
@@ -6428,6 +6930,20 @@ multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr,
}
}
+multiclass avx512_common_3Op_rm_imm8<bits<8> opc, SDNode OpNode, string OpStr,
+ AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo>{
+ let Predicates = [HasBWI] in {
+ defm Z : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info512,
+ SrcInfo.info512>, EVEX_V512, AVX512AIi8Base, EVEX_4V;
+ }
+ let Predicates = [HasBWI, HasVLX] in {
+ defm Z128 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info128,
+ SrcInfo.info128>, EVEX_V128, AVX512AIi8Base, EVEX_4V;
+ defm Z256 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info256,
+ SrcInfo.info256>, EVEX_V256, AVX512AIi8Base, EVEX_4V;
+ }
+}
+
multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _,
bits<8> opc, SDNode OpNode>{
let Predicates = [HasAVX512] in {
@@ -6447,6 +6963,14 @@ multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr,
}
}
+multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr,
+ bits<8> opcPs, bits<8> opcPd, SDNode OpNode, Predicate prd>{
+ defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info,
+ opcPs, OpNode, prd>, EVEX_CD8<32, CD8VF>;
+ defm PD : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f64_info,
+ opcPd, OpNode, prd>, EVEX_CD8<64, CD8VF>, VEX_W;
+}
+
defm VFIXUPIMMPD : avx512_common_fp_sae_packed_imm<"vfixupimmpd",
avx512vl_f64_info, 0x54, X86VFixupimm, HasAVX512>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
@@ -6461,6 +6985,14 @@ defm VFIXUPIMMSS: avx512_common_fp_sae_scalar_imm<"vfixupimmss", f32x_info,
0x55, X86VFixupimm, HasAVX512>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56,
+ X86VReduce, HasDQI>, AVX512AIi8Base, EVEX;
+defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09,
+ X86VRndScale, HasAVX512>, AVX512AIi8Base, EVEX;
+defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26,
+ X86VGetMant, HasAVX512>, AVX512AIi8Base, EVEX;
+
+
defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info,
0x50, X86VRange, HasDQI>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
@@ -6475,6 +7007,19 @@ defm VRANGESS: avx512_common_fp_sae_scalar_imm<"vrangess", f32x_info,
0x51, X86VRange, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info,
+ 0x57, X86Reduces, HasDQI>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info,
+ 0x57, X86Reduces, HasDQI>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info,
+ 0x27, X86GetMants, HasAVX512>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info,
+ 0x27, X86GetMants, HasAVX512>,
+ AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
multiclass avx512_shuff_packed_128<string OpcodeStr, AVX512VLVectorVTInfo _,
bits<8> opc, SDNode OpNode = X86Shuf128>{
@@ -6486,6 +7031,29 @@ multiclass avx512_shuff_packed_128<string OpcodeStr, AVX512VLVectorVTInfo _,
defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
}
}
+let Predicates = [HasAVX512] in {
+def : Pat<(v16f32 (ffloor VR512:$src)),
+ (VRNDSCALEPSZrri VR512:$src, (i32 0x1))>;
+def : Pat<(v16f32 (fnearbyint VR512:$src)),
+ (VRNDSCALEPSZrri VR512:$src, (i32 0xC))>;
+def : Pat<(v16f32 (fceil VR512:$src)),
+ (VRNDSCALEPSZrri VR512:$src, (i32 0x2))>;
+def : Pat<(v16f32 (frint VR512:$src)),
+ (VRNDSCALEPSZrri VR512:$src, (i32 0x4))>;
+def : Pat<(v16f32 (ftrunc VR512:$src)),
+ (VRNDSCALEPSZrri VR512:$src, (i32 0x3))>;
+
+def : Pat<(v8f64 (ffloor VR512:$src)),
+ (VRNDSCALEPDZrri VR512:$src, (i32 0x1))>;
+def : Pat<(v8f64 (fnearbyint VR512:$src)),
+ (VRNDSCALEPDZrri VR512:$src, (i32 0xC))>;
+def : Pat<(v8f64 (fceil VR512:$src)),
+ (VRNDSCALEPDZrri VR512:$src, (i32 0x2))>;
+def : Pat<(v8f64 (frint VR512:$src)),
+ (VRNDSCALEPDZrri VR512:$src, (i32 0x4))>;
+def : Pat<(v8f64 (ftrunc VR512:$src)),
+ (VRNDSCALEPDZrri VR512:$src, (i32 0x3))>;
+}
defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4",avx512vl_f32_info, 0x23>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
@@ -6496,31 +7064,51 @@ defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4",avx512vl_i32_info, 0x43>,
defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2",avx512vl_i64_info, 0x43>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
-multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I,
- AVX512VLVectorVTInfo VTInfo_FP>{
+multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I> {
defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign>,
AVX512AIi8Base, EVEX_4V;
- let isCodeGenOnly = 1 in {
- defm NAME#_FP: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0x03, X86VAlign>,
- AVX512AIi8Base, EVEX_4V;
- }
}
-defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info, avx512vl_f32_info>,
+defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info>,
EVEX_CD8<32, CD8VF>;
-defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info, avx512vl_f64_info>,
+defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info>,
EVEX_CD8<64, CD8VF>, VEX_W;
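+// Re-select X86PAlignr nodes of wider element types onto the byte-element
+// VPALIGNR instructions instantiated by the VPALIGN defm below.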
+multiclass avx512_vpalign_lowering<X86VectorVTInfo _ , list<Predicate> p>{
+ let Predicates = p in
+ def NAME#_.VTName#rri:
+ Pat<(_.VT (X86PAlignr _.RC:$src1, _.RC:$src2, (i8 imm:$imm))),
+ (!cast<Instruction>(NAME#_.ZSuffix#rri)
+ _.RC:$src1, _.RC:$src2, imm:$imm)>;
+}
+
+multiclass avx512_vpalign_lowering_common<AVX512VLVectorVTInfo _>:
+ avx512_vpalign_lowering<_.info512, [HasBWI]>,
+ avx512_vpalign_lowering<_.info128, [HasBWI, HasVLX]>,
+ avx512_vpalign_lowering<_.info256, [HasBWI, HasVLX]>;
+
+defm VPALIGN: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr" ,
+ avx512vl_i8_info, avx512vl_i8_info>,
+ avx512_vpalign_lowering_common<avx512vl_i16_info>,
+ avx512_vpalign_lowering_common<avx512vl_i32_info>,
+ avx512_vpalign_lowering_common<avx512vl_f32_info>,
+ avx512_vpalign_lowering_common<avx512vl_i64_info>,
+ avx512_vpalign_lowering_common<avx512vl_f64_info>,
+ EVEX_CD8<8, CD8VF>;
+
+defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw" ,
+ avx512vl_i16_info, avx512vl_i8_info>, EVEX_CD8<8, CD8VF>;
+
multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1), OpcodeStr##_.Suffix,
+ (ins _.RC:$src1), OpcodeStr,
"$src1", "$src1",
(_.VT (OpNode _.RC:$src1))>, EVEX, AVX5128IBase;
let mayLoad = 1 in
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.MemOp:$src1), OpcodeStr##_.Suffix,
+ (ins _.MemOp:$src1), OpcodeStr,
"$src1", "$src1",
(_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>,
EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>;
@@ -6531,7 +7119,7 @@ multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
avx512_unary_rm<opc, OpcodeStr, OpNode, _> {
let mayLoad = 1 in
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.ScalarMemOp:$src1), OpcodeStr##_.Suffix,
+ (ins _.ScalarMemOp:$src1), OpcodeStr,
"${src1}"##_.BroadcastStr,
"${src1}"##_.BroadcastStr,
(_.VT (OpNode (X86VBroadcast
@@ -6568,15 +7156,16 @@ multiclass avx512_unary_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass avx512_unary_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
SDNode OpNode, Predicate prd> {
- defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr, OpNode, avx512vl_i64_info,
+ defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr#"q", OpNode, avx512vl_i64_info,
prd>, VEX_W;
- defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr, OpNode, avx512vl_i32_info, prd>;
+ defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr#"d", OpNode, avx512vl_i32_info,
+ prd>;
}
multiclass avx512_unary_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr,
SDNode OpNode, Predicate prd> {
- defm W : avx512_unary_rm_vl<opc_w, OpcodeStr, OpNode, avx512vl_i16_info, prd>;
- defm B : avx512_unary_rm_vl<opc_b, OpcodeStr, OpNode, avx512vl_i8_info, prd>;
+ defm W : avx512_unary_rm_vl<opc_w, OpcodeStr#"w", OpNode, avx512vl_i16_info, prd>;
+ defm B : avx512_unary_rm_vl<opc_b, OpcodeStr#"b", OpNode, avx512vl_i8_info, prd>;
}
multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w,
@@ -6598,3 +7187,332 @@ def : Pat<(xor
(bc_v8i64 (v8i1sextv8i64)),
(bc_v8i64 (add (v8i64 VR512:$src), (v8i1sextv8i64)))),
(VPABSQZrr VR512:$src)>;
+
+multiclass avx512_ctlz<bits<8> opc, string OpcodeStr, Predicate prd>{
+
+ defm NAME : avx512_unary_rm_vl_dq<opc, opc, OpcodeStr, ctlz, prd>;
+}
+
+defm VPLZCNT : avx512_ctlz<0x44, "vplzcnt", HasCDI>;
+defm VPCONFLICT : avx512_unary_rm_vl_dq<0xC4, 0xC4, "vpconflict", X86Conflict, HasCDI>;
+
+//===---------------------------------------------------------------------===//
+// Replicate Single FP - MOVSHDUP and MOVSLDUP
+//===---------------------------------------------------------------------===//
+multiclass avx512_replicate<bits<8> opc, string OpcodeStr, SDNode OpNode>{
+ defm NAME: avx512_unary_rm_vl<opc, OpcodeStr, OpNode, avx512vl_f32_info,
+ HasAVX512>, XS;
+}
+
+defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup>;
+defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - MOVDDUP
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src), OpcodeStr, "$src", "$src",
+ (_.VT (OpNode (_.VT _.RC:$src)))>, EVEX;
+ let mayLoad = 1 in
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
+ (_.VT (OpNode (_.VT (scalar_to_vector
+ (_.ScalarLdFrag addr:$src)))))>,
+ EVEX, EVEX_CD8<_.EltSize, CD8VH>;
+}
+
+multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo> {
+
+ defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512;
+
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info256>,
+ EVEX_V256;
+ defm Z128 : avx512_movddup_128<opc, OpcodeStr, OpNode, VTInfo.info128>,
+ EVEX_V128;
+ }
+}
+
+multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode>{
+ defm NAME: avx512_movddup_common<opc, OpcodeStr, OpNode,
+ avx512vl_f64_info>, XD, VEX_W;
+}
+
+defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup>;
+
+def : Pat<(X86Movddup (loadv2f64 addr:$src)),
+ (VMOVDDUPZ128rm addr:$src)>, Requires<[HasAVX512, HasVLX]>;
+def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ (VMOVDDUPZ128rm addr:$src)>, Requires<[HasAVX512, HasVLX]>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Unpack Instructions
+//===----------------------------------------------------------------------===//
+defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh>;
+defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl>;
+
+defm VPUNPCKLBW : avx512_binop_rm_vl_b<0x60, "vpunpcklbw", X86Unpckl,
+ SSE_INTALU_ITINS_P, HasBWI>;
+defm VPUNPCKHBW : avx512_binop_rm_vl_b<0x68, "vpunpckhbw", X86Unpckh,
+ SSE_INTALU_ITINS_P, HasBWI>;
+defm VPUNPCKLWD : avx512_binop_rm_vl_w<0x61, "vpunpcklwd", X86Unpckl,
+ SSE_INTALU_ITINS_P, HasBWI>;
+defm VPUNPCKHWD : avx512_binop_rm_vl_w<0x69, "vpunpckhwd", X86Unpckh,
+ SSE_INTALU_ITINS_P, HasBWI>;
+
+defm VPUNPCKLDQ : avx512_binop_rm_vl_d<0x62, "vpunpckldq", X86Unpckl,
+ SSE_INTALU_ITINS_P, HasAVX512>;
+defm VPUNPCKHDQ : avx512_binop_rm_vl_d<0x6A, "vpunpckhdq", X86Unpckh,
+ SSE_INTALU_ITINS_P, HasAVX512>;
+defm VPUNPCKLQDQ : avx512_binop_rm_vl_q<0x6C, "vpunpcklqdq", X86Unpckl,
+ SSE_INTALU_ITINS_P, HasAVX512>;
+defm VPUNPCKHQDQ : avx512_binop_rm_vl_q<0x6D, "vpunpckhqdq", X86Unpckh,
+ SSE_INTALU_ITINS_P, HasAVX512>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Extract & Insert Integer Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_extract_elt_bw_m<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ let mayStore = 1 in
+ def mr : AVX512Ii8<opc, MRMDestMem, (outs),
+ (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(store (_.EltVT (trunc (assertzext (OpNode (_.VT _.RC:$src1),
+ imm:$src2)))),
+ addr:$dst)]>,
+ EVEX, EVEX_CD8<_.EltSize, CD8VT1>;
+}
+
+multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> {
+ let Predicates = [HasBWI] in {
+ def rr : AVX512Ii8<0x14, MRMDestReg, (outs GR32orGR64:$dst),
+ (ins _.RC:$src1, u8imm:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32orGR64:$dst,
+ (X86pextrb (_.VT _.RC:$src1), imm:$src2))]>,
+ EVEX, TAPD;
+
+ defm NAME : avx512_extract_elt_bw_m<0x14, OpcodeStr, X86pextrb, _>, TAPD;
+ }
+}
+
+multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> {
+ let Predicates = [HasBWI] in {
+ def rr : AVX512Ii8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst),
+ (ins _.RC:$src1, u8imm:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32orGR64:$dst,
+ (X86pextrw (_.VT _.RC:$src1), imm:$src2))]>,
+ EVEX, PD;
+
+ def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst),
+ (ins _.RC:$src1, u8imm:$src2),
+ OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ EVEX, TAPD;
+
+ defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TAPD;
+ }
+}
+
+multiclass avx512_extract_elt_dq<string OpcodeStr, X86VectorVTInfo _,
+ RegisterClass GRC> {
+ let Predicates = [HasDQI] in {
+ def rr : AVX512Ii8<0x16, MRMDestReg, (outs GRC:$dst),
+ (ins _.RC:$src1, u8imm:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GRC:$dst,
+ (extractelt (_.VT _.RC:$src1), imm:$src2))]>,
+ EVEX, TAPD;
+
+ let mayStore = 1 in
+ def mr : AVX512Ii8<0x16, MRMDestMem, (outs),
+ (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(store (extractelt (_.VT _.RC:$src1),
+ imm:$src2),addr:$dst)]>,
+ EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TAPD;
+ }
+}
+
+defm VPEXTRBZ : avx512_extract_elt_b<"vpextrb", v16i8x_info>;
+defm VPEXTRWZ : avx512_extract_elt_w<"vpextrw", v8i16x_info>;
+defm VPEXTRDZ : avx512_extract_elt_dq<"vpextrd", v4i32x_info, GR32>;
+defm VPEXTRQZ : avx512_extract_elt_dq<"vpextrq", v2i64x_info, GR64>, VEX_W;
+
+multiclass avx512_insert_elt_m<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, PatFrag LdFrag> {
+ def rm : AVX512Ii8<opc, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
+ OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set _.RC:$dst,
+ (_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), imm:$src3)))]>,
+ EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
+}
+
+multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, PatFrag LdFrag> {
+ let Predicates = [HasBWI] in {
+ def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.RC:$src1, GR32orGR64:$src2, u8imm:$src3),
+ OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set _.RC:$dst,
+ (OpNode _.RC:$src1, GR32orGR64:$src2, imm:$src3))]>, EVEX_4V;
+
+ defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag>;
+ }
+}
+
+multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _, RegisterClass GRC> {
+ let Predicates = [HasDQI] in {
+ def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.RC:$src1, GRC:$src2, u8imm:$src3),
+ OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set _.RC:$dst,
+ (_.VT (insertelt _.RC:$src1, GRC:$src2, imm:$src3)))]>,
+ EVEX_4V, TAPD;
+
+ defm NAME : avx512_insert_elt_m<opc, OpcodeStr, insertelt, _,
+ _.ScalarLdFrag>, TAPD;
+ }
+}
+
+defm VPINSRBZ : avx512_insert_elt_bw<0x20, "vpinsrb", X86pinsrb, v16i8x_info,
+ extloadi8>, TAPD;
+defm VPINSRWZ : avx512_insert_elt_bw<0xC4, "vpinsrw", X86pinsrw, v8i16x_info,
+ extloadi16>, PD;
+defm VPINSRDZ : avx512_insert_elt_dq<0x22, "vpinsrd", v4i32x_info, GR32>;
+defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", v2i64x_info, GR64>, VEX_W;
+//===----------------------------------------------------------------------===//
+// VSHUFPS - VSHUFPD Operations
+//===----------------------------------------------------------------------===//
+multiclass avx512_shufp<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I,
+ AVX512VLVectorVTInfo VTInfo_FP>{
+ defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0xC6, X86Shufp>,
+ EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>,
+ AVX512AIi8Base, EVEX_4V;
+}
+
+defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_i32_info, avx512vl_f32_info>, PS;
+defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_i64_info, avx512vl_f64_info>, PD, VEX_W;
+//===----------------------------------------------------------------------===//
+// AVX-512 - Byte shift Left/Right
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_shift_packed<bits<8> opc, SDNode OpNode, Format MRMr,
+ Format MRMm, string OpcodeStr, X86VectorVTInfo _>{
+ def rr : AVX512<opc, MRMr,
+ (outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>;
+ let mayLoad = 1 in
+ def rm : AVX512<opc, MRMm,
+ (outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.RC:$dst,(_.VT (OpNode
+ (_.LdFrag addr:$src1), (i8 imm:$src2))))]>;
+}
+
+multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr,
+ Format MRMm, string OpcodeStr, Predicate prd>{
+ let Predicates = [prd] in
+ defm Z512 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
+ OpcodeStr, v8i64_info>, EVEX_V512;
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
+ OpcodeStr, v4i64x_info>, EVEX_V256;
+ defm Z128 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
+ OpcodeStr, v2i64x_info>, EVEX_V128;
+ }
+}
+defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq",
+ HasBWI>, AVX512PDIi8Base, EVEX_4V;
+defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq",
+ HasBWI>, AVX512PDIi8Base, EVEX_4V;
+
+
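+// VPSADBW: sums of absolute differences of packed bytes, accumulated into
+// 64-bit elements (i8 source vectors, i64 destination vectors).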
+multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode,
+ string OpcodeStr, X86VectorVTInfo _dst,
+ X86VectorVTInfo _src>{
+ def rr : AVX512BI<opc, MRMSrcReg,
+ (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _dst.RC:$dst,(_dst.VT
+ (OpNode (_src.VT _src.RC:$src1),
+ (_src.VT _src.RC:$src2))))]>;
+ let mayLoad = 1 in
+ def rm : AVX512BI<opc, MRMSrcMem,
+ (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.MemOp:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _dst.RC:$dst,(_dst.VT
+ (OpNode (_src.VT _src.RC:$src1),
+ (_src.VT (bitconvert
+ (_src.LdFrag addr:$src2))))))]>;
+}
+
+multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode,
+ string OpcodeStr, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z512 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v8i64_info,
+ v64i8_info>, EVEX_V512;
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v4i64x_info,
+ v32i8x_info>, EVEX_V256;
+ defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v2i64x_info,
+ v16i8x_info>, EVEX_V128;
+ }
+}
+
+defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw",
+ HasBWI>, EVEX_4V;
+
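+// VPTERNLOG{D,Q}: bitwise ternary logic of three vector operands, selected by
+// an 8-bit immediate truth table; $src1 is tied to $dst.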
+multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _>{
+ let Constraints = "$src1 = $dst" in {
+ defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, u8imm:$src4),
+ OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_.VT _.RC:$src3),
+ (i8 imm:$src4))>, AVX512AIi8Base, EVEX_4V;
+ let mayLoad = 1 in {
+ defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4),
+ OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src3",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_.VT (bitconvert (_.LdFrag addr:$src3))),
+ (i8 imm:$src4))>,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+ defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4),
+ OpcodeStr, "$src4, ${src3}"##_.BroadcastStr##", $src2",
+ "$src2, ${src3}"##_.BroadcastStr##", $src4",
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
+ (i8 imm:$src4))>, EVEX_B,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+ }
+ }// Constraints = "$src1 = $dst"
+}
+
+multiclass avx512_common_ternlog<string OpcodeStr, AVX512VLVectorVTInfo _>{
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info512>, EVEX_V512;
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z128 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info128>, EVEX_V128;
+ defm Z256 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info256>, EVEX_V256;
+ }
+}
+
+defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", avx512vl_i32_info>;
+defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", avx512vl_i64_info>, VEX_W;
+
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index 5e19ad448fc7..1a2e786661e9 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -615,14 +615,14 @@ class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass,
def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">;
-def Xi8 : X86TypeInfo<i8 , "b", GR8 , loadi8 , i8mem ,
- Imm8 , i8imm , imm, i8imm , invalid_node,
+def Xi8 : X86TypeInfo<i8, "b", GR8, loadi8, i8mem,
+ Imm8, i8imm, imm8_su, i8imm, invalid_node,
0, OpSizeFixed, 0>;
def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem,
- Imm16, i16imm, imm, i16i8imm, i16immSExt8,
+ Imm16, i16imm, imm16_su, i16i8imm, i16immSExt8_su,
1, OpSize16, 0>;
def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem,
- Imm32, i32imm, imm, i32i8imm, i32immSExt8,
+ Imm32, i32imm, imm32_su, i32i8imm, i32immSExt8_su,
1, OpSize32, 0>;
def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem,
Imm32S, i64i32imm, i64immSExt32, i64i8imm, i64immSExt8,
@@ -928,15 +928,22 @@ class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
let hasSideEffects = 0;
}
-// BinOpAI_FF - Instructions like "adc %eax, %eax, imm", that implicitly define
+// BinOpAI_RFF - Instructions like "adc %eax, %eax, imm", that implicitly define
// and use EFLAGS.
-class BinOpAI_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- Register areg, string operands>
+class BinOpAI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ Register areg, string operands>
: BinOpAI<opcode, mnemonic, typeinfo, areg, operands,
IIC_BIN_CARRY_NONMEM> {
let Uses = [areg, EFLAGS];
}
+// BinOpAI_F - Instructions like "cmp %eax, %eax, imm", that imp-def EFLAGS.
+class BinOpAI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ Register areg, string operands>
+ : BinOpAI<opcode, mnemonic, typeinfo, areg, operands> {
+ let Defs = [EFLAGS];
+}
+
/// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is
/// defined with "(set GPR:$dst, EFLAGS, (...".
///
@@ -1092,14 +1099,14 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
}
} // Uses = [EFLAGS], Defs = [EFLAGS]
- def NAME#8i8 : BinOpAI_FF<BaseOpc4, mnemonic, Xi8 , AL,
- "{$src, %al|al, $src}">;
- def NAME#16i16 : BinOpAI_FF<BaseOpc4, mnemonic, Xi16, AX,
- "{$src, %ax|ax, $src}">;
- def NAME#32i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi32, EAX,
- "{$src, %eax|eax, $src}">;
- def NAME#64i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi64, RAX,
- "{$src, %rax|rax, $src}">;
+ def NAME#8i8 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|al, $src}">;
+ def NAME#16i16 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|ax, $src}">;
+ def NAME#32i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|eax, $src}">;
+ def NAME#64i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|rax, $src}">;
}
/// ArithBinOp_F - This is an arithmetic binary operator where the pattern is
@@ -1170,14 +1177,14 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
}
} // Defs = [EFLAGS]
- def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
- "{$src, %al|al, $src}">;
- def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
- "{$src, %ax|ax, $src}">;
- def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
- "{$src, %eax|eax, $src}">;
- def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
- "{$src, %rax|rax, $src}">;
+ def NAME#8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|al, $src}">;
+ def NAME#16i16 : BinOpAI_F<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|ax, $src}">;
+ def NAME#32i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|eax, $src}">;
+ def NAME#64i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|rax, $src}">;
}
@@ -1246,14 +1253,14 @@ let isCompare = 1 in {
"", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>;
} // Defs = [EFLAGS]
- def TEST8i8 : BinOpAI<0xA8, "test", Xi8 , AL,
- "{$src, %al|al, $src}">;
- def TEST16i16 : BinOpAI<0xA8, "test", Xi16, AX,
- "{$src, %ax|ax, $src}">;
- def TEST32i32 : BinOpAI<0xA8, "test", Xi32, EAX,
- "{$src, %eax|eax, $src}">;
- def TEST64i32 : BinOpAI<0xA8, "test", Xi64, RAX,
- "{$src, %rax|rax, $src}">;
+ def TEST8i8 : BinOpAI_F<0xA8, "test", Xi8 , AL,
+ "{$src, %al|al, $src}">;
+ def TEST16i16 : BinOpAI_F<0xA8, "test", Xi16, AX,
+ "{$src, %ax|ax, $src}">;
+ def TEST32i32 : BinOpAI_F<0xA8, "test", Xi32, EAX,
+ "{$src, %eax|eax, $src}">;
+ def TEST64i32 : BinOpAI_F<0xA8, "test", Xi64, RAX,
+ "{$src, %rax|rax, $src}">;
} // isCompare
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h
index 2056056d23a5..787f15bc628e 100644
--- a/lib/Target/X86/X86InstrBuilder.h
+++ b/lib/Target/X86/X86InstrBuilder.h
@@ -156,10 +156,9 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
Flags |= MachineMemOperand::MOLoad;
if (MCID.mayStore())
Flags |= MachineMemOperand::MOStore;
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI, Offset),
- Flags, MFI.getObjectSize(FI),
- MFI.getObjectAlignment(FI));
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FI, Offset), Flags,
+ MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
return addOffset(MIB.addFrameIndex(FI), Offset)
.addMemOperand(MMO);
}
diff --git a/lib/Target/X86/X86InstrCMovSetCC.td b/lib/Target/X86/X86InstrCMovSetCC.td
index 315f21308c0d..c73c95019f8d 100644
--- a/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/lib/Target/X86/X86InstrCMovSetCC.td
@@ -13,7 +13,7 @@
//===----------------------------------------------------------------------===//
-// SetCC instructions.
+// CMOV instructions.
multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> {
let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
isCommutable = 1, SchedRW = [WriteALU] in {
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 7f850d6830e1..5d7283f7bd57 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -132,26 +132,6 @@ def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
Requires<[In64BitMode]>;
}
-// The MSVC runtime contains an _ftol2 routine for converting floating-point
-// to integer values. It has a strange calling convention: the input is
-// popped from the x87 stack, and the return value is given in EDX:EAX. ECX is
-// used as a temporary register. No other registers (aside from flags) are
-// touched.
-// Microsoft toolchains do not support 80-bit precision, so a WIN_FTOL_80
-// variant is unnecessary.
-
-let Defs = [EAX, EDX, ECX, EFLAGS], FPForm = SpecialFP in {
- def WIN_FTOL_32 : I<0, Pseudo, (outs), (ins RFP32:$src),
- "# win32 fptoui",
- [(X86WinFTOL RFP32:$src)]>,
- Requires<[Not64BitMode]>;
-
- def WIN_FTOL_64 : I<0, Pseudo, (outs), (ins RFP64:$src),
- "# win32 fptoui",
- [(X86WinFTOL RFP64:$src)]>,
- Requires<[Not64BitMode]>;
-}
-
//===----------------------------------------------------------------------===//
// EH Pseudo Instructions
//
@@ -172,6 +152,29 @@ def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr),
}
+let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
+ isCodeGenOnly = 1, isReturn = 1 in {
+ def CLEANUPRET : I<0, Pseudo, (outs), (ins), "# CLEANUPRET", [(cleanupret)]>;
+
+ // CATCHRET needs a custom inserter for SEH.
+ let usesCustomInserter = 1 in
+ def CATCHRET : I<0, Pseudo, (outs), (ins brtarget32:$dst, brtarget32:$from),
+ "# CATCHRET",
+ [(catchret bb:$dst, bb:$from)]>;
+}
+
+let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1,
+ usesCustomInserter = 1 in
+def CATCHPAD : I<0, Pseudo, (outs), (ins), "# CATCHPAD", [(catchpad)]>;
+
+// This instruction is responsible for re-establishing stack pointers after an
+// exception has been caught and we are rejoining normal control flow in the
+// parent function or funclet. It generally sets ESP and EBP, and optionally
+// ESI. It is only needed for 32-bit WinEH, as the runtime restores CSRs for us
+// elsewhere.
+let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1 in
+def EH_RESTORE : I<0, Pseudo, (outs), (ins), "# EH_RESTORE", []>;
+
let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
usesCustomInserter = 1 in {
def EH_SjLj_SetJmp32 : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf),
@@ -247,7 +250,7 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
// Alias instruction mapping movr0 to xor.
// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
- isPseudo = 1 in
+ isPseudo = 1, AddedComplexity = 20 in
def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
[(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
@@ -259,6 +262,33 @@ def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> {
let AddedComplexity = 20;
}
+let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode],
+ AddedComplexity = 15 in {
+ // Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC,
+ // which only require 3 bytes compared to MOV32ri which requires 5.
+ let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in {
+ def MOV32r1 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+ [(set GR32:$dst, 1)]>;
+ def MOV32r_1 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+ [(set GR32:$dst, -1)]>;
+ }
+
+ // MOV16ri is 4 bytes, so the instructions above are smaller.
+ def : Pat<(i16 1), (EXTRACT_SUBREG (MOV32r1), sub_16bit)>;
+ def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>;
+}
+
+let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 10 in {
+// AddedComplexity higher than MOV64ri but lower than MOV32r0 and MOV32r1.
+// FIXME: Add itinerary class and Schedule.
+def MOV32ImmSExti8 : I<0, Pseudo, (outs GR32:$dst), (ins i32i8imm:$src), "",
+ [(set GR32:$dst, i32immSExt8:$src)]>,
+ Requires<[OptForMinSize]>;
+def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "",
+ [(set GR64:$dst, i64immSExt8:$src)]>,
+ Requires<[OptForMinSize, NotWin64WithoutFP]>;
+}
+
// Materialize i64 constant where top 32-bits are zero. This could theoretically
// use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however
// that would make it more difficult to rematerialize.
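
(Illustration, not from this patch: the byte counts cited in the MOV32r1 /
MOV32r_1 comment above come from the standard x86 encodings. Sizes only; this
listing is not meant to be assembled.)

    MOV32ri :  mov  eax, 1    = B8 01 00 00 00   -> 5 bytes
    MOV16ri :  mov  ax, 1     = 66 B8 01 00      -> 4 bytes
    pseudo  :  xor  eax, eax  = 31 C0
               inc  eax       = 40               -> 3 bytes total (32-bit mode
                                                    only; 0x40 is a REX prefix in
                                                    64-bit mode, which is why the
                                                    pseudos require Not64BitMode)
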
@@ -268,9 +298,9 @@ def MOV32ri64 : Ii32<0xb8, AddRegFrm, (outs GR32:$dst), (ins i64i32imm:$src),
"", [], IIC_ALU_NONMEM>, Sched<[WriteALU]>;
// This 64-bit pseudo-move can be used for both a 64-bit constant that is
-// actually the zero-extension of a 32-bit constant, and for labels in the
+// actually the zero-extension of a 32-bit constant and for labels in the
// x86-64 small code model.
-def mov64imm32 : ComplexPattern<i64, 1, "SelectMOV64Imm32", [imm, X86Wrapper]>;
+def mov64imm32 : ComplexPattern<i64, 1, "selectMOV64Imm32", [imm, X86Wrapper]>;
let AddedComplexity = 1 in
def : Pat<(i64 mov64imm32:$src),
@@ -509,6 +539,7 @@ let usesCustomInserter = 1, Uses = [EFLAGS] in {
defm _FR32 : CMOVrr_PSEUDO<FR32, f32>;
defm _FR64 : CMOVrr_PSEUDO<FR64, f64>;
+ defm _FR128 : CMOVrr_PSEUDO<FR128, f128>;
defm _V4F32 : CMOVrr_PSEUDO<VR128, v4f32>;
defm _V2F64 : CMOVrr_PSEUDO<VR128, v2f64>;
defm _V2I64 : CMOVrr_PSEUDO<VR128, v2i64>;
@@ -752,67 +783,111 @@ defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add",
/* The following multiclass tries to make sure that in code like
* x.store (immediate op x.load(acquire), release)
+ * and
+ * x.store (register op x.load(acquire), release)
* an operation directly on memory is generated instead of wasting a register.
* It is not automatic as atomic_store/load are only lowered to MOV instructions
* extremely late to prevent them from being accidentally reordered in the backend
* (see below the RELEASE_MOV* / ACQUIRE_MOV* pseudo-instructions)
*/
-multiclass RELEASE_BINOP_MI<string op> {
+multiclass RELEASE_BINOP_MI<SDNode op> {
def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
- "#RELEASE_BINOP PSEUDO!",
- [(atomic_store_8 addr:$dst, (!cast<PatFrag>(op)
+ "#BINOP "#NAME#"8mi PSEUDO!",
+ [(atomic_store_8 addr:$dst, (op
(atomic_load_8 addr:$dst), (i8 imm:$src)))]>;
+ def NAME#8mr : I<0, Pseudo, (outs), (ins i8mem:$dst, GR8:$src),
+ "#BINOP "#NAME#"8mr PSEUDO!",
+ [(atomic_store_8 addr:$dst, (op
+ (atomic_load_8 addr:$dst), GR8:$src))]>;
// NAME#16 is not generated as 16-bit arithmetic instructions are considered
// costly and avoided as far as possible by this backend anyway
def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
- "#RELEASE_BINOP PSEUDO!",
- [(atomic_store_32 addr:$dst, (!cast<PatFrag>(op)
+ "#BINOP "#NAME#"32mi PSEUDO!",
+ [(atomic_store_32 addr:$dst, (op
(atomic_load_32 addr:$dst), (i32 imm:$src)))]>;
+ def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src),
+ "#BINOP "#NAME#"32mr PSEUDO!",
+ [(atomic_store_32 addr:$dst, (op
+ (atomic_load_32 addr:$dst), GR32:$src))]>;
def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
- "#RELEASE_BINOP PSEUDO!",
- [(atomic_store_64 addr:$dst, (!cast<PatFrag>(op)
+ "#BINOP "#NAME#"64mi32 PSEUDO!",
+ [(atomic_store_64 addr:$dst, (op
(atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>;
+ def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src),
+ "#BINOP "#NAME#"64mr PSEUDO!",
+ [(atomic_store_64 addr:$dst, (op
+ (atomic_load_64 addr:$dst), GR64:$src))]>;
+}
+let Defs = [EFLAGS] in {
+ defm RELEASE_ADD : RELEASE_BINOP_MI<add>;
+ defm RELEASE_AND : RELEASE_BINOP_MI<and>;
+ defm RELEASE_OR : RELEASE_BINOP_MI<or>;
+ defm RELEASE_XOR : RELEASE_BINOP_MI<xor>;
+ // Note: we don't deal with sub, because subtractions of constants are
+ // optimized into additions before this code can run.
+}
+
+// Same as above, but for floating-point.
+// FIXME: imm version.
+// FIXME: Version that doesn't clobber $src, using AVX's VADDSS.
+// FIXME: This could also handle SIMD operations with *ps and *pd instructions.
+let usesCustomInserter = 1 in {
+multiclass RELEASE_FP_BINOP_MI<SDNode op> {
+ def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, FR32:$src),
+ "#BINOP "#NAME#"32mr PSEUDO!",
+ [(atomic_store_32 addr:$dst,
+ (i32 (bitconvert (op
+ (f32 (bitconvert (i32 (atomic_load_32 addr:$dst)))),
+ FR32:$src))))]>, Requires<[HasSSE1]>;
+ def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, FR64:$src),
+ "#BINOP "#NAME#"64mr PSEUDO!",
+ [(atomic_store_64 addr:$dst,
+ (i64 (bitconvert (op
+ (f64 (bitconvert (i64 (atomic_load_64 addr:$dst)))),
+ FR64:$src))))]>, Requires<[HasSSE2]>;
+}
+defm RELEASE_FADD : RELEASE_FP_BINOP_MI<fadd>;
+// FIXME: Add fsub, fmul, fdiv, ...
}
-defm RELEASE_ADD : RELEASE_BINOP_MI<"add">;
-defm RELEASE_AND : RELEASE_BINOP_MI<"and">;
-defm RELEASE_OR : RELEASE_BINOP_MI<"or">;
-defm RELEASE_XOR : RELEASE_BINOP_MI<"xor">;
-// Note: we don't deal with sub, because substractions of constants are
-// optimized into additions before this code can run
multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> {
def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst),
- "#RELEASE_UNOP PSEUDO!",
+ "#UNOP "#NAME#"8m PSEUDO!",
[(atomic_store_8 addr:$dst, dag8)]>;
def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst),
- "#RELEASE_UNOP PSEUDO!",
+ "#UNOP "#NAME#"16m PSEUDO!",
[(atomic_store_16 addr:$dst, dag16)]>;
def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst),
- "#RELEASE_UNOP PSEUDO!",
+ "#UNOP "#NAME#"32m PSEUDO!",
[(atomic_store_32 addr:$dst, dag32)]>;
def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst),
- "#RELEASE_UNOP PSEUDO!",
+ "#UNOP "#NAME#"64m PSEUDO!",
[(atomic_store_64 addr:$dst, dag64)]>;
}
-defm RELEASE_INC : RELEASE_UNOP<
- (add (atomic_load_8 addr:$dst), (i8 1)),
- (add (atomic_load_16 addr:$dst), (i16 1)),
- (add (atomic_load_32 addr:$dst), (i32 1)),
- (add (atomic_load_64 addr:$dst), (i64 1))>, Requires<[NotSlowIncDec]>;
-defm RELEASE_DEC : RELEASE_UNOP<
- (add (atomic_load_8 addr:$dst), (i8 -1)),
- (add (atomic_load_16 addr:$dst), (i16 -1)),
- (add (atomic_load_32 addr:$dst), (i32 -1)),
- (add (atomic_load_64 addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>;
+let Defs = [EFLAGS] in {
+ defm RELEASE_INC : RELEASE_UNOP<
+ (add (atomic_load_8 addr:$dst), (i8 1)),
+ (add (atomic_load_16 addr:$dst), (i16 1)),
+ (add (atomic_load_32 addr:$dst), (i32 1)),
+ (add (atomic_load_64 addr:$dst), (i64 1))>, Requires<[NotSlowIncDec]>;
+ defm RELEASE_DEC : RELEASE_UNOP<
+ (add (atomic_load_8 addr:$dst), (i8 -1)),
+ (add (atomic_load_16 addr:$dst), (i16 -1)),
+ (add (atomic_load_32 addr:$dst), (i32 -1)),
+ (add (atomic_load_64 addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>;
+}
/*
TODO: These don't work because the type inference of TableGen fails.
TODO: find a way to fix it.
-defm RELEASE_NEG : RELEASE_UNOP<
- (ineg (atomic_load_8 addr:$dst)),
- (ineg (atomic_load_16 addr:$dst)),
- (ineg (atomic_load_32 addr:$dst)),
- (ineg (atomic_load_64 addr:$dst))>;
+let Defs = [EFLAGS] in {
+ defm RELEASE_NEG : RELEASE_UNOP<
+ (ineg (atomic_load_8 addr:$dst)),
+ (ineg (atomic_load_16 addr:$dst)),
+ (ineg (atomic_load_32 addr:$dst)),
+ (ineg (atomic_load_64 addr:$dst))>;
+}
+// NOT doesn't set flags.
defm RELEASE_NOT : RELEASE_UNOP<
(not (atomic_load_8 addr:$dst)),
(not (atomic_load_16 addr:$dst)),
@@ -821,42 +896,42 @@ defm RELEASE_NOT : RELEASE_UNOP<
*/
def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
- "#RELEASE_MOV PSEUDO !",
+ "#RELEASE_MOV8mi PSEUDO!",
[(atomic_store_8 addr:$dst, (i8 imm:$src))]>;
def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src),
- "#RELEASE_MOV PSEUDO !",
+ "#RELEASE_MOV16mi PSEUDO!",
[(atomic_store_16 addr:$dst, (i16 imm:$src))]>;
def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
- "#RELEASE_MOV PSEUDO !",
+ "#RELEASE_MOV32mi PSEUDO!",
[(atomic_store_32 addr:$dst, (i32 imm:$src))]>;
def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
- "#RELEASE_MOV PSEUDO !",
+ "#RELEASE_MOV64mi32 PSEUDO!",
[(atomic_store_64 addr:$dst, i64immSExt32:$src)]>;
def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src),
- "#RELEASE_MOV PSEUDO!",
+ "#RELEASE_MOV8mr PSEUDO!",
[(atomic_store_8 addr:$dst, GR8 :$src)]>;
def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src),
- "#RELEASE_MOV PSEUDO!",
+ "#RELEASE_MOV16mr PSEUDO!",
[(atomic_store_16 addr:$dst, GR16:$src)]>;
def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src),
- "#RELEASE_MOV PSEUDO!",
+ "#RELEASE_MOV32mr PSEUDO!",
[(atomic_store_32 addr:$dst, GR32:$src)]>;
def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src),
- "#RELEASE_MOV PSEUDO!",
+ "#RELEASE_MOV64mr PSEUDO!",
[(atomic_store_64 addr:$dst, GR64:$src)]>;
def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src),
- "#ACQUIRE_MOV PSEUDO!",
+ "#ACQUIRE_MOV8rm PSEUDO!",
[(set GR8:$dst, (atomic_load_8 addr:$src))]>;
def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src),
- "#ACQUIRE_MOV PSEUDO!",
+ "#ACQUIRE_MOV16rm PSEUDO!",
[(set GR16:$dst, (atomic_load_16 addr:$src))]>;
def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src),
- "#ACQUIRE_MOV PSEUDO!",
+ "#ACQUIRE_MOV32rm PSEUDO!",
[(set GR32:$dst, (atomic_load_32 addr:$src))]>;
def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src),
- "#ACQUIRE_MOV PSEUDO!",
+ "#ACQUIRE_MOV64rm PSEUDO!",
[(set GR64:$dst, (atomic_load_64 addr:$src))]>;
//===----------------------------------------------------------------------===//
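
The RELEASE_*/ACQUIRE_* pseudos above target source shapes like the following
(an illustrative C++ sketch, not from this patch; only standard std::atomic is
used). The goal stated in the multiclass comment is to select a single
memory-destination instruction instead of a separate load, op and store:

    #include <atomic>

    void bump(std::atomic<int> &x) {
      // ideally becomes `addl $5, (mem)` via RELEASE_ADD32mi
      x.store(x.load(std::memory_order_acquire) + 5, std::memory_order_release);
    }

    void inc(std::atomic<int> &x) {
      // ideally becomes `incl (mem)` via RELEASE_INC32m (NotSlowIncDec)
      x.store(x.load(std::memory_order_acquire) + 1, std::memory_order_release);
    }
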
@@ -1077,11 +1152,11 @@ defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>;
// zextload bool -> zextload byte
def : Pat<(zextloadi8i1 addr:$src), (AND8ri (MOV8rm addr:$src), (i8 1))>;
-def : Pat<(zextloadi16i1 addr:$src), (AND16ri (MOVZX16rm8 addr:$src), (i16 1))>;
-def : Pat<(zextloadi32i1 addr:$src), (AND32ri (MOVZX32rm8 addr:$src), (i32 1))>;
+def : Pat<(zextloadi16i1 addr:$src), (AND16ri8 (MOVZX16rm8 addr:$src), (i16 1))>;
+def : Pat<(zextloadi32i1 addr:$src), (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1))>;
def : Pat<(zextloadi64i1 addr:$src),
(SUBREG_TO_REG (i64 0),
- (AND32ri (MOVZX32rm8 addr:$src), (i32 1)), sub_32bit)>;
+ (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1)), sub_32bit)>;
// extload bool -> extload byte
// When extloading from 16-bit and smaller memory locations into 64-bit
@@ -1298,7 +1373,6 @@ def : Pat<(and GR64:$src, 0x00000000FFFFFFFF),
(MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)),
sub_32bit)>;
// r & (2^16-1) ==> movz
-let AddedComplexity = 1 in // Give priority over i64immZExt32.
def : Pat<(and GR64:$src, 0xffff),
(SUBREG_TO_REG (i64 0),
(MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))),
diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td
index 4cd5563ce727..8c351a51c460 100644
--- a/lib/Target/X86/X86InstrControl.td
+++ b/lib/Target/X86/X86InstrControl.td
@@ -53,6 +53,19 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1,
"{l}ret{|f}q\t$amt", [], IIC_RET>, Requires<[In64BitMode]>;
def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
"{l}ret{w|f}\t$amt", [], IIC_RET>, OpSize16;
+
+ // The machine return-from-interrupt instructions; sometimes, however, we
+ // need to perform a post-epilogue stack adjustment, so codegen emits the
+ // pseudo form, which expands to include an SP adjustment if necessary.
+ def IRET16 : I <0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>,
+ OpSize16;
+ def IRET32 : I <0xcf, RawFrm, (outs), (ins), "iret{l|d}", [],
+ IIC_IRET>, OpSize32;
+ def IRET64 : RI <0xcf, RawFrm, (outs), (ins), "iretq", [],
+ IIC_IRET>, Requires<[In64BitMode]>;
+ let isCodeGenOnly = 1 in
+ def IRET : PseudoI<(outs), (ins i16imm:$adj), [(X86iret timm:$adj)]>;
+
}
// Unconditional branches.
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index 7cc3b599a737..fd800cf077f7 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -15,13 +15,31 @@
// FMA3 - Intel 3 operand Fused Multiply-Add instructions
//===----------------------------------------------------------------------===//
-let Constraints = "$src1 = $dst" in {
+// For all FMA opcodes declared in the fma3p_rm and fma3s_rm multiclasses defined
+// below, both the register and memory variants are commutable.
+// For the register form the commutable operands are 1, 2 and 3.
+// For the memory variant the folded operand must be operand 3. Thus,
+// in that case, only operands 1 and 2 can be swapped.
+// Commuting some of the operands may require an opcode change.
+// FMA*213*:
+// operands 1 and 2 (memory & register forms): *213* --> *213*(no changes);
+// operands 1 and 3 (register forms only): *213* --> *231*;
+// operands 2 and 3 (register forms only): *213* --> *132*.
+// FMA*132*:
+// operands 1 and 2 (memory & register forms): *132* --> *231*;
+// operands 1 and 3 (register forms only): *132* --> *132*(no changes);
+// operands 2 and 3 (register forms only): *132* --> *213*.
+// FMA*231*:
+// operands 1 and 2 (memory & register forms): *231* --> *132*;
+// operands 1 and 3 (register forms only): *231* --> *213*;
+// operands 2 and 3 (register forms only): *231* --> *231*(no changes).
+
+let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in
multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
PatFrag MemFrag128, PatFrag MemFrag256,
ValueType OpVT128, ValueType OpVT256,
- bit IsRVariantCommutable = 0, bit IsMVariantCommutable = 0,
SDPatternOperator Op = null_frag> {
- let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in
+ let usesCustomInserter = 1 in
def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
@@ -29,7 +47,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
[(set VR128:$dst, (OpVT128 (Op VR128:$src2,
VR128:$src1, VR128:$src3)))]>;
- let mayLoad = 1, isCommutable = IsMVariantCommutable in
+ let mayLoad = 1 in
def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, f128mem:$src3),
!strconcat(OpcodeStr,
@@ -37,7 +55,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
[(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1,
(MemFrag128 addr:$src3))))]>;
- let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in
+ let usesCustomInserter = 1 in
def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr,
@@ -45,7 +63,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
[(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1,
VR256:$src3)))]>, VEX_L;
- let mayLoad = 1, isCommutable = IsMVariantCommutable in
+ let mayLoad = 1 in
def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, f256mem:$src3),
!strconcat(OpcodeStr,
@@ -54,34 +72,20 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
(OpVT256 (Op VR256:$src2, VR256:$src1,
(MemFrag256 addr:$src3))))]>, VEX_L;
}
-} // Constraints = "$src1 = $dst"
multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpcodeStr, string PackTy,
PatFrag MemFrag128, PatFrag MemFrag256,
SDNode Op, ValueType OpTy128, ValueType OpTy256> {
- // For 213, both the register and memory variant are commutable.
- // Indeed, the commutable operands are 1 and 2 and both live in registers
- // for both variants.
defm r213 : fma3p_rm<opc213,
!strconcat(OpcodeStr, "213", PackTy),
- MemFrag128, MemFrag256, OpTy128, OpTy256,
- /* IsRVariantCommutable */ 1,
- /* IsMVariantCommutable */ 1,
- Op>;
-let hasSideEffects = 0 in {
+ MemFrag128, MemFrag256, OpTy128, OpTy256, Op>;
defm r132 : fma3p_rm<opc132,
!strconcat(OpcodeStr, "132", PackTy),
MemFrag128, MemFrag256, OpTy128, OpTy256>;
- // For 231, only the register variant is commutable.
- // For the memory variant the folded operand must be in 3. Thus,
- // in that case, it cannot be swapped with 2.
defm r231 : fma3p_rm<opc231,
!strconcat(OpcodeStr, "231", PackTy),
- MemFrag128, MemFrag256, OpTy128, OpTy256,
- /* IsRVariantCommutable */ 1,
- /* IsMVariantCommutable */ 0>;
-} // hasSideEffects = 0
+ MemFrag128, MemFrag256, OpTy128, OpTy256>;
}
// Fused Multiply-Add
@@ -126,83 +130,122 @@ let ExeDomain = SSEPackedDouble in {
v4f64>, VEX_W;
}
-let Constraints = "$src1 = $dst" in {
-multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop,
- RegisterClass RC, ValueType OpVT, PatFrag mem_frag,
- bit IsRVariantCommutable = 0, bit IsMVariantCommutable = 0,
+// All source register operands of FMA opcodes defined in fma3s_rm multiclass
+// can be commuted. In many cases such a commute transformation requires an
+// opcode adjustment; for example, commuting operands 1 and 2 in the FMA*132 form
+// would require an opcode change to FMA*231:
+// FMA*132* reg1, reg2, reg3; // reg1 * reg3 + reg2;
+// -->
+// FMA*231* reg2, reg1, reg3; // reg1 * reg3 + reg2;
+// Please see the more detailed comment at the very beginning of the section
+// defining FMA3 opcodes above.
+let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in
+multiclass fma3s_rm<bits<8> opc, string OpcodeStr,
+ X86MemOperand x86memop, RegisterClass RC,
SDPatternOperator OpNode = null_frag> {
- let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in
+ let usesCustomInserter = 1 in
def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set RC:$dst,
- (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;
+ [(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>;
- let mayLoad = 1, isCommutable = IsMVariantCommutable in
+ let mayLoad = 1 in
def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, RC:$src2, x86memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst,
- (OpVT (OpNode RC:$src2, RC:$src1,
- (mem_frag addr:$src3))))]>;
+ (OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>;
+}
+
+// These FMA*_Int instructions are defined specially for use when
+// the scalar FMA intrinsics are lowered to machine instructions, and in that
+// sense, they are similar to existing ADD*_Int, SUB*_Int, MUL*_Int, etc.
+// instructions.
+//
+// All of the FMA*_Int opcodes are defined as commutable here.
+// Commuting the 2nd and 3rd source register operands of FMAs is quite trivial
+// and the corresponding optimizations have been developed.
+// Commuting the 1st operand of FMA*_Int requires some additional analysis:
+// the commute optimization is legal only if all users of FMA*_Int use only
+// the lowest element of the FMA*_Int instruction. Even though such analysis
+// may not be implemented yet, we allow the routines doing the actual commute
+// transformation to decide if one or another instruction is commutable or not.
+let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1,
+ hasSideEffects = 0 in
+multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
+ Operand memopr, RegisterClass RC> {
+ def r_Int : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ []>;
+
+ let mayLoad = 1 in
+ def m_Int : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, memopr:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ []>;
}
-} // Constraints = "$src1 = $dst"
multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
- string OpStr, string PackTy, string PT2, Intrinsic Int,
- SDNode OpNode, RegisterClass RC, ValueType OpVT,
- X86MemOperand x86memop, Operand memop, PatFrag mem_frag,
- ComplexPattern mem_cpat> {
-let hasSideEffects = 0 in {
- defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
- x86memop, RC, OpVT, mem_frag>;
- // See the other defm of r231 for the explanation regarding the
- // commutable flags.
- defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
- x86memop, RC, OpVT, mem_frag,
- /* IsRVariantCommutable */ 1,
- /* IsMVariantCommutable */ 0>;
+ string OpStr, string PackTy,
+ SDNode OpNode, RegisterClass RC,
+ X86MemOperand x86memop> {
+ defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy), x86memop, RC>;
+ defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy), x86memop, RC,
+ OpNode>;
+ defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy), x86memop, RC>;
}
-// See the other defm of r213 for the explanation regarding the
-// commutable flags.
-defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
- x86memop, RC, OpVT, mem_frag,
- /* IsRVariantCommutable */ 1,
- /* IsMVariantCommutable */ 1,
- OpNode>;
+// The FMA 213 form is created for lowering of the scalar FMA intrinsics
+// to machine instructions.
+// The FMA 132 form can trivially be obtained by commuting the 2nd and 3rd
+// operands of the FMA 213 form.
+// The FMA 231 form can be obtained only by commuting the 1st operand of the 213
+// or 132 forms, which is possible only after special analysis of all uses of the
+// initial instruction. Such analysis does not exist yet, so introducing the 231
+// form of FMA*_Int instructions is done using an optimistic assumption that
+// such analysis will be implemented eventually.
+multiclass fma3s_int_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpStr, string PackTy,
+ RegisterClass RC, Operand memop> {
+ defm r132 : fma3s_rm_int<opc132, !strconcat(OpStr, "132", PackTy),
+ memop, RC>;
+ defm r213 : fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy),
+ memop, RC>;
+ defm r231 : fma3s_rm_int<opc231, !strconcat(OpStr, "231", PackTy),
+ memop, RC>;
}
multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpStr, Intrinsic IntF32, Intrinsic IntF64,
SDNode OpNode> {
- defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", IntF32, OpNode,
- FR32, f32, f32mem, ssmem, loadf32, sse_load_f32>;
- defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "PD", IntF64, OpNode,
- FR64, f64, f64mem, sdmem, loadf64, sse_load_f64>, VEX_W;
+ let ExeDomain = SSEPackedSingle in
+ defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", OpNode,
+ FR32, f32mem>,
+ fma3s_int_forms<opc132, opc213, opc231, OpStr, "ss", VR128, ssmem>;
+
+ let ExeDomain = SSEPackedDouble in
+ defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", OpNode,
+ FR64, f64mem>,
+ fma3s_int_forms<opc132, opc213, opc231, OpStr, "sd", VR128, sdmem>,
+ VEX_W;
-// These patterns use the 123 ordering, instead of 213, even though
-// they match the intrinsic to the 213 version of the instruction.
-// This is because src1 is tied to dest, and the scalar intrinsics
-// require the pass-through values to come from the first source
-// operand, not the second.
+ // These patterns use the 123 ordering, instead of 213, even though
+ // they match the intrinsic to the 213 version of the instruction.
+ // This is because src1 is tied to dest, and the scalar intrinsics
+ // require the pass-through values to come from the first source
+ // operand, not the second.
def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3),
- (COPY_TO_REGCLASS
- (!cast<Instruction>(NAME#"SSr213r")
- (COPY_TO_REGCLASS $src1, FR32),
- (COPY_TO_REGCLASS $src2, FR32),
- (COPY_TO_REGCLASS $src3, FR32)),
- VR128)>;
+ (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"SSr213r_Int")
+ $src1, $src2, $src3), VR128)>;
def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3),
- (COPY_TO_REGCLASS
- (!cast<Instruction>(NAME#"SDr213r")
- (COPY_TO_REGCLASS $src1, FR64),
- (COPY_TO_REGCLASS $src2, FR64),
- (COPY_TO_REGCLASS $src3, FR64)),
- VR128)>;
+ (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"SDr213r_Int")
+ $src1, $src2, $src3), VR128)>;
}
defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss,
@@ -334,36 +377,23 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
} // isCodeGenOnly = 1
}
-defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>,
- fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32,
- int_x86_fma_vfmadd_ss>;
-defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>,
- fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64,
- int_x86_fma_vfmadd_sd>;
-defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>,
- fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32,
- int_x86_fma_vfmsub_ss>;
-defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>,
- fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64,
- int_x86_fma_vfmsub_sd>;
-defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
- X86Fnmadd, loadf32>,
- fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32,
- int_x86_fma_vfnmadd_ss>;
-defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
- X86Fnmadd, loadf64>,
- fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64,
- int_x86_fma_vfnmadd_sd>;
-defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
- X86Fnmsub, loadf32>,
- fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32,
- int_x86_fma_vfnmsub_ss>;
-defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
- X86Fnmsub, loadf64>,
- fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64,
- int_x86_fma_vfnmsub_sd>;
-
let ExeDomain = SSEPackedSingle in {
+ // Scalar Instructions
+ defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>,
+ fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32,
+ int_x86_fma_vfmadd_ss>;
+ defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>,
+ fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32,
+ int_x86_fma_vfmsub_ss>;
+ defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
+ X86Fnmadd, loadf32>,
+ fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32,
+ int_x86_fma_vfnmadd_ss>;
+ defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
+ X86Fnmsub, loadf32>,
+ fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32,
+ int_x86_fma_vfnmsub_ss>;
+ // Packed Instructions
defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32,
loadv4f32, loadv8f32>;
defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32,
@@ -379,6 +409,22 @@ let ExeDomain = SSEPackedSingle in {
}
let ExeDomain = SSEPackedDouble in {
+ // Scalar Instructions
+ defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>,
+ fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64,
+ int_x86_fma_vfmadd_sd>;
+ defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>,
+ fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64,
+ int_x86_fma_vfmsub_sd>;
+ defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
+ X86Fnmadd, loadf64>,
+ fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64,
+ int_x86_fma_vfnmadd_sd>;
+ defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
+ X86Fnmsub, loadf64>,
+ fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64,
+ int_x86_fma_vfnmsub_sd>;
+ // Packed Instructions
defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64,
loadv2f64, loadv4f64>;
defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64,
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
index 49068e9c37d3..03ae21125b0e 100644
--- a/lib/Target/X86/X86InstrFPStack.td
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -137,69 +137,99 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP,
// The FopST0 series are not included here because of the irregularities
// in where the 'r' goes in assembly output.
// These instructions cannot address 80-bit memory.
-multiclass FPBinary<SDNode OpNode, Format fp, string asmstring> {
+multiclass FPBinary<SDNode OpNode, Format fp, string asmstring,
+ bit Forward = 1> {
// ST(0) = ST(0) + [mem]
def _Fp32m : FpIf32<(outs RFP32:$dst),
(ins RFP32:$src1, f32mem:$src2), OneArgFPRW,
- [(set RFP32:$dst,
- (OpNode RFP32:$src1, (loadf32 addr:$src2)))]>;
+ [!if(Forward,
+ (set RFP32:$dst,
+ (OpNode RFP32:$src1, (loadf32 addr:$src2))),
+ (set RFP32:$dst,
+ (OpNode (loadf32 addr:$src2), RFP32:$src1)))]>;
def _Fp64m : FpIf64<(outs RFP64:$dst),
(ins RFP64:$src1, f64mem:$src2), OneArgFPRW,
- [(set RFP64:$dst,
- (OpNode RFP64:$src1, (loadf64 addr:$src2)))]>;
+ [!if(Forward,
+ (set RFP64:$dst,
+ (OpNode RFP64:$src1, (loadf64 addr:$src2))),
+ (set RFP64:$dst,
+ (OpNode (loadf64 addr:$src2), RFP64:$src1)))]>;
def _Fp64m32: FpIf64<(outs RFP64:$dst),
(ins RFP64:$src1, f32mem:$src2), OneArgFPRW,
- [(set RFP64:$dst,
- (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2))))]>;
+ [!if(Forward,
+ (set RFP64:$dst,
+ (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2)))),
+ (set RFP64:$dst,
+ (OpNode (f64 (extloadf32 addr:$src2)), RFP64:$src1)))]>;
def _Fp80m32: FpI_<(outs RFP80:$dst),
(ins RFP80:$src1, f32mem:$src2), OneArgFPRW,
- [(set RFP80:$dst,
- (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2))))]>;
+ [!if(Forward,
+ (set RFP80:$dst,
+ (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2)))),
+ (set RFP80:$dst,
+ (OpNode (f80 (extloadf32 addr:$src2)), RFP80:$src1)))]>;
def _Fp80m64: FpI_<(outs RFP80:$dst),
(ins RFP80:$src1, f64mem:$src2), OneArgFPRW,
- [(set RFP80:$dst,
- (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2))))]>;
+ [!if(Forward,
+ (set RFP80:$dst,
+ (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2)))),
+ (set RFP80:$dst,
+ (OpNode (f80 (extloadf64 addr:$src2)), RFP80:$src1)))]>;
+let mayLoad = 1 in
def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src),
- !strconcat("f", asmstring, "{s}\t$src")> {
- let mayLoad = 1;
-}
+ !strconcat("f", asmstring, "{s}\t$src")>;
+let mayLoad = 1 in
def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src),
- !strconcat("f", asmstring, "{l}\t$src")> {
- let mayLoad = 1;
-}
+ !strconcat("f", asmstring, "{l}\t$src")>;
// ST(0) = ST(0) + [memint]
def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2),
OneArgFPRW,
- [(set RFP32:$dst, (OpNode RFP32:$src1,
- (X86fild addr:$src2, i16)))]>;
+ [!if(Forward,
+ (set RFP32:$dst,
+ (OpNode RFP32:$src1, (X86fild addr:$src2, i16))),
+ (set RFP32:$dst,
+ (OpNode (X86fild addr:$src2, i16), RFP32:$src1)))]>;
def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2),
OneArgFPRW,
- [(set RFP32:$dst, (OpNode RFP32:$src1,
- (X86fild addr:$src2, i32)))]>;
+ [!if(Forward,
+ (set RFP32:$dst,
+ (OpNode RFP32:$src1, (X86fild addr:$src2, i32))),
+ (set RFP32:$dst,
+ (OpNode (X86fild addr:$src2, i32), RFP32:$src1)))]>;
def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2),
OneArgFPRW,
- [(set RFP64:$dst, (OpNode RFP64:$src1,
- (X86fild addr:$src2, i16)))]>;
+ [!if(Forward,
+ (set RFP64:$dst,
+ (OpNode RFP64:$src1, (X86fild addr:$src2, i16))),
+ (set RFP64:$dst,
+ (OpNode (X86fild addr:$src2, i16), RFP64:$src1)))]>;
def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2),
OneArgFPRW,
- [(set RFP64:$dst, (OpNode RFP64:$src1,
- (X86fild addr:$src2, i32)))]>;
+ [!if(Forward,
+ (set RFP64:$dst,
+ (OpNode RFP64:$src1, (X86fild addr:$src2, i32))),
+ (set RFP64:$dst,
+ (OpNode (X86fild addr:$src2, i32), RFP64:$src1)))]>;
def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2),
- OneArgFPRW,
- [(set RFP80:$dst, (OpNode RFP80:$src1,
- (X86fild addr:$src2, i16)))]>;
+ OneArgFPRW,
+ [!if(Forward,
+ (set RFP80:$dst,
+ (OpNode RFP80:$src1, (X86fild addr:$src2, i16))),
+ (set RFP80:$dst,
+ (OpNode (X86fild addr:$src2, i16), RFP80:$src1)))]>;
def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2),
- OneArgFPRW,
- [(set RFP80:$dst, (OpNode RFP80:$src1,
- (X86fild addr:$src2, i32)))]>;
+ OneArgFPRW,
+ [!if(Forward,
+ (set RFP80:$dst,
+ (OpNode RFP80:$src1, (X86fild addr:$src2, i32))),
+ (set RFP80:$dst,
+ (OpNode (X86fild addr:$src2, i32), RFP80:$src1)))]>;
+let mayLoad = 1 in
def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src),
- !strconcat("fi", asmstring, "{s}\t$src")> {
- let mayLoad = 1;
-}
+ !strconcat("fi", asmstring, "{s}\t$src")>;
+let mayLoad = 1 in
def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src),
- !strconcat("fi", asmstring, "{l}\t$src")> {
- let mayLoad = 1;
-}
+ !strconcat("fi", asmstring, "{l}\t$src")>;
}
let Defs = [FPSW] in {
@@ -213,14 +243,14 @@ defm DIV : FPBinary_rr<fdiv>;
let SchedRW = [WriteFAddLd] in {
defm ADD : FPBinary<fadd, MRM0m, "add">;
defm SUB : FPBinary<fsub, MRM4m, "sub">;
-defm SUBR: FPBinary<fsub ,MRM5m, "subr">;
+defm SUBR: FPBinary<fsub ,MRM5m, "subr", 0>;
}
let SchedRW = [WriteFMulLd] in {
defm MUL : FPBinary<fmul, MRM1m, "mul">;
}
let SchedRW = [WriteFDivLd] in {
defm DIV : FPBinary<fdiv, MRM6m, "div">;
-defm DIVR: FPBinary<fdiv, MRM7m, "divr">;
+defm DIVR: FPBinary<fdiv, MRM7m, "divr", 0>;
}
}
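
The Forward bit added above only flips the operand order in the selection
patterns; the semantics being modeled are the usual x87 reversed forms. An
illustrative, self-contained C++ sketch (not from this patch):

    // fsub  m32/m64 : ST(0) = ST(0) - mem    (Forward = 1, the default)
    // fsubr m32/m64 : ST(0) = mem   - ST(0)  (Forward = 0, reversed)
    double fsub_like(double st0, double mem)  { return st0 - mem; }
    double fsubr_like(double st0, double mem) { return mem - st0; }
    // Likewise for fdiv / fdivr:
    double fdiv_like(double st0, double mem)  { return st0 / mem; }
    double fdivr_like(double st0, double mem) { return mem / st0; }
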
@@ -306,13 +336,13 @@ def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">;
def FRSTORm : FPI<0xDD, MRM4m, (outs f32mem:$dst), (ins), "frstor\t$dst">;
def FSAVEm : FPI<0xDD, MRM6m, (outs f32mem:$dst), (ins), "fnsave\t$dst">;
-def FNSTSWm : FPI<0xDD, MRM7m, (outs f32mem:$dst), (ins), "fnstsw\t$dst">;
+def FNSTSWm : FPI<0xDD, MRM7m, (outs i16mem:$dst), (ins), "fnstsw\t$dst">;
def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">;
def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">;
-def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f32mem:$src), "fbld\t$src">;
-def FBSTPm : FPI<0xDF, MRM6m, (outs f32mem:$dst), (ins), "fbstp\t$dst">;
+def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f80mem:$src), "fbld\t$src">;
+def FBSTPm : FPI<0xDF, MRM6m, (outs f80mem:$dst), (ins), "fbstp\t$dst">;
// Floating point cmovs.
class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
@@ -633,16 +663,18 @@ def FRNDINT : I<0xD9, MRM_FC, (outs), (ins), "frndint", [], IIC_FRNDINT>;
def FSCALE : I<0xD9, MRM_FD, (outs), (ins), "fscale", [], IIC_FSCALE>;
def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>;
-def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst),
- "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB;
-def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst),
- "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)],
- IIC_FXSAVE>, TB, Requires<[In64BitMode]>;
-def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
- "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>, TB;
-def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
- "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)],
- IIC_FXRSTOR>, TB, Requires<[In64BitMode]>;
+let Predicates = [HasFXSR] in {
+ def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst),
+ "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB;
+ def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst),
+ "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)],
+ IIC_FXSAVE>, TB, Requires<[In64BitMode]>;
+ def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
+ "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>, TB;
+ def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
+ "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)],
+ IIC_FXRSTOR>, TB, Requires<[In64BitMode]>;
+} // Predicates = [HasFXSR]
} // SchedRW
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 1f61ffa84e9a..829cedd55fb3 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -38,6 +38,8 @@ def bc_mmx : PatFrag<(ops node:$in), (x86mmx (bitconvert node:$in))>;
def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
SDTCisFP<1>, SDTCisVT<3, i8>,
SDTCisVec<1>]>;
+def SDTX86CmpTestSae : SDTypeProfile<1, 3, [SDTCisVT<0, i32>,
+ SDTCisSameAs<1, 2>, SDTCisInt<3>]>;
def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>;
def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>;
@@ -58,13 +60,17 @@ def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]>;
def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>;
def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>;
+def X86frsqrt14s: SDNode<"X86ISD::FRSQRT", SDTFPBinOp>;
+def X86frcp14s : SDNode<"X86ISD::FRCP", SDTFPBinOp>;
def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>;
def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>;
def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>;
def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>;
def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>;
+def X86comiSae : SDNode<"X86ISD::COMI", SDTX86CmpTestSae>;
def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>;
+def X86ucomiSae: SDNode<"X86ISD::UCOMI", SDTX86CmpTestSae>;
def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>;
//def X86cmpsd : SDNode<"X86ISD::FSETCCsd", SDTX86Cmpsd>;
def X86cvtdq2pd: SDNode<"X86ISD::CVTDQ2PD",
@@ -74,11 +80,18 @@ def X86cvtudq2pd: SDNode<"X86ISD::CVTUDQ2PD",
SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>,
SDTCisVT<1, v4i32>]>>;
def X86pshufb : SDNode<"X86ISD::PSHUFB",
- SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i8>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>>;
def X86psadbw : SDNode<"X86ISD::PSADBW",
- SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>]>>;
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>,
+ SDTCVecEltisVT<1, i8>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>]>>;
+def X86dbpsadbw : SDNode<"X86ISD::DBPSADBW",
+ SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>,
+ SDTCVecEltisVT<1, i8>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>, SDTCisInt<3>]>>;
def X86andnp : SDNode<"X86ISD::ANDNP",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>>;
@@ -86,9 +99,11 @@ def X86psign : SDNode<"X86ISD::PSIGN",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>>;
def X86pextrb : SDNode<"X86ISD::PEXTRB",
- SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
+ SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v16i8>,
+ SDTCisPtrTy<2>]>>;
def X86pextrw : SDNode<"X86ISD::PEXTRW",
- SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
+ SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v8i16>,
+ SDTCisPtrTy<2>]>>;
def X86pinsrb : SDNode<"X86ISD::PINSRB",
SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
@@ -114,19 +129,17 @@ def X86vsext : SDNode<"X86ISD::VSEXT",
SDTCisInt<0>, SDTCisInt<1>,
SDTCisOpSmallerThanOp<1, 0>]>>;
-def X86vtrunc : SDNode<"X86ISD::VTRUNC",
- SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisInt<0>, SDTCisInt<1>,
- SDTCisOpSmallerThanOp<0, 1>]>>;
+def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisOpSmallerThanOp<0, 1>]>;
+
+def X86vtrunc : SDNode<"X86ISD::VTRUNC", SDTVtrunc>;
+def X86vtruncs : SDNode<"X86ISD::VTRUNCS", SDTVtrunc>;
+def X86vtruncus : SDNode<"X86ISD::VTRUNCUS", SDTVtrunc>;
+
def X86trunc : SDNode<"X86ISD::TRUNC",
SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>,
SDTCisOpSmallerThanOp<0, 1>]>>;
-
-def X86vtruncm : SDNode<"X86ISD::VTRUNCM",
- SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisInt<0>, SDTCisInt<1>,
- SDTCisVec<2>, SDTCisInt<2>,
- SDTCisOpSmallerThanOp<0, 2>]>>;
def X86vfpext : SDNode<"X86ISD::VFPEXT",
SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisFP<0>, SDTCisFP<1>,
@@ -136,6 +149,35 @@ def X86vfpround: SDNode<"X86ISD::VFPROUND",
SDTCisFP<0>, SDTCisFP<1>,
SDTCisOpSmallerThanOp<0, 1>]>>;
+def X86fround: SDNode<"X86ISD::VFPROUND",
+ SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>,
+ SDTCVecEltisVT<0, f32>,
+ SDTCVecEltisVT<1, f64>,
+ SDTCVecEltisVT<2, f64>,
+ SDTCisOpSmallerThanOp<0, 1>]>>;
+def X86froundRnd: SDNode<"X86ISD::VFPROUND",
+ SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>,
+ SDTCVecEltisVT<0, f32>,
+ SDTCVecEltisVT<1, f64>,
+ SDTCVecEltisVT<2, f64>,
+ SDTCisOpSmallerThanOp<0, 1>,
+ SDTCisInt<3>]>>;
+
+def X86fpext : SDNode<"X86ISD::VFPEXT",
+ SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>,
+ SDTCVecEltisVT<0, f64>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCVecEltisVT<2, f32>,
+ SDTCisOpSmallerThanOp<1, 0>]>>;
+
+def X86fpextRnd : SDNode<"X86ISD::VFPEXT",
+ SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>,
+ SDTCVecEltisVT<0, f64>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCVecEltisVT<2, f32>,
+ SDTCisOpSmallerThanOp<1, 0>,
+ SDTCisInt<3>]>>;
+
def X86vshldq : SDNode<"X86ISD::VSHLDQ", SDTIntShiftOp>;
def X86vshrdq : SDNode<"X86ISD::VSRLDQ", SDTIntShiftOp>;
def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>;
@@ -159,10 +201,15 @@ def X86CmpMaskCCRound :
def X86CmpMaskCCScalar :
SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
-def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>;
-def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>;
-def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>;
-def X86cmpms : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalar>;
+def X86CmpMaskCCScalarRound :
+ SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>,
+ SDTCisInt<4>]>;
+
+def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>;
+def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>;
+def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>;
+def X86cmpms : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalar>;
+def X86cmpmsRnd : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalarRound>;
def X86vshl : SDNode<"X86ISD::VSHL",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
@@ -178,6 +225,29 @@ def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>;
def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>;
def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>;
+def X86vprot : SDNode<"X86ISD::VPROT",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+def X86vproti : SDNode<"X86ISD::VPROTI",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i8>]>>;
+
+def X86vpshl : SDNode<"X86ISD::VPSHL",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+def X86vpsha : SDNode<"X86ISD::VPSHA",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+
+def X86vpcom : SDNode<"X86ISD::VPCOM",
+ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisVT<3, i8>]>>;
+def X86vpcomu : SDNode<"X86ISD::VPCOMU",
+ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisVT<3, i8>]>>;
+
def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
SDTCisVec<1>,
SDTCisSameAs<2, 1>]>;
@@ -190,6 +260,7 @@ def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp>;
def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>;
+def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>;
def X86testm : SDNode<"X86ISD::TESTM", SDTypeProfile<1, 2, [SDTCisVec<0>,
SDTCisVec<1>, SDTCisSameAs<2, 1>,
SDTCVecEltisVT<0, i1>,
@@ -201,11 +272,15 @@ def X86testnm : SDNode<"X86ISD::TESTNM", SDTypeProfile<1, 2, [SDTCisVec<0>,
def X86select : SDNode<"X86ISD::SELECT" , SDTSelect>;
def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
- SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisSameAs<1,2>]>>;
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>,
+ SDTCVecEltisVT<1, i32>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>]>>;
def X86pmuldq : SDNode<"X86ISD::PMULDQ",
- SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisSameAs<1,2>]>>;
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>,
+ SDTCVecEltisVT<1, i32>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>]>>;
def X86extrqi : SDNode<"X86ISD::EXTRQI",
SDTypeProfile<1, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>,
@@ -221,24 +296,30 @@ def X86insertqi : SDNode<"X86ISD::INSERTQI",
def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>;
-def SDTShuff3Op : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>;
def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisVec<2>]>;
+ SDTCisSameSizeAs<0,2>,
+ SDTCisSameNumEltsAs<0,2>]>;
def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
- SDTCisSameAs<0,1>, SDTCisInt<2>]>;
+ SDTCisSameAs<0,1>, SDTCisVT<2, i8>]>;
def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>, SDTCisInt<3>]>;
+ SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>;
def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>, SDTCisInt<3>, SDTCisInt<4>]>;
+def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisInt<2>, SDTCisInt<3>]>;
def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
-def SDTVBroadcastm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>]>;
+def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>,
+ SDTCisInt<0>, SDTCisInt<1>]>;
def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>;
+def SDTTernlog : SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>, SDTCisSameAs<0,3>,
+ SDTCisVT<4, i8>]>;
+
def SDTFPBinOpRound : SDTypeProfile<1, 3, [ // fadd_round, fmul_round, etc.
SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>, SDTCisInt<3>]>;
@@ -250,15 +331,17 @@ def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>,
SDTCisSameAs<1,2>, SDTCisSameAs<1,3>, SDTCisInt<4>]>;
def STDFp1SrcRm : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>,
- SDTCisVec<0>, SDTCisInt<2>]>;
+ SDTCisVec<0>, SDTCisVT<2, i32>]>;
def STDFp2SrcRm : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
- SDTCisVec<0>, SDTCisInt<3>]>;
+ SDTCisVec<0>, SDTCisVT<3, i32>]>;
def STDFp3SrcRm : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>,
- SDTCisVec<0>, SDTCisInt<3>, SDTCisInt<4>]>;
+ SDTCisVec<0>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>;
def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>;
-def X86Abs : SDNode<"X86ISD::ABS", SDTIntUnaryOp>;
+
+def X86Abs : SDNode<"X86ISD::ABS", SDTIntUnaryOp>;
+def X86Conflict : SDNode<"X86ISD::CONFLICT", SDTIntUnaryOp>;
def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>;
def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>;
@@ -281,33 +364,74 @@ def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>;
def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>;
def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>;
-def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<2, 1>]>;
+def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>]>;
def X86Packss : SDNode<"X86ISD::PACKSS", SDTPack>;
def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>;
def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>;
def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
+def X86vpmaddubsw : SDNode<"X86ISD::VPMADDUBSW" , SDTPack>;
+def X86vpmaddwd : SDNode<"X86ISD::VPMADDWD" , SDTPack>;
+
def X86VPermilpv : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>;
def X86VPermilpi : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>;
-def X86VPermv : SDNode<"X86ISD::VPERMV", SDTShuff2Op>;
+def X86VPermv : SDNode<"X86ISD::VPERMV",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<1>,
+ SDTCisSameNumEltsAs<0,1>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>;
-def X86VPermv3 : SDNode<"X86ISD::VPERMV3", SDTShuff3Op>;
-def X86VPermiv3 : SDNode<"X86ISD::VPERMIV3", SDTShuff3Op>;
+def X86VPermt2 : SDNode<"X86ISD::VPERMV3",
+ SDTypeProfile<1, 3, [SDTCisVec<0>,
+ SDTCisSameAs<0,1>, SDTCisInt<2>,
+ SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2>,
+ SDTCisSameSizeAs<0,2>,
+ SDTCisSameAs<0,3>]>, []>;
+
+def X86VPermi2X : SDNode<"X86ISD::VPERMIV3",
+ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<1>,
+ SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisSameAs<0,3>]>, []>;
+
+def X86vpternlog : SDNode<"X86ISD::VPTERNLOG", SDTTernlog>;
def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
-def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPBinOpImmRound>;
-def X86VRange : SDNode<"X86ISD::VRANGE", SDTFPBinOpImmRound>;
+def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPBinOpImmRound>;
+def X86VRange : SDNode<"X86ISD::VRANGE", SDTFPBinOpImmRound>;
+def X86VReduce : SDNode<"X86ISD::VREDUCE", SDTFPUnaryOpImmRound>;
+def X86VRndScale : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImmRound>;
+def X86VGetMant : SDNode<"X86ISD::VGETMANT", SDTFPUnaryOpImmRound>;
+def X86Vfpclass : SDNode<"X86ISD::VFPCLASS",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
+ SDTCisVec<1>, SDTCisFP<1>,
+ SDTCisSameNumEltsAs<0,1>,
+ SDTCisVT<2, i32>]>, []>;
+def X86Vfpclasss : SDNode<"X86ISD::VFPCLASSS",
+ SDTypeProfile<1, 2, [SDTCisVT<0, i1>,
+ SDTCisFP<1>, SDTCisVT<2, i32>]>,[]>;
def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST",
SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisSubVecOfVec<1, 0>]>, []>;
+// The SDTCisSubVecOfVec restriction cannot be applied to the 128-bit version of VBROADCASTI32x2.
+def X86SubV32x2Broadcast : SDNode<"X86ISD::SUBV_BROADCAST",
+ SDTypeProfile<1, 1, [SDTCisVec<0>,
+ SDTCisSameAs<0,1>]>, []>;
+
def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
+def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
def X86Vinsert : SDNode<"X86ISD::VINSERT", SDTypeProfile<1, 3,
- [SDTCisSameAs<0, 1>, SDTCisPtrTy<3>]>, []>;
+ [SDTCisSameAs<0, 1>, SDTCisEltOfVec<2, 1>,
+ SDTCisPtrTy<3>]>, []>;
def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2,
- [SDTCisVec<1>, SDTCisPtrTy<2>]>, []>;
+ [SDTCisEltOfVec<0, 1>, SDTCisVec<1>,
+ SDTCisPtrTy<2>]>, []>;
def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>;
@@ -317,11 +441,13 @@ def X86faddRnd : SDNode<"X86ISD::FADD_RND", SDTFPBinOpRound>;
def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>;
def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>;
def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>;
-def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>;
-def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>;
-def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>;
-def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>;
-def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>;
+def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>;
+def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>;
+def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>;
+def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>;
+def X86fsqrtRnds : SDNode<"X86ISD::FSQRT_RND", STDFp2SrcRm>;
+def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>;
+def X86fgetexpRnds : SDNode<"X86ISD::FGETEXP_RND", STDFp2SrcRm>;
def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>;
def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>;
@@ -341,9 +467,11 @@ def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", STDFp1SrcRm>;
def X86rcp28 : SDNode<"X86ISD::RCP28", STDFp1SrcRm>;
def X86exp2 : SDNode<"X86ISD::EXP2", STDFp1SrcRm>;
-def X86rsqrt28s : SDNode<"X86ISD::RSQRT28", STDFp2SrcRm>;
-def X86rcp28s : SDNode<"X86ISD::RCP28", STDFp2SrcRm>;
-def X86RndScale : SDNode<"X86ISD::RNDSCALE", STDFp3SrcRm>;
+def X86rsqrt28s : SDNode<"X86ISD::RSQRT28", STDFp2SrcRm>;
+def X86rcp28s : SDNode<"X86ISD::RCP28", STDFp2SrcRm>;
+def X86RndScales : SDNode<"X86ISD::VRNDSCALE", STDFp3SrcRm>;
+def X86Reduces : SDNode<"X86ISD::VREDUCE", STDFp3SrcRm>;
+def X86GetMants : SDNode<"X86ISD::VGETMANT", STDFp3SrcRm>;
def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,
@@ -362,7 +490,8 @@ def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1,
[SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>;
def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>,
- SDTCisSameAs<0,1>, SDTCisInt<2>, SDTCisInt<3>]>;
+ SDTCisSameAs<0,1>, SDTCisInt<2>,
+ SDTCisVT<3, i32>]>;
def SDTDoubleToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisInt<0>, SDTCVecEltisVT<1, f64>]>;
@@ -371,9 +500,12 @@ def SDTFloatToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
def SDTDoubleToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisInt<0>, SDTCVecEltisVT<1, f64>]>;
+def SDTSDoubleToIntRnd: SDTypeProfile<1, 2, [SDTCisInt<0>,SDTCisFP<1>,
+ SDTCVecEltisVT<1, f64>, SDTCisInt<2>]>;
def SDTFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisInt<0>, SDTCVecEltisVT<1, f32>]>;
-
+def SDTSFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisFP<1>,
+ SDTCVecEltisVT<1, f32>, SDTCisInt<2>]>;
def SDTVintToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisFP<0>, SDTCVecEltisVT<1, i32>,
SDTCisInt<2>]>;
@@ -392,6 +524,10 @@ def SDTVFPToLongRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
def X86SintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTintToFPRound>;
def X86UintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTintToFPRound>;
+def X86cvttss2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTSFloatToIntRnd>;
+def X86cvttss2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTSFloatToIntRnd>;
+def X86cvttsd2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTSDoubleToIntRnd>;
+def X86cvttsd2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTSDoubleToIntRnd>;
// Vector with rounding mode
// cvtt fp-to-int stuff
@@ -417,17 +553,35 @@ def X86cvtps2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTFloatToInt>;
def X86cvtpd2Int : SDNode<"X86ISD::FP_TO_SINT_RND", SDTDoubleToInt>;
def X86cvtpd2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTDoubleToInt>;
+def X86cvtph2ps : SDNode<"ISD::FP16_TO_FP",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCVecEltisVT<0, f32>,
+ SDTCVecEltisVT<1, i16>,
+ SDTCisFP<0>,
+ SDTCisVT<2, i32>]> >;
+
+def X86cvtps2ph : SDNode<"ISD::FP_TO_FP16",
+ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCVecEltisVT<0, i16>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCisFP<1>, SDTCisVT<2, i32>,
+ SDTCisVT<3, i32>]> >;
def X86vfpextRnd : SDNode<"X86ISD::VFPEXT",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisFP<0>, SDTCisFP<1>,
+ SDTCVecEltisVT<0, f64>,
+ SDTCVecEltisVT<1, f32>,
SDTCisOpSmallerThanOp<1, 0>,
- SDTCisInt<2>]>>;
+ SDTCisVT<2, i32>]>>;
def X86vfproundRnd: SDNode<"X86ISD::VFPROUND",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisFP<0>, SDTCisFP<1>,
SDTCVecEltisVT<0, f32>,
SDTCVecEltisVT<1, f64>,
- SDTCisInt<2>]>>;
+ SDTCisOpSmallerThanOp<0, 1>,
+ SDTCisVT<2, i32>]>>;
+
+def X86cvt2mask : SDNode<"X86ISD::CVT2MASK", SDTIntTruncOp>;
//===----------------------------------------------------------------------===//
// SSE Complex Patterns
@@ -436,10 +590,10 @@ def X86vfproundRnd: SDNode<"X86ISD::VFPROUND",
// These are 'extloads' from a scalar to the low element of a vector, zeroing
// the top elements. These are used for the SSE 'ss' and 'sd' instruction
// forms.
-def sse_load_f32 : ComplexPattern<v4f32, 5, "SelectScalarSSELoad", [],
+def sse_load_f32 : ComplexPattern<v4f32, 5, "selectScalarSSELoad", [],
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
SDNPWantRoot]>;
-def sse_load_f64 : ComplexPattern<v2f64, 5, "SelectScalarSSELoad", [],
+def sse_load_f64 : ComplexPattern<v2f64, 5, "selectScalarSSELoad", [],
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
SDNPWantRoot]>;
@@ -490,9 +644,9 @@ def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>;
// The memory operand is required to be a 128-bit load, so it must be converted
// from a vector to a scalar.
def loadf32_128 : PatFrag<(ops node:$ptr),
- (f32 (vector_extract (loadv4f32 node:$ptr), (iPTR 0)))>;
+ (f32 (extractelt (loadv4f32 node:$ptr), (iPTR 0)))>;
def loadf64_128 : PatFrag<(ops node:$ptr),
- (f64 (vector_extract (loadv2f64 node:$ptr), (iPTR 0)))>;
+ (f64 (extractelt (loadv2f64 node:$ptr), (iPTR 0)))>;
// Like 'store', but always requires 128-bit vector alignment.
def alignedstore : PatFrag<(ops node:$val, node:$ptr),
@@ -590,9 +744,9 @@ def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
// The memory operand is required to be a 128-bit load, so it must be converted
// from a vector to a scalar.
def memopfsf32_128 : PatFrag<(ops node:$ptr),
- (f32 (vector_extract (memopv4f32 node:$ptr), (iPTR 0)))>;
+ (f32 (extractelt (memopv4f32 node:$ptr), (iPTR 0)))>;
def memopfsf64_128 : PatFrag<(ops node:$ptr),
- (f64 (vector_extract (memopv2f64 node:$ptr), (iPTR 0)))>;
+ (f64 (extractelt (memopv2f64 node:$ptr), (iPTR 0)))>;
// SSSE3 uses MMX registers for some instructions. They aren't aligned on a
@@ -604,32 +758,6 @@ def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
def memopmmx : PatFrag<(ops node:$ptr), (x86mmx (memop64 node:$ptr))>;
-// MOVNT Support
-// Like 'store', but requires the non-temporal bit to be set
-def nontemporalstore : PatFrag<(ops node:$val, node:$ptr),
- (st node:$val, node:$ptr), [{
- if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
- return ST->isNonTemporal();
- return false;
-}]>;
-
-def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
- (st node:$val, node:$ptr), [{
- if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
- return ST->isNonTemporal() && !ST->isTruncatingStore() &&
- ST->getAddressingMode() == ISD::UNINDEXED &&
- ST->getAlignment() >= 16;
- return false;
-}]>;
-
-def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
- (st node:$val, node:$ptr), [{
- if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
- return ST->isNonTemporal() &&
- ST->getAlignment() < 16;
- return false;
-}]>;
-
def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(masked_gather node:$src1, node:$src2, node:$src3) , [{
if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
@@ -851,29 +979,59 @@ def masked_load_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
return isa<MaskedLoadSDNode>(N);
}]>;
+// Masked store fragments.
+// X86mstore can't be implemented in the core DAG files because some targets
+// don't support the vector type (llvm-tblgen will fail).
+def X86mstore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_store node:$src1, node:$src2, node:$src3), [{
+ return !cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+}]>;
+
def masked_store_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_store node:$src1, node:$src2, node:$src3), [{
+ (X86mstore node:$src1, node:$src2, node:$src3), [{
if (auto *Store = dyn_cast<MaskedStoreSDNode>(N))
return Store->getAlignment() >= 16;
return false;
}]>;
def masked_store_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_store node:$src1, node:$src2, node:$src3), [{
+ (X86mstore node:$src1, node:$src2, node:$src3), [{
if (auto *Store = dyn_cast<MaskedStoreSDNode>(N))
return Store->getAlignment() >= 32;
return false;
}]>;
def masked_store_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_store node:$src1, node:$src2, node:$src3), [{
+ (X86mstore node:$src1, node:$src2, node:$src3), [{
if (auto *Store = dyn_cast<MaskedStoreSDNode>(N))
return Store->getAlignment() >= 64;
return false;
}]>;
def masked_store_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_store node:$src1, node:$src2, node:$src3), [{
+ (X86mstore node:$src1, node:$src2, node:$src3), [{
return isa<MaskedStoreSDNode>(N);
}]>;
+// Masked truncstore fragments.
+// X86mtruncstore can't be implemented in the core DAG files because some targets
+// don't support the vector type (llvm-tblgen will fail).
+def X86mtruncstore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_store node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+}]>;
+def masked_truncstorevi8 :
+ PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86mtruncstore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+def masked_truncstorevi16 :
+ PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86mtruncstore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+def masked_truncstorevi32 :
+ PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86mtruncstore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index cf68ef053361..63e78de69bc9 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/IR/DerivedTypes.h"
@@ -101,9 +102,11 @@ struct X86MemoryFoldTableEntry {
void X86InstrInfo::anchor() {}
X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
- : X86GenInstrInfo(
- (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32),
- (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)),
+ : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
+ : X86::ADJCALLSTACKDOWN32),
+ (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
+ : X86::ADJCALLSTACKUP32),
+ X86::CATCHRET),
Subtarget(STI), RI(STI.getTargetTriple()) {
static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = {
@@ -332,6 +335,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD },
{ X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE },
{ X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE },
+ { X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD },
+ { X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD },
+ { X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD },
{ X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE },
{ X86::SETAr, X86::SETAm, TB_FOLDED_STORE },
{ X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE },
@@ -495,7 +501,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::MOVSX64rr8, X86::MOVSX64rm8, 0 },
{ X86::MOVUPDrr, X86::MOVUPDrm, TB_ALIGN_16 },
{ X86::MOVUPSrr, X86::MOVUPSrm, 0 },
- { X86::MOVZQI2PQIrr, X86::MOVZQI2PQIrm, 0 },
{ X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, TB_ALIGN_16 },
{ X86::MOVZX16rr8, X86::MOVZX16rm8, 0 },
{ X86::MOVZX32rr16, X86::MOVZX32rm16, 0 },
@@ -605,7 +610,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 },
{ X86::VMOVUPDrr, X86::VMOVUPDrm, 0 },
{ X86::VMOVUPSrr, X86::VMOVUPSrm, 0 },
- { X86::VMOVZQI2PQIrr, X86::VMOVZQI2PQIrm, 0 },
{ X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 },
{ X86::VPABSBrr128, X86::VPABSBrm128, 0 },
{ X86::VPABSDrr128, X86::VPABSDrm128, 0 },
@@ -1647,6 +1651,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::PEXT32rr, X86::PEXT32rm, 0 },
{ X86::PEXT64rr, X86::PEXT64rm, 0 },
+ // ADX foldable instructions
+ { X86::ADCX32rr, X86::ADCX32rm, 0 },
+ { X86::ADCX64rr, X86::ADCX64rm, 0 },
+ { X86::ADOX32rr, X86::ADOX32rm, 0 },
+ { X86::ADOX64rr, X86::ADOX64rm, 0 },
+
// AVX-512 foldable instructions
{ X86::VADDPSZrr, X86::VADDPSZrm, 0 },
{ X86::VADDPDZrr, X86::VADDPDZrm, 0 },
@@ -1729,11 +1739,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
// FMA foldable instructions
{ X86::VFMADDSSr231r, X86::VFMADDSSr231m, TB_ALIGN_NONE },
+ { X86::VFMADDSSr231r_Int, X86::VFMADDSSr231m_Int, TB_ALIGN_NONE },
{ X86::VFMADDSDr231r, X86::VFMADDSDr231m, TB_ALIGN_NONE },
+ { X86::VFMADDSDr231r_Int, X86::VFMADDSDr231m_Int, TB_ALIGN_NONE },
{ X86::VFMADDSSr132r, X86::VFMADDSSr132m, TB_ALIGN_NONE },
+ { X86::VFMADDSSr132r_Int, X86::VFMADDSSr132m_Int, TB_ALIGN_NONE },
{ X86::VFMADDSDr132r, X86::VFMADDSDr132m, TB_ALIGN_NONE },
+ { X86::VFMADDSDr132r_Int, X86::VFMADDSDr132m_Int, TB_ALIGN_NONE },
{ X86::VFMADDSSr213r, X86::VFMADDSSr213m, TB_ALIGN_NONE },
+ { X86::VFMADDSSr213r_Int, X86::VFMADDSSr213m_Int, TB_ALIGN_NONE },
{ X86::VFMADDSDr213r, X86::VFMADDSDr213m, TB_ALIGN_NONE },
+ { X86::VFMADDSDr213r_Int, X86::VFMADDSDr213m_Int, TB_ALIGN_NONE },
{ X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_NONE },
{ X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_NONE },
@@ -1749,11 +1765,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_NONE },
{ X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, TB_ALIGN_NONE },
+ { X86::VFNMADDSSr231r_Int, X86::VFNMADDSSr231m_Int, TB_ALIGN_NONE },
{ X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, TB_ALIGN_NONE },
+ { X86::VFNMADDSDr231r_Int, X86::VFNMADDSDr231m_Int, TB_ALIGN_NONE },
{ X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, TB_ALIGN_NONE },
+ { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr132m_Int, TB_ALIGN_NONE },
{ X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, TB_ALIGN_NONE },
+ { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr132m_Int, TB_ALIGN_NONE },
{ X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, TB_ALIGN_NONE },
+ { X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr213m_Int, TB_ALIGN_NONE },
{ X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, TB_ALIGN_NONE },
+ { X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr213m_Int, TB_ALIGN_NONE },
{ X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_NONE },
{ X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_NONE },
@@ -1769,11 +1791,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_NONE },
{ X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, TB_ALIGN_NONE },
+ { X86::VFMSUBSSr231r_Int, X86::VFMSUBSSr231m_Int, TB_ALIGN_NONE },
{ X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, TB_ALIGN_NONE },
+ { X86::VFMSUBSDr231r_Int, X86::VFMSUBSDr231m_Int, TB_ALIGN_NONE },
{ X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, TB_ALIGN_NONE },
+ { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr132m_Int, TB_ALIGN_NONE },
{ X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, TB_ALIGN_NONE },
+ { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr132m_Int, TB_ALIGN_NONE },
{ X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, TB_ALIGN_NONE },
+ { X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr213m_Int, TB_ALIGN_NONE },
{ X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, TB_ALIGN_NONE },
+ { X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr213m_Int, TB_ALIGN_NONE },
{ X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_NONE },
{ X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_NONE },
@@ -1789,11 +1817,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_NONE },
{ X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSSr231r_Int, X86::VFNMSUBSSr231m_Int, TB_ALIGN_NONE },
{ X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSDr231r_Int, X86::VFNMSUBSDr231m_Int, TB_ALIGN_NONE },
{ X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr132m_Int, TB_ALIGN_NONE },
{ X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr132m_Int, TB_ALIGN_NONE },
{ X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr213m_Int, TB_ALIGN_NONE },
{ X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr213m_Int, TB_ALIGN_NONE },
{ X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_NONE },
{ X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_NONE },
@@ -2282,7 +2316,35 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
case X86::FsVMOVAPSrm:
case X86::FsVMOVAPDrm:
case X86::FsMOVAPSrm:
- case X86::FsMOVAPDrm: {
+ case X86::FsMOVAPDrm:
+ // AVX-512
+ case X86::VMOVAPDZ128rm:
+ case X86::VMOVAPDZ256rm:
+ case X86::VMOVAPDZrm:
+ case X86::VMOVAPSZ128rm:
+ case X86::VMOVAPSZ256rm:
+ case X86::VMOVAPSZrm:
+ case X86::VMOVDQA32Z128rm:
+ case X86::VMOVDQA32Z256rm:
+ case X86::VMOVDQA32Zrm:
+ case X86::VMOVDQA64Z128rm:
+ case X86::VMOVDQA64Z256rm:
+ case X86::VMOVDQA64Zrm:
+ case X86::VMOVDQU16Z128rm:
+ case X86::VMOVDQU16Z256rm:
+ case X86::VMOVDQU16Zrm:
+ case X86::VMOVDQU32Z128rm:
+ case X86::VMOVDQU32Z256rm:
+ case X86::VMOVDQU32Zrm:
+ case X86::VMOVDQU64Z128rm:
+ case X86::VMOVDQU64Z256rm:
+ case X86::VMOVDQU64Zrm:
+ case X86::VMOVDQU8Z128rm:
+ case X86::VMOVDQU8Z256rm:
+ case X86::VMOVDQU8Zrm:
+ case X86::VMOVUPSZ128rm:
+ case X86::VMOVUPSZ256rm:
+ case X86::VMOVUPSZrm: {
// Loads from constant pools are trivially rematerializable.
if (MI->getOperand(1+X86::AddrBaseReg).isReg() &&
MI->getOperand(1+X86::AddrScaleAmt).isImm() &&
@@ -2363,9 +2425,8 @@ bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
// It is safe to clobber EFLAGS at the end of a block of no successor has it
// live in.
if (Iter == E) {
- for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
- SE = MBB.succ_end(); SI != SE; ++SI)
- if ((*SI)->isLiveIn(X86::EFLAGS))
+ for (MachineBasicBlock *S : MBB.successors())
+ if (S->isLiveIn(X86::EFLAGS))
return false;
return true;
}
@@ -2411,13 +2472,29 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
unsigned DestReg, unsigned SubIdx,
const MachineInstr *Orig,
const TargetRegisterInfo &TRI) const {
- // MOV32r0 is implemented with a xor which clobbers condition code.
- // Re-materialize it as movri instructions to avoid side effects.
- unsigned Opc = Orig->getOpcode();
- if (Opc == X86::MOV32r0 && !isSafeToClobberEFLAGS(MBB, I)) {
+ bool ClobbersEFLAGS = false;
+ for (const MachineOperand &MO : Orig->operands()) {
+ if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) {
+ ClobbersEFLAGS = true;
+ break;
+ }
+ }
+
+ if (ClobbersEFLAGS && !isSafeToClobberEFLAGS(MBB, I)) {
+ // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
+ // effects.
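+ // For instance (an illustrative sketch), MOV32r_1, which is otherwise
+ // expanded post-RA into an EFLAGS-clobbering XOR32rr + DEC32r pair, is
+ // re-materialized here as 'MOV32ri %reg, -1'.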
+ int Value;
+ switch (Orig->getOpcode()) {
+ case X86::MOV32r0: Value = 0; break;
+ case X86::MOV32r1: Value = 1; break;
+ case X86::MOV32r_1: Value = -1; break;
+ default:
+ llvm_unreachable("Unexpected instruction!");
+ }
+
DebugLoc DL = Orig->getDebugLoc();
BuildMI(MBB, I, DL, get(X86::MOV32ri)).addOperand(Orig->getOperand(0))
- .addImm(0);
+ .addImm(Value);
} else {
MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
MBB.insert(I, MI);
@@ -2428,7 +2505,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
}
/// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
-static bool hasLiveCondCodeDef(MachineInstr *MI) {
+bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr *MI) const {
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI->getOperand(i);
if (MO.isReg() && MO.isDef() &&
@@ -2453,7 +2530,7 @@ inline static unsigned getTruncatedShiftCount(MachineInstr *MI,
inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
// Left shift instructions can be transformed into load-effective-address
// instructions if we can encode them appropriately.
- // A LEA instruction utilizes a SIB byte to encode it's scale factor.
+ // A LEA instruction utilizes a SIB byte to encode its scale factor.
// The SIB.scale field is two bits wide which means that we can encode any
// shift amount less than 4.
return ShAmt < 4 && ShAmt > 0;
@@ -2493,7 +2570,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src,
ImplicitOp = Src;
ImplicitOp.setImplicit();
- NewSrc = getX86SubSuperRegister(Src.getReg(), MVT::i64);
+ NewSrc = getX86SubSuperRegister(Src.getReg(), 64);
MachineBasicBlock::LivenessQueryResult LQR =
MI->getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI);
@@ -2914,10 +2991,162 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
return NewMI;
}
-/// We have a few instructions that must be hacked on to commute them.
-///
-MachineInstr *
-X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
+/// Returns true if the given instruction opcode is FMA3.
+/// Otherwise, returns false.
+/// The second parameter is optional and acts as a second return value from
+/// the function. It is set to true if the given instruction has an FMA3 opcode
+/// that is used for lowering scalar FMA intrinsics, and it is set to false
+/// otherwise.
+static bool isFMA3(unsigned Opcode, bool *IsIntrinsic = nullptr) {
+ if (IsIntrinsic)
+ *IsIntrinsic = false;
+
+ switch (Opcode) {
+ case X86::VFMADDSDr132r: case X86::VFMADDSDr132m:
+ case X86::VFMADDSSr132r: case X86::VFMADDSSr132m:
+ case X86::VFMSUBSDr132r: case X86::VFMSUBSDr132m:
+ case X86::VFMSUBSSr132r: case X86::VFMSUBSSr132m:
+ case X86::VFNMADDSDr132r: case X86::VFNMADDSDr132m:
+ case X86::VFNMADDSSr132r: case X86::VFNMADDSSr132m:
+ case X86::VFNMSUBSDr132r: case X86::VFNMSUBSDr132m:
+ case X86::VFNMSUBSSr132r: case X86::VFNMSUBSSr132m:
+
+ case X86::VFMADDSDr213r: case X86::VFMADDSDr213m:
+ case X86::VFMADDSSr213r: case X86::VFMADDSSr213m:
+ case X86::VFMSUBSDr213r: case X86::VFMSUBSDr213m:
+ case X86::VFMSUBSSr213r: case X86::VFMSUBSSr213m:
+ case X86::VFNMADDSDr213r: case X86::VFNMADDSDr213m:
+ case X86::VFNMADDSSr213r: case X86::VFNMADDSSr213m:
+ case X86::VFNMSUBSDr213r: case X86::VFNMSUBSDr213m:
+ case X86::VFNMSUBSSr213r: case X86::VFNMSUBSSr213m:
+
+ case X86::VFMADDSDr231r: case X86::VFMADDSDr231m:
+ case X86::VFMADDSSr231r: case X86::VFMADDSSr231m:
+ case X86::VFMSUBSDr231r: case X86::VFMSUBSDr231m:
+ case X86::VFMSUBSSr231r: case X86::VFMSUBSSr231m:
+ case X86::VFNMADDSDr231r: case X86::VFNMADDSDr231m:
+ case X86::VFNMADDSSr231r: case X86::VFNMADDSSr231m:
+ case X86::VFNMSUBSDr231r: case X86::VFNMSUBSDr231m:
+ case X86::VFNMSUBSSr231r: case X86::VFNMSUBSSr231m:
+
+ case X86::VFMADDSUBPDr132r: case X86::VFMADDSUBPDr132m:
+ case X86::VFMADDSUBPSr132r: case X86::VFMADDSUBPSr132m:
+ case X86::VFMSUBADDPDr132r: case X86::VFMSUBADDPDr132m:
+ case X86::VFMSUBADDPSr132r: case X86::VFMSUBADDPSr132m:
+ case X86::VFMADDSUBPDr132rY: case X86::VFMADDSUBPDr132mY:
+ case X86::VFMADDSUBPSr132rY: case X86::VFMADDSUBPSr132mY:
+ case X86::VFMSUBADDPDr132rY: case X86::VFMSUBADDPDr132mY:
+ case X86::VFMSUBADDPSr132rY: case X86::VFMSUBADDPSr132mY:
+
+ case X86::VFMADDPDr132r: case X86::VFMADDPDr132m:
+ case X86::VFMADDPSr132r: case X86::VFMADDPSr132m:
+ case X86::VFMSUBPDr132r: case X86::VFMSUBPDr132m:
+ case X86::VFMSUBPSr132r: case X86::VFMSUBPSr132m:
+ case X86::VFNMADDPDr132r: case X86::VFNMADDPDr132m:
+ case X86::VFNMADDPSr132r: case X86::VFNMADDPSr132m:
+ case X86::VFNMSUBPDr132r: case X86::VFNMSUBPDr132m:
+ case X86::VFNMSUBPSr132r: case X86::VFNMSUBPSr132m:
+ case X86::VFMADDPDr132rY: case X86::VFMADDPDr132mY:
+ case X86::VFMADDPSr132rY: case X86::VFMADDPSr132mY:
+ case X86::VFMSUBPDr132rY: case X86::VFMSUBPDr132mY:
+ case X86::VFMSUBPSr132rY: case X86::VFMSUBPSr132mY:
+ case X86::VFNMADDPDr132rY: case X86::VFNMADDPDr132mY:
+ case X86::VFNMADDPSr132rY: case X86::VFNMADDPSr132mY:
+ case X86::VFNMSUBPDr132rY: case X86::VFNMSUBPDr132mY:
+ case X86::VFNMSUBPSr132rY: case X86::VFNMSUBPSr132mY:
+
+ case X86::VFMADDSUBPDr213r: case X86::VFMADDSUBPDr213m:
+ case X86::VFMADDSUBPSr213r: case X86::VFMADDSUBPSr213m:
+ case X86::VFMSUBADDPDr213r: case X86::VFMSUBADDPDr213m:
+ case X86::VFMSUBADDPSr213r: case X86::VFMSUBADDPSr213m:
+ case X86::VFMADDSUBPDr213rY: case X86::VFMADDSUBPDr213mY:
+ case X86::VFMADDSUBPSr213rY: case X86::VFMADDSUBPSr213mY:
+ case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPDr213mY:
+ case X86::VFMSUBADDPSr213rY: case X86::VFMSUBADDPSr213mY:
+
+ case X86::VFMADDPDr213r: case X86::VFMADDPDr213m:
+ case X86::VFMADDPSr213r: case X86::VFMADDPSr213m:
+ case X86::VFMSUBPDr213r: case X86::VFMSUBPDr213m:
+ case X86::VFMSUBPSr213r: case X86::VFMSUBPSr213m:
+ case X86::VFNMADDPDr213r: case X86::VFNMADDPDr213m:
+ case X86::VFNMADDPSr213r: case X86::VFNMADDPSr213m:
+ case X86::VFNMSUBPDr213r: case X86::VFNMSUBPDr213m:
+ case X86::VFNMSUBPSr213r: case X86::VFNMSUBPSr213m:
+ case X86::VFMADDPDr213rY: case X86::VFMADDPDr213mY:
+ case X86::VFMADDPSr213rY: case X86::VFMADDPSr213mY:
+ case X86::VFMSUBPDr213rY: case X86::VFMSUBPDr213mY:
+ case X86::VFMSUBPSr213rY: case X86::VFMSUBPSr213mY:
+ case X86::VFNMADDPDr213rY: case X86::VFNMADDPDr213mY:
+ case X86::VFNMADDPSr213rY: case X86::VFNMADDPSr213mY:
+ case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPDr213mY:
+ case X86::VFNMSUBPSr213rY: case X86::VFNMSUBPSr213mY:
+
+ case X86::VFMADDSUBPDr231r: case X86::VFMADDSUBPDr231m:
+ case X86::VFMADDSUBPSr231r: case X86::VFMADDSUBPSr231m:
+ case X86::VFMSUBADDPDr231r: case X86::VFMSUBADDPDr231m:
+ case X86::VFMSUBADDPSr231r: case X86::VFMSUBADDPSr231m:
+ case X86::VFMADDSUBPDr231rY: case X86::VFMADDSUBPDr231mY:
+ case X86::VFMADDSUBPSr231rY: case X86::VFMADDSUBPSr231mY:
+ case X86::VFMSUBADDPDr231rY: case X86::VFMSUBADDPDr231mY:
+ case X86::VFMSUBADDPSr231rY: case X86::VFMSUBADDPSr231mY:
+
+ case X86::VFMADDPDr231r: case X86::VFMADDPDr231m:
+ case X86::VFMADDPSr231r: case X86::VFMADDPSr231m:
+ case X86::VFMSUBPDr231r: case X86::VFMSUBPDr231m:
+ case X86::VFMSUBPSr231r: case X86::VFMSUBPSr231m:
+ case X86::VFNMADDPDr231r: case X86::VFNMADDPDr231m:
+ case X86::VFNMADDPSr231r: case X86::VFNMADDPSr231m:
+ case X86::VFNMSUBPDr231r: case X86::VFNMSUBPDr231m:
+ case X86::VFNMSUBPSr231r: case X86::VFNMSUBPSr231m:
+ case X86::VFMADDPDr231rY: case X86::VFMADDPDr231mY:
+ case X86::VFMADDPSr231rY: case X86::VFMADDPSr231mY:
+ case X86::VFMSUBPDr231rY: case X86::VFMSUBPDr231mY:
+ case X86::VFMSUBPSr231rY: case X86::VFMSUBPSr231mY:
+ case X86::VFNMADDPDr231rY: case X86::VFNMADDPDr231mY:
+ case X86::VFNMADDPSr231rY: case X86::VFNMADDPSr231mY:
+ case X86::VFNMSUBPDr231rY: case X86::VFNMSUBPDr231mY:
+ case X86::VFNMSUBPSr231rY: case X86::VFNMSUBPSr231mY:
+ return true;
+
+ case X86::VFMADDSDr132r_Int: case X86::VFMADDSDr132m_Int:
+ case X86::VFMADDSSr132r_Int: case X86::VFMADDSSr132m_Int:
+ case X86::VFMSUBSDr132r_Int: case X86::VFMSUBSDr132m_Int:
+ case X86::VFMSUBSSr132r_Int: case X86::VFMSUBSSr132m_Int:
+ case X86::VFNMADDSDr132r_Int: case X86::VFNMADDSDr132m_Int:
+ case X86::VFNMADDSSr132r_Int: case X86::VFNMADDSSr132m_Int:
+ case X86::VFNMSUBSDr132r_Int: case X86::VFNMSUBSDr132m_Int:
+ case X86::VFNMSUBSSr132r_Int: case X86::VFNMSUBSSr132m_Int:
+
+ case X86::VFMADDSDr213r_Int: case X86::VFMADDSDr213m_Int:
+ case X86::VFMADDSSr213r_Int: case X86::VFMADDSSr213m_Int:
+ case X86::VFMSUBSDr213r_Int: case X86::VFMSUBSDr213m_Int:
+ case X86::VFMSUBSSr213r_Int: case X86::VFMSUBSSr213m_Int:
+ case X86::VFNMADDSDr213r_Int: case X86::VFNMADDSDr213m_Int:
+ case X86::VFNMADDSSr213r_Int: case X86::VFNMADDSSr213m_Int:
+ case X86::VFNMSUBSDr213r_Int: case X86::VFNMSUBSDr213m_Int:
+ case X86::VFNMSUBSSr213r_Int: case X86::VFNMSUBSSr213m_Int:
+
+ case X86::VFMADDSDr231r_Int: case X86::VFMADDSDr231m_Int:
+ case X86::VFMADDSSr231r_Int: case X86::VFMADDSSr231m_Int:
+ case X86::VFMSUBSDr231r_Int: case X86::VFMSUBSDr231m_Int:
+ case X86::VFMSUBSSr231r_Int: case X86::VFMSUBSSr231m_Int:
+ case X86::VFNMADDSDr231r_Int: case X86::VFNMADDSDr231m_Int:
+ case X86::VFNMADDSSr231r_Int: case X86::VFNMADDSSr231m_Int:
+ case X86::VFNMSUBSDr231r_Int: case X86::VFNMSUBSDr231m_Int:
+ case X86::VFNMSUBSSr231r_Int: case X86::VFNMSUBSSr231m_Int:
+ if (IsIntrinsic)
+ *IsIntrinsic = true;
+ return true;
+ default:
+ return false;
+ }
+ llvm_unreachable("Opcode not handled by the switch");
+}
+
+MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI,
+ bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const {
switch (MI->getOpcode()) {
case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
@@ -2944,7 +3173,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
}
MI->setDesc(get(Opc));
MI->getOperand(3).setImm(Size-Amt);
- return TargetInstrInfo::commuteInstruction(MI, NewMI);
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
}
case X86::BLENDPDrri:
case X86::BLENDPSrri:
@@ -2980,7 +3209,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
NewMI = false;
}
MI->getOperand(3).setImm(Mask ^ Imm);
- return TargetInstrInfo::commuteInstruction(MI, NewMI);
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
}
case X86::PCLMULQDQrr:
case X86::VPCLMULQDQrr:{
@@ -2995,7 +3224,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
NewMI = false;
}
MI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
- return TargetInstrInfo::commuteInstruction(MI, NewMI);
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
}
case X86::CMPPDrri:
case X86::CMPPSrri:
@@ -3016,7 +3245,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
MI = MF.CloneMachineInstr(MI);
NewMI = false;
}
- return TargetInstrInfo::commuteInstruction(MI, NewMI);
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
default:
return nullptr;
}
@@ -3045,7 +3274,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
NewMI = false;
}
MI->getOperand(3).setImm(Imm);
- return TargetInstrInfo::commuteInstruction(MI, NewMI);
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
}
case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr:
case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
@@ -3124,11 +3353,272 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
// Fallthrough intended.
}
default:
- return TargetInstrInfo::commuteInstruction(MI, NewMI);
+ if (isFMA3(MI->getOpcode())) {
+ unsigned Opc = getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2);
+ if (Opc == 0)
+ return nullptr;
+ if (NewMI) {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MI = MF.CloneMachineInstr(MI);
+ NewMI = false;
+ }
+ MI->setDesc(get(Opc));
+ }
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+ }
+}
+
+bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr *MI,
+ unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const {
+
+ unsigned RegOpsNum = isMem(MI, 3) ? 2 : 3;
+
+ // Only the first RegOpsNum operands are commutable.
+ // Also, the value 'CommuteAnyOperandIndex' is valid here as it means
+ // that the operand is not specified/fixed.
+ if (SrcOpIdx1 != CommuteAnyOperandIndex &&
+ (SrcOpIdx1 < 1 || SrcOpIdx1 > RegOpsNum))
+ return false;
+ if (SrcOpIdx2 != CommuteAnyOperandIndex &&
+ (SrcOpIdx2 < 1 || SrcOpIdx2 > RegOpsNum))
+ return false;
+
+ // Look for two different register operands assumed to be commutable
+ // regardless of the FMA opcode. The FMA opcode is adjusted later.
+ if (SrcOpIdx1 == CommuteAnyOperandIndex ||
+ SrcOpIdx2 == CommuteAnyOperandIndex) {
+ unsigned CommutableOpIdx1 = SrcOpIdx1;
+ unsigned CommutableOpIdx2 = SrcOpIdx2;
+
+ // At least one of the operands to be commuted is not specified, and
+ // this method is free to choose appropriate commutable operands.
+ if (SrcOpIdx1 == SrcOpIdx2)
+ // Neither operand is fixed. By default, set one of the commutable
+ // operands to the last register operand of the instruction.
+ CommutableOpIdx2 = RegOpsNum;
+ else if (SrcOpIdx2 == CommuteAnyOperandIndex)
+ // Only one of the operands is not fixed.
+ CommutableOpIdx2 = SrcOpIdx1;
+
+ // CommutableOpIdx2 is well defined now. Let's choose another commutable
+ // operand and assign its index to CommutableOpIdx1.
+ unsigned Op2Reg = MI->getOperand(CommutableOpIdx2).getReg();
+ for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0; CommutableOpIdx1--) {
+ // The commuted operands must have different registers.
+ // Otherwise, the commute transformation does not change anything and
+ // is therefore useless.
+ if (Op2Reg != MI->getOperand(CommutableOpIdx1).getReg())
+ break;
+ }
+
+ // No appropriate commutable operands were found.
+ if (CommutableOpIdx1 == 0)
+ return false;
+
+ // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpIdx2
+ // to return those values.
+ if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
+ CommutableOpIdx1, CommutableOpIdx2))
+ return false;
+ }
+
+ // Check if we can adjust the opcode to preserve the semantics when
+ // commuting the register operands.
+ return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2) != 0;
+}
+
+unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI,
+ unsigned SrcOpIdx1,
+ unsigned SrcOpIdx2) const {
+ unsigned Opc = MI->getOpcode();
+
+ // Define the array that holds the FMA opcodes in groups
+ // of 3 opcodes (132, 213, 231) per group.
+ static const unsigned RegularOpcodeGroups[][3] = {
+ { X86::VFMADDSSr132r, X86::VFMADDSSr213r, X86::VFMADDSSr231r },
+ { X86::VFMADDSDr132r, X86::VFMADDSDr213r, X86::VFMADDSDr231r },
+ { X86::VFMADDPSr132r, X86::VFMADDPSr213r, X86::VFMADDPSr231r },
+ { X86::VFMADDPDr132r, X86::VFMADDPDr213r, X86::VFMADDPDr231r },
+ { X86::VFMADDPSr132rY, X86::VFMADDPSr213rY, X86::VFMADDPSr231rY },
+ { X86::VFMADDPDr132rY, X86::VFMADDPDr213rY, X86::VFMADDPDr231rY },
+ { X86::VFMADDSSr132m, X86::VFMADDSSr213m, X86::VFMADDSSr231m },
+ { X86::VFMADDSDr132m, X86::VFMADDSDr213m, X86::VFMADDSDr231m },
+ { X86::VFMADDPSr132m, X86::VFMADDPSr213m, X86::VFMADDPSr231m },
+ { X86::VFMADDPDr132m, X86::VFMADDPDr213m, X86::VFMADDPDr231m },
+ { X86::VFMADDPSr132mY, X86::VFMADDPSr213mY, X86::VFMADDPSr231mY },
+ { X86::VFMADDPDr132mY, X86::VFMADDPDr213mY, X86::VFMADDPDr231mY },
+
+ { X86::VFMSUBSSr132r, X86::VFMSUBSSr213r, X86::VFMSUBSSr231r },
+ { X86::VFMSUBSDr132r, X86::VFMSUBSDr213r, X86::VFMSUBSDr231r },
+ { X86::VFMSUBPSr132r, X86::VFMSUBPSr213r, X86::VFMSUBPSr231r },
+ { X86::VFMSUBPDr132r, X86::VFMSUBPDr213r, X86::VFMSUBPDr231r },
+ { X86::VFMSUBPSr132rY, X86::VFMSUBPSr213rY, X86::VFMSUBPSr231rY },
+ { X86::VFMSUBPDr132rY, X86::VFMSUBPDr213rY, X86::VFMSUBPDr231rY },
+ { X86::VFMSUBSSr132m, X86::VFMSUBSSr213m, X86::VFMSUBSSr231m },
+ { X86::VFMSUBSDr132m, X86::VFMSUBSDr213m, X86::VFMSUBSDr231m },
+ { X86::VFMSUBPSr132m, X86::VFMSUBPSr213m, X86::VFMSUBPSr231m },
+ { X86::VFMSUBPDr132m, X86::VFMSUBPDr213m, X86::VFMSUBPDr231m },
+ { X86::VFMSUBPSr132mY, X86::VFMSUBPSr213mY, X86::VFMSUBPSr231mY },
+ { X86::VFMSUBPDr132mY, X86::VFMSUBPDr213mY, X86::VFMSUBPDr231mY },
+
+ { X86::VFNMADDSSr132r, X86::VFNMADDSSr213r, X86::VFNMADDSSr231r },
+ { X86::VFNMADDSDr132r, X86::VFNMADDSDr213r, X86::VFNMADDSDr231r },
+ { X86::VFNMADDPSr132r, X86::VFNMADDPSr213r, X86::VFNMADDPSr231r },
+ { X86::VFNMADDPDr132r, X86::VFNMADDPDr213r, X86::VFNMADDPDr231r },
+ { X86::VFNMADDPSr132rY, X86::VFNMADDPSr213rY, X86::VFNMADDPSr231rY },
+ { X86::VFNMADDPDr132rY, X86::VFNMADDPDr213rY, X86::VFNMADDPDr231rY },
+ { X86::VFNMADDSSr132m, X86::VFNMADDSSr213m, X86::VFNMADDSSr231m },
+ { X86::VFNMADDSDr132m, X86::VFNMADDSDr213m, X86::VFNMADDSDr231m },
+ { X86::VFNMADDPSr132m, X86::VFNMADDPSr213m, X86::VFNMADDPSr231m },
+ { X86::VFNMADDPDr132m, X86::VFNMADDPDr213m, X86::VFNMADDPDr231m },
+ { X86::VFNMADDPSr132mY, X86::VFNMADDPSr213mY, X86::VFNMADDPSr231mY },
+ { X86::VFNMADDPDr132mY, X86::VFNMADDPDr213mY, X86::VFNMADDPDr231mY },
+
+ { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr213r, X86::VFNMSUBSSr231r },
+ { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr213r, X86::VFNMSUBSDr231r },
+ { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr213r, X86::VFNMSUBPSr231r },
+ { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr213r, X86::VFNMSUBPDr231r },
+ { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr231rY },
+ { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr231rY },
+ { X86::VFNMSUBSSr132m, X86::VFNMSUBSSr213m, X86::VFNMSUBSSr231m },
+ { X86::VFNMSUBSDr132m, X86::VFNMSUBSDr213m, X86::VFNMSUBSDr231m },
+ { X86::VFNMSUBPSr132m, X86::VFNMSUBPSr213m, X86::VFNMSUBPSr231m },
+ { X86::VFNMSUBPDr132m, X86::VFNMSUBPDr213m, X86::VFNMSUBPDr231m },
+ { X86::VFNMSUBPSr132mY, X86::VFNMSUBPSr213mY, X86::VFNMSUBPSr231mY },
+ { X86::VFNMSUBPDr132mY, X86::VFNMSUBPDr213mY, X86::VFNMSUBPDr231mY },
+
+ { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr231r },
+ { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr231r },
+ { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr231rY },
+ { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr231rY },
+ { X86::VFMADDSUBPSr132m, X86::VFMADDSUBPSr213m, X86::VFMADDSUBPSr231m },
+ { X86::VFMADDSUBPDr132m, X86::VFMADDSUBPDr213m, X86::VFMADDSUBPDr231m },
+ { X86::VFMADDSUBPSr132mY, X86::VFMADDSUBPSr213mY, X86::VFMADDSUBPSr231mY },
+ { X86::VFMADDSUBPDr132mY, X86::VFMADDSUBPDr213mY, X86::VFMADDSUBPDr231mY },
+
+ { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr231r },
+ { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr231r },
+ { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr231rY },
+ { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr231rY },
+ { X86::VFMSUBADDPSr132m, X86::VFMSUBADDPSr213m, X86::VFMSUBADDPSr231m },
+ { X86::VFMSUBADDPDr132m, X86::VFMSUBADDPDr213m, X86::VFMSUBADDPDr231m },
+ { X86::VFMSUBADDPSr132mY, X86::VFMSUBADDPSr213mY, X86::VFMSUBADDPSr231mY },
+ { X86::VFMSUBADDPDr132mY, X86::VFMSUBADDPDr213mY, X86::VFMSUBADDPDr231mY }
+ };
+
+ // Define the array that holds the FMA*_Int opcodes in groups
+ // of 3 opcodes (132, 213, 231) per group.
+ static const unsigned IntrinOpcodeGroups[][3] = {
+ { X86::VFMADDSSr132r_Int, X86::VFMADDSSr213r_Int, X86::VFMADDSSr231r_Int },
+ { X86::VFMADDSDr132r_Int, X86::VFMADDSDr213r_Int, X86::VFMADDSDr231r_Int },
+ { X86::VFMADDSSr132m_Int, X86::VFMADDSSr213m_Int, X86::VFMADDSSr231m_Int },
+ { X86::VFMADDSDr132m_Int, X86::VFMADDSDr213m_Int, X86::VFMADDSDr231m_Int },
+
+ { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr231r_Int },
+ { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr231r_Int },
+ { X86::VFMSUBSSr132m_Int, X86::VFMSUBSSr213m_Int, X86::VFMSUBSSr231m_Int },
+ { X86::VFMSUBSDr132m_Int, X86::VFMSUBSDr213m_Int, X86::VFMSUBSDr231m_Int },
+
+ { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr231r_Int },
+ { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr231r_Int },
+ { X86::VFNMADDSSr132m_Int, X86::VFNMADDSSr213m_Int, X86::VFNMADDSSr231m_Int },
+ { X86::VFNMADDSDr132m_Int, X86::VFNMADDSDr213m_Int, X86::VFNMADDSDr231m_Int },
+
+ { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr231r_Int },
+ { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr231r_Int },
+ { X86::VFNMSUBSSr132m_Int, X86::VFNMSUBSSr213m_Int, X86::VFNMSUBSSr231m_Int },
+ { X86::VFNMSUBSDr132m_Int, X86::VFNMSUBSDr213m_Int, X86::VFNMSUBSDr231m_Int },
+ };
+
+ const unsigned Form132Index = 0;
+ const unsigned Form213Index = 1;
+ const unsigned Form231Index = 2;
+ const unsigned FormsNum = 3;
+
+ bool IsIntrinOpcode;
+ isFMA3(Opc, &IsIntrinOpcode);
+
+ size_t GroupsNum;
+ const unsigned (*OpcodeGroups)[3];
+ if (IsIntrinOpcode) {
+ GroupsNum = array_lengthof(IntrinOpcodeGroups);
+ OpcodeGroups = IntrinOpcodeGroups;
+ } else {
+ GroupsNum = array_lengthof(RegularOpcodeGroups);
+ OpcodeGroups = RegularOpcodeGroups;
+ }
+
+ const unsigned *FoundOpcodesGroup = nullptr;
+ size_t FormIndex;
+
+ // Look for the input opcode in the corresponding opcodes table.
+ for (size_t GroupIndex = 0; GroupIndex < GroupsNum && !FoundOpcodesGroup;
+ ++GroupIndex) {
+ for (FormIndex = 0; FormIndex < FormsNum; ++FormIndex) {
+ if (OpcodeGroups[GroupIndex][FormIndex] == Opc) {
+ FoundOpcodesGroup = OpcodeGroups[GroupIndex];
+ break;
+ }
+ }
}
+
+ // The input opcode does not match any of the opcodes from the tables.
+ // The unsupported FMA opcode must be added to one of the two opcode groups
+ // defined above.
+ assert(FoundOpcodesGroup != nullptr && "Unexpected FMA3 opcode");
+
+ // Put the lowest index into SrcOpIdx1 to simplify the checks below.
+ if (SrcOpIdx1 > SrcOpIdx2)
+ std::swap(SrcOpIdx1, SrcOpIdx2);
+
+ // TODO: Commuting the 1st operand of FMA*_Int requires some additional
+ // analysis. The commute optimization is legal only if all users of FMA*_Int
+ // use only the lowest element of the FMA*_Int instruction. Such an analysis
+ // is not implemented yet, so just return 0 in that case.
+ // When such an analysis becomes available, this will be the right place to
+ // call it.
+ if (IsIntrinOpcode && SrcOpIdx1 == 1)
+ return 0;
+
+ unsigned Case;
+ if (SrcOpIdx1 == 1 && SrcOpIdx2 == 2)
+ Case = 0;
+ else if (SrcOpIdx1 == 1 && SrcOpIdx2 == 3)
+ Case = 1;
+ else if (SrcOpIdx1 == 2 && SrcOpIdx2 == 3)
+ Case = 2;
+ else
+ return 0;
+
+ // Define the FMA forms mapping array that maps the input FMA form to the
+ // output FMA form needed to preserve the operation semantics after
+ // commuting the operands.
+ static const unsigned FormMapping[][3] = {
+ // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
+ // FMA132 A, C, b; ==> FMA231 C, A, b;
+ // FMA213 B, A, c; ==> FMA213 A, B, c;
+ // FMA231 C, A, b; ==> FMA132 A, C, b;
+ { Form231Index, Form213Index, Form132Index },
+ // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
+ // FMA132 A, c, B; ==> FMA132 B, c, A;
+ // FMA213 B, a, C; ==> FMA231 C, a, B;
+ // FMA231 C, a, B; ==> FMA213 B, a, C;
+ { Form132Index, Form231Index, Form213Index },
+ // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
+ // FMA132 a, C, B; ==> FMA213 a, B, C;
+ // FMA213 b, A, C; ==> FMA132 b, C, A;
+ // FMA231 c, A, B; ==> FMA231 c, B, A;
+ { Form213Index, Form132Index, Form231Index }
+ };
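+
+ // Illustrative sketch of one case (using the standard x86 FMA operand
+ // semantics, which are not spelled out here): VFMADDSSr132r dst, a, b
+ // computes dst = dst*b + a. Commuting operands 2 and 3 (Case 2) reorders
+ // the operands to (dst, b, a); FormMapping[2] turns the 132 form into the
+ // 213 form, and VFMADDSSr213r dst, b, a computes dst = b*dst + a, i.e. the
+ // same value as before the commute.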
+
+ // Everything is ready, just adjust the FMA opcode and return it.
+ FormIndex = FormMapping[Case][FormIndex];
+ return FoundOpcodesGroup[FormIndex];
}
-bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
+bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI,
+ unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const {
switch (MI->getOpcode()) {
case X86::CMPPDrri:
@@ -3141,46 +3631,22 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
// Ordered/Unordered/Equal/NotEqual tests
unsigned Imm = MI->getOperand(3).getImm() & 0x7;
switch (Imm) {
- case 0x00: // EQUAL
- case 0x03: // UNORDERED
- case 0x04: // NOT EQUAL
- case 0x07: // ORDERED
- SrcOpIdx1 = 1;
- SrcOpIdx2 = 2;
- return true;
+ case 0x00: // EQUAL
+ case 0x03: // UNORDERED
+ case 0x04: // NOT EQUAL
+ case 0x07: // ORDERED
+ // The indices of the commutable operands are 1 and 2.
+ // Assign them to the returned operand indices here.
+ return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2);
}
return false;
}
- case X86::VFMADDPDr231r:
- case X86::VFMADDPSr231r:
- case X86::VFMADDSDr231r:
- case X86::VFMADDSSr231r:
- case X86::VFMSUBPDr231r:
- case X86::VFMSUBPSr231r:
- case X86::VFMSUBSDr231r:
- case X86::VFMSUBSSr231r:
- case X86::VFNMADDPDr231r:
- case X86::VFNMADDPSr231r:
- case X86::VFNMADDSDr231r:
- case X86::VFNMADDSSr231r:
- case X86::VFNMSUBPDr231r:
- case X86::VFNMSUBPSr231r:
- case X86::VFNMSUBSDr231r:
- case X86::VFNMSUBSSr231r:
- case X86::VFMADDPDr231rY:
- case X86::VFMADDPSr231rY:
- case X86::VFMSUBPDr231rY:
- case X86::VFMSUBPSr231rY:
- case X86::VFNMADDPDr231rY:
- case X86::VFNMADDPSr231rY:
- case X86::VFNMSUBPDr231rY:
- case X86::VFNMSUBPSr231rY:
- SrcOpIdx1 = 2;
- SrcOpIdx2 = 3;
- return true;
default:
+ if (isFMA3(MI->getOpcode()))
+ return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
}
+ return false;
}
static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {
@@ -3821,15 +4287,58 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
return 0;
}
-inline static bool MaskRegClassContains(unsigned Reg) {
+static bool MaskRegClassContains(unsigned Reg) {
return X86::VK8RegClass.contains(Reg) ||
X86::VK16RegClass.contains(Reg) ||
X86::VK32RegClass.contains(Reg) ||
X86::VK64RegClass.contains(Reg) ||
X86::VK1RegClass.contains(Reg);
}
+
+static bool GRRegClassContains(unsigned Reg) {
+ return X86::GR64RegClass.contains(Reg) ||
+ X86::GR32RegClass.contains(Reg) ||
+ X86::GR16RegClass.contains(Reg) ||
+ X86::GR8RegClass.contains(Reg);
+}
+static
+unsigned copyPhysRegOpcode_AVX512_DQ(unsigned& DestReg, unsigned& SrcReg) {
+ if (MaskRegClassContains(SrcReg) && X86::GR8RegClass.contains(DestReg)) {
+ DestReg = getX86SubSuperRegister(DestReg, 32);
+ return X86::KMOVBrk;
+ }
+ if (MaskRegClassContains(DestReg) && X86::GR8RegClass.contains(SrcReg)) {
+ SrcReg = getX86SubSuperRegister(SrcReg, 32);
+ return X86::KMOVBkr;
+ }
+ return 0;
+}
+
static
-unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) {
+unsigned copyPhysRegOpcode_AVX512_BW(unsigned& DestReg, unsigned& SrcReg) {
+ if (MaskRegClassContains(SrcReg) && MaskRegClassContains(DestReg))
+ return X86::KMOVQkk;
+ if (MaskRegClassContains(SrcReg) && X86::GR32RegClass.contains(DestReg))
+ return X86::KMOVDrk;
+ if (MaskRegClassContains(SrcReg) && X86::GR64RegClass.contains(DestReg))
+ return X86::KMOVQrk;
+ if (MaskRegClassContains(DestReg) && X86::GR32RegClass.contains(SrcReg))
+ return X86::KMOVDkr;
+ if (MaskRegClassContains(DestReg) && X86::GR64RegClass.contains(SrcReg))
+ return X86::KMOVQkr;
+ return 0;
+}
+
+static
+unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg,
+ const X86Subtarget &Subtarget)
+{
+ if (Subtarget.hasDQI())
+ if (auto Opc = copyPhysRegOpcode_AVX512_DQ(DestReg, SrcReg))
+ return Opc;
+ if (Subtarget.hasBWI())
+ if (auto Opc = copyPhysRegOpcode_AVX512_BW(DestReg, SrcReg))
+ return Opc;
if (X86::VR128XRegClass.contains(DestReg, SrcReg) ||
X86::VR256XRegClass.contains(DestReg, SrcReg) ||
X86::VR512RegClass.contains(DestReg, SrcReg)) {
@@ -3837,21 +4346,14 @@ unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) {
SrcReg = get512BitSuperRegister(SrcReg);
return X86::VMOVAPSZrr;
}
- if (MaskRegClassContains(DestReg) &&
- MaskRegClassContains(SrcReg))
+ if (MaskRegClassContains(DestReg) && MaskRegClassContains(SrcReg))
return X86::KMOVWkk;
- if (MaskRegClassContains(DestReg) &&
- (X86::GR32RegClass.contains(SrcReg) ||
- X86::GR16RegClass.contains(SrcReg) ||
- X86::GR8RegClass.contains(SrcReg))) {
- SrcReg = getX86SubSuperRegister(SrcReg, MVT::i32);
+ if (MaskRegClassContains(DestReg) && GRRegClassContains(SrcReg)) {
+ SrcReg = getX86SubSuperRegister(SrcReg, 32);
return X86::KMOVWkr;
}
- if ((X86::GR32RegClass.contains(DestReg) ||
- X86::GR16RegClass.contains(DestReg) ||
- X86::GR8RegClass.contains(DestReg)) &&
- MaskRegClassContains(SrcReg)) {
- DestReg = getX86SubSuperRegister(DestReg, MVT::i32);
+ if (GRRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) {
+ DestReg = getX86SubSuperRegister(DestReg, 32);
return X86::KMOVWrk;
}
return 0;
@@ -3886,7 +4388,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
else if (X86::VR64RegClass.contains(DestReg, SrcReg))
Opc = X86::MMX_MOVQ64rr;
else if (HasAVX512)
- Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg);
+ Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg, Subtarget);
else if (X86::VR128RegClass.contains(DestReg, SrcReg))
Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
else if (X86::VR256RegClass.contains(DestReg, SrcReg))
@@ -3900,34 +4402,86 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
- // Moving EFLAGS to / from another register requires a push and a pop.
- // Notice that we have to adjust the stack if we don't want to clobber the
- // first frame index. See X86FrameLowering.cpp - clobbersTheStack.
- if (SrcReg == X86::EFLAGS) {
- if (X86::GR64RegClass.contains(DestReg)) {
- BuildMI(MBB, MI, DL, get(X86::PUSHF64));
- BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg);
+ bool FromEFLAGS = SrcReg == X86::EFLAGS;
+ bool ToEFLAGS = DestReg == X86::EFLAGS;
+ int Reg = FromEFLAGS ? DestReg : SrcReg;
+ bool is32 = X86::GR32RegClass.contains(Reg);
+ bool is64 = X86::GR64RegClass.contains(Reg);
+
+ if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) {
+ int Mov = is64 ? X86::MOV64rr : X86::MOV32rr;
+ int Push = is64 ? X86::PUSH64r : X86::PUSH32r;
+ int PushF = is64 ? X86::PUSHF64 : X86::PUSHF32;
+ int Pop = is64 ? X86::POP64r : X86::POP32r;
+ int PopF = is64 ? X86::POPF64 : X86::POPF32;
+ int AX = is64 ? X86::RAX : X86::EAX;
+
+ if (!Subtarget.hasLAHFSAHF()) {
+ assert(Subtarget.is64Bit() &&
+ "Not having LAHF/SAHF only happens on 64-bit.");
+ // Moving EFLAGS to / from another register requires a push and a pop.
+ // Notice that we have to adjust the stack if we don't want to clobber the
+ // first frame index. See X86FrameLowering.cpp - usesTheStack.
+ if (FromEFLAGS) {
+ BuildMI(MBB, MI, DL, get(PushF));
+ BuildMI(MBB, MI, DL, get(Pop), DestReg);
+ }
+ if (ToEFLAGS) {
+ BuildMI(MBB, MI, DL, get(Push))
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ BuildMI(MBB, MI, DL, get(PopF));
+ }
return;
}
- if (X86::GR32RegClass.contains(DestReg)) {
- BuildMI(MBB, MI, DL, get(X86::PUSHF32));
- BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg);
- return;
+
+ // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is
+ // inefficient. Instead:
+ // - Save the overflow flag OF into AL using SETO, and restore it using a
+ // signed 8-bit addition of AL and INT8_MAX.
+ // - Save/restore the bottom 8 EFLAGS bits (CF, PF, AF, ZF, SF) to/from AH
+ // using LAHF/SAHF.
+ // - When RAX/EAX is live and isn't the destination register, make sure it
+ // isn't clobbered by PUSH/POP'ing it before and after saving/restoring
+ // the flags.
+ // This approach is ~2.25x faster than using PUSHF/POPF.
+ //
+ // This is still somewhat inefficient because we don't know which flags are
+ // actually live inside EFLAGS. Were we able to do a single SETcc instead of
+ // SETO+LAHF / ADDB+SAHF the code could be 1.02x faster.
+ //
+ // PUSHF/POPF is also potentially incorrect because it affects other flags
+ // such as TF/IF/DF, which LLVM doesn't model.
+ //
+ // Notice that we have to adjust the stack if we don't want to clobber the
+ // first frame index. See X86FrameLowering.cpp - usesTheStack.
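+ //
+ // As an illustrative sketch (assuming a 32-bit copy to %ecx while %eax is
+ // live), copying EFLAGS out with this scheme emits roughly:
+ //   pushl %eax
+ //   seto  %al
+ //   lahf
+ //   movl  %eax, %ecx
+ //   popl  %eax
+ // and copying into EFLAGS goes through 'movl %ecx, %eax', 'addb $127, %al',
+ // then 'sahf'.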
+
+
+ bool AXDead = (Reg == AX) ||
+ (MachineBasicBlock::LQR_Dead ==
+ MBB.computeRegisterLiveness(&getRegisterInfo(), AX, MI));
+ if (!AXDead) {
+ // FIXME: If computeRegisterLiveness() reported LQR_Unknown then AX may
+ // actually be dead. This is not a problem for correctness as we are just
+ // (unnecessarily) saving+restoring a dead register. However the
+ // MachineVerifier expects operands that read from dead registers
+ // to be marked with the "undef" flag.
+ BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true));
}
- }
- if (DestReg == X86::EFLAGS) {
- if (X86::GR64RegClass.contains(SrcReg)) {
- BuildMI(MBB, MI, DL, get(X86::PUSH64r))
- .addReg(SrcReg, getKillRegState(KillSrc));
- BuildMI(MBB, MI, DL, get(X86::POPF64));
- return;
+ if (FromEFLAGS) {
+ BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL);
+ BuildMI(MBB, MI, DL, get(X86::LAHF));
+ BuildMI(MBB, MI, DL, get(Mov), Reg).addReg(AX);
}
- if (X86::GR32RegClass.contains(SrcReg)) {
- BuildMI(MBB, MI, DL, get(X86::PUSH32r))
- .addReg(SrcReg, getKillRegState(KillSrc));
- BuildMI(MBB, MI, DL, get(X86::POPF32));
- return;
+ if (ToEFLAGS) {
+ BuildMI(MBB, MI, DL, get(Mov), AX).addReg(Reg, getKillRegState(KillSrc));
+ BuildMI(MBB, MI, DL, get(X86::ADD8ri), X86::AL)
+ .addReg(X86::AL)
+ .addImm(INT8_MAX);
+ BuildMI(MBB, MI, DL, get(X86::SAHF));
}
+ if (!AXDead)
+ BuildMI(MBB, MI, DL, get(Pop), AX);
+ return;
}
DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg)
@@ -4602,9 +5156,8 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
// live-out. If it is live-out, do not optimize.
if ((IsCmpZero || IsSwapped) && !IsSafe) {
MachineBasicBlock *MBB = CmpInstr->getParent();
- for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
- SE = MBB->succ_end(); SI != SE; ++SI)
- if ((*SI)->isLiveIn(X86::EFLAGS))
+ for (MachineBasicBlock *Successor : MBB->successors())
+ if (Successor->isLiveIn(X86::EFLAGS))
return false;
}
@@ -4645,8 +5198,8 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
CmpInstr->eraseFromParent();
// Modify the condition code of instructions in OpsToUpdate.
- for (unsigned i = 0, e = OpsToUpdate.size(); i < e; i++)
- OpsToUpdate[i].first->setDesc(get(OpsToUpdate[i].second));
+ for (auto &Op : OpsToUpdate)
+ Op.first->setDesc(get(Op.second));
return true;
}
@@ -4694,8 +5247,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI,
return nullptr;
// Check whether we can fold the def into SrcOperandId.
- MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, DefMI);
- if (FoldMI) {
+ if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, DefMI)) {
FoldAsLoadDefReg = 0;
return FoldMI;
}
@@ -4725,6 +5277,82 @@ static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
return true;
}
+/// Expand a single-def pseudo instruction to a two-addr
+/// instruction with two %k0 reads.
+/// This is used for mapping:
+/// %k4 = K_SET1
+/// to:
+/// %k4 = KXNORrr %k0, %k0
+static bool Expand2AddrKreg(MachineInstrBuilder &MIB,
+ const MCInstrDesc &Desc, unsigned Reg) {
+ assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
+ MIB->setDesc(Desc);
+ MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
+ return true;
+}
+
+static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
+ bool MinusOne) {
+ MachineBasicBlock &MBB = *MIB->getParent();
+ DebugLoc DL = MIB->getDebugLoc();
+ unsigned Reg = MIB->getOperand(0).getReg();
+
+ // Insert the XOR.
+ BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
+
+ // Turn the pseudo into an INC or DEC.
+ MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r));
+ MIB.addReg(Reg);
+
+ return true;
+}
+
+bool X86InstrInfo::ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const {
+ MachineBasicBlock &MBB = *MIB->getParent();
+ DebugLoc DL = MIB->getDebugLoc();
+ int64_t Imm = MIB->getOperand(1).getImm();
+ assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
+ MachineBasicBlock::iterator I = MIB.getInstr();
+
+ int StackAdjustment;
+
+ if (Subtarget.is64Bit()) {
+ assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
+ MIB->getOpcode() == X86::MOV32ImmSExti8);
+ // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
+ // widen the register if necessary.
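+ // For example (an illustrative reading of the code below),
+ // 'MOV32ImmSExti8 %ecx, 7' in 64-bit mode becomes 'PUSH64i8 7' followed by
+ // 'POP64r %rcx'.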
+ StackAdjustment = 8;
+ BuildMI(MBB, I, DL, get(X86::PUSH64i8)).addImm(Imm);
+ MIB->setDesc(get(X86::POP64r));
+ MIB->getOperand(0)
+ .setReg(getX86SubSuperRegister(MIB->getOperand(0).getReg(), 64));
+ } else {
+ assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
+ StackAdjustment = 4;
+ BuildMI(MBB, I, DL, get(X86::PUSH32i8)).addImm(Imm);
+ MIB->setDesc(get(X86::POP32r));
+ }
+
+ // Build CFI if necessary.
+ MachineFunction &MF = *MBB.getParent();
+ const X86FrameLowering *TFL = Subtarget.getFrameLowering();
+ bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+ bool NeedsDwarfCFI =
+ !IsWin64Prologue &&
+ (MF.getMMI().hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry());
+ bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
+ if (EmitCFI) {
+ TFL->BuildCFI(MBB, I, DL,
+ MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
+ TFL->BuildCFI(MBB, std::next(I), DL,
+ MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment));
+ }
+
+ return true;
+}
+
// LoadStackGuard has so far only been implemented for 64-bit MachO. Different
// code sequence is needed for other targets.
static void expandLoadStackGuard(MachineInstrBuilder &MIB,
@@ -4735,8 +5363,8 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB,
const GlobalValue *GV =
cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
- MachineMemOperand *MMO = MBB.getParent()->
- getMachineMemOperand(MachinePointerInfo::getGOT(), Flag, 8, 8);
+ MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
+ MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 8, 8);
MachineBasicBlock::iterator I = MIB.getInstr();
BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1)
@@ -4753,6 +5381,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
switch (MI->getOpcode()) {
case X86::MOV32r0:
return Expand2AddrUndef(MIB, get(X86::XOR32rr));
+ case X86::MOV32r1:
+ return expandMOV32r1(MIB, *this, /*MinusOne=*/ false);
+ case X86::MOV32r_1:
+ return expandMOV32r1(MIB, *this, /*MinusOne=*/ true);
+ case X86::MOV32ImmSExti8:
+ case X86::MOV64ImmSExti8:
+ return ExpandMOVImmSExti8(MIB);
case X86::SETB_C8r:
return Expand2AddrUndef(MIB, get(X86::SBB8rr));
case X86::SETB_C16r:
@@ -4777,10 +5412,22 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
case X86::TEST8ri_NOREX:
MI->setDesc(get(X86::TEST8ri));
return true;
+
+ // KNL does not recognize dependency-breaking idioms for mask registers,
+ // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1.
+ // Using %k0 as the undef input register is a performance heuristic based
+ // on the assumption that %k0 is used less frequently than the other mask
+ // registers, since it is not usable as a write mask.
+ // FIXME: A more advanced approach would be to choose the best input mask
+ // register based on context.
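+ // For example (the destination register is illustrative), "%k2 = KSET1W"
+ // expands to
+ //   kxnorw %k0, %k0, %k2
+ // rather than "kxnorw %k2, %k2, %k2", which on KNL would carry a false
+ // dependence on the previous producer of %k2.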
case X86::KSET0B:
- case X86::KSET0W: return Expand2AddrUndef(MIB, get(X86::KXORWrr));
+ case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0);
+ case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0);
+ case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0);
case X86::KSET1B:
- case X86::KSET1W: return Expand2AddrUndef(MIB, get(X86::KXNORWrr));
+ case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0);
+ case X86::KSET1D: return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0);
+ case X86::KSET1Q: return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0);
case TargetOpcode::LOAD_STACK_GUARD:
expandLoadStackGuard(MIB, *this);
return true;
@@ -4788,12 +5435,28 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
return false;
}
-static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs) {
+static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
+ int PtrOffset = 0) {
unsigned NumAddrOps = MOs.size();
- for (unsigned i = 0; i != NumAddrOps; ++i)
- MIB.addOperand(MOs[i]);
- if (NumAddrOps < 4) // FrameIndex only
- addOffset(MIB, 0);
+
+ if (NumAddrOps < 4) {
+ // FrameIndex only - add an immediate offset (whether it's zero or not).
+ for (unsigned i = 0; i != NumAddrOps; ++i)
+ MIB.addOperand(MOs[i]);
+ addOffset(MIB, PtrOffset);
+ } else {
+ // General Memory Addressing - we need to add any offset to an existing
+ // offset.
+ assert(MOs.size() == 5 && "Unexpected memory operand list length");
+ for (unsigned i = 0; i != NumAddrOps; ++i) {
+ const MachineOperand &MO = MOs[i];
+ if (i == 3 && PtrOffset != 0) {
+ MIB.addDisp(MO, PtrOffset);
+ } else {
+ MIB.addOperand(MO);
+ }
+ }
+ }
}
static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
@@ -4828,7 +5491,8 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
unsigned OpNo, ArrayRef<MachineOperand> MOs,
MachineBasicBlock::iterator InsertPt,
- MachineInstr *MI, const TargetInstrInfo &TII) {
+ MachineInstr *MI, const TargetInstrInfo &TII,
+ int PtrOffset = 0) {
// Omit the implicit operands, something BuildMI can't do.
MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
MI->getDebugLoc(), true);
@@ -4838,7 +5502,7 @@ static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
MachineOperand &MO = MI->getOperand(i);
if (i == OpNo) {
assert(MO.isReg() && "Expected to fold into reg operand!");
- addOperands(MIB, MOs);
+ addOperands(MIB, MOs, PtrOffset);
} else {
MIB.addOperand(MO);
}
@@ -4860,6 +5524,40 @@ static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
return MIB.addImm(0);
}
+MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
+ MachineFunction &MF, MachineInstr *MI, unsigned OpNum,
+ ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
+ unsigned Size, unsigned Align) const {
+ switch (MI->getOpcode()) {
+ case X86::INSERTPSrr:
+ case X86::VINSERTPSrr:
+ // Attempt to convert the load of the inserted vector into a folded load
+ // of a single float.
+ if (OpNum == 2) {
+ unsigned Imm = MI->getOperand(MI->getNumOperands() - 1).getImm();
+ unsigned ZMask = Imm & 15;
+ unsigned DstIdx = (Imm >> 4) & 3;
+ unsigned SrcIdx = (Imm >> 6) & 3;
+
+ unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize();
+ if (Size <= RCSize && 4 <= Align) {
+ int PtrOffset = SrcIdx * 4;
+ unsigned NewImm = (DstIdx << 4) | ZMask;
+ unsigned NewOpCode =
+ (MI->getOpcode() == X86::VINSERTPSrr ? X86::VINSERTPSrm
+ : X86::INSERTPSrm);
+ MachineInstr *NewMI =
+ FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
+ NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
+ return NewMI;
+ }
+ }
+ break;
+ }
+
+ return nullptr;
+}
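+
+// Illustrative INSERTPS fold performed by the hook above (registers and the
+// address are hypothetical): if %xmm1 was loaded from (%rdi), then
+//   vinsertps $0x90, %xmm1, %xmm0, %xmm0   # SrcIdx=2, DstIdx=1, ZMask=0
+// can be refolded as
+//   vinsertps $0x10, 8(%rdi), %xmm0, %xmm0
+// i.e. the address is advanced by SrcIdx * 4 and the immediate keeps only the
+// DstIdx and ZMask bits, because the memory form reads a single float.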
+
MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr *MI, unsigned OpNum,
ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
@@ -4869,10 +5567,13 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
bool isCallRegIndirect = Subtarget.callRegIndirect();
bool isTwoAddrFold = false;
- // For CPUs that favor the register form of a call,
- // do not fold loads into calls.
- if (isCallRegIndirect &&
- (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r))
+ // For CPUs that favor the register form of a call or push,
+ // do not fold loads into calls or pushes, unless optimizing for size
+ // aggressively.
+ if (isCallRegIndirect && !MF.getFunction()->optForMinSize() &&
+ (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r ||
+ MI->getOpcode() == X86::PUSH16r || MI->getOpcode() == X86::PUSH32r ||
+ MI->getOpcode() == X86::PUSH64r))
return nullptr;
unsigned NumOps = MI->getDesc().getNumOperands();
@@ -4886,6 +5587,12 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
return nullptr;
MachineInstr *NewMI = nullptr;
+
+ // Attempt to fold any custom cases we have.
+ if (MachineInstr *CustomMI =
+ foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align))
+ return CustomMI;
+
// Folding a memory location into the two-address part of a two-address
// instruction is different than folding it other places. It requires
// replacing the *two* registers with the memory location.
@@ -4963,60 +5670,56 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
// If the instruction and target operand are commutable, commute the
// instruction and try again.
if (AllowCommute) {
- unsigned OriginalOpIdx = OpNum, CommuteOpIdx1, CommuteOpIdx2;
+ unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex;
if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
bool HasDef = MI->getDesc().getNumDefs();
unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0;
unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg();
unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg();
- bool Tied0 =
- 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
bool Tied1 =
+ 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
+ bool Tied2 =
0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);
// If either of the commutable operands are tied to the destination
// then we can not commute + fold.
- if ((HasDef && Reg0 == Reg1 && Tied0) ||
- (HasDef && Reg0 == Reg2 && Tied1))
+ if ((HasDef && Reg0 == Reg1 && Tied1) ||
+ (HasDef && Reg0 == Reg2 && Tied2))
return nullptr;
- if ((CommuteOpIdx1 == OriginalOpIdx) ||
- (CommuteOpIdx2 == OriginalOpIdx)) {
- MachineInstr *CommutedMI = commuteInstruction(MI, false);
- if (!CommutedMI) {
- // Unable to commute.
- return nullptr;
- }
- if (CommutedMI != MI) {
- // New instruction. We can't fold from this.
- CommutedMI->eraseFromParent();
- return nullptr;
- }
+ MachineInstr *CommutedMI =
+ commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
+ if (!CommutedMI) {
+ // Unable to commute.
+ return nullptr;
+ }
+ if (CommutedMI != MI) {
+ // New instruction. We can't fold from this.
+ CommutedMI->eraseFromParent();
+ return nullptr;
+ }
- // Attempt to fold with the commuted version of the instruction.
- unsigned CommuteOp =
- (CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1);
- NewMI =
- foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, InsertPt, Size, Align,
- /*AllowCommute=*/false);
- if (NewMI)
- return NewMI;
-
- // Folding failed again - undo the commute before returning.
- MachineInstr *UncommutedMI = commuteInstruction(MI, false);
- if (!UncommutedMI) {
- // Unable to commute.
- return nullptr;
- }
- if (UncommutedMI != MI) {
- // New instruction. It doesn't need to be kept.
- UncommutedMI->eraseFromParent();
- return nullptr;
- }
+ // Attempt to fold with the commuted version of the instruction.
+ NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt,
+ Size, Align, /*AllowCommute=*/false);
+ if (NewMI)
+ return NewMI;
- // Return here to prevent duplicate fuse failure report.
+ // Folding failed again - undo the commute before returning.
+ MachineInstr *UncommutedMI =
+ commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
+ if (!UncommutedMI) {
+ // Unable to commute.
return nullptr;
}
+ if (UncommutedMI != MI) {
+ // New instruction. It doesn't need to be kept.
+ UncommutedMI->eraseFromParent();
+ return nullptr;
+ }
+
+ // Return here to prevent duplicate fuse failure report.
+ return nullptr;
}
}
@@ -5208,13 +5911,14 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
// If MI kills this register, the false dependence is already broken.
if (MI->killsRegister(Reg, TRI))
return;
+
if (X86::VR128RegClass.contains(Reg)) {
// These instructions are all floating point domain, so xorps is the best
// choice.
- bool HasAVX = Subtarget.hasAVX();
- unsigned Opc = HasAVX ? X86::VXORPSrr : X86::XORPSrr;
+ unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr;
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg)
.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
+ MI->addRegisterKilled(Reg, TRI, true);
} else if (X86::VR256RegClass.contains(Reg)) {
// Use vxorps to clear the full ymm register.
// It wants to read and write the xmm sub-register.
@@ -5222,21 +5926,20 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(X86::VXORPSrr), XReg)
.addReg(XReg, RegState::Undef).addReg(XReg, RegState::Undef)
.addReg(Reg, RegState::ImplicitDefine);
- } else
- return;
- MI->addRegisterKilled(Reg, TRI, true);
+ MI->addRegisterKilled(Reg, TRI, true);
+ }
}
MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, int FrameIndex) const {
// Check switch flag
- if (NoFusing) return nullptr;
+ if (NoFusing)
+ return nullptr;
// Unless optimizing for size, don't fold to avoid partial
// register update stalls
- if (!MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) &&
- hasPartialRegUpdate(MI->getOpcode()))
+ if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode()))
return nullptr;
const MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -5303,6 +6006,12 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int:
case X86::MULSSrr_Int: case X86::VMULSSrr_Int:
case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int:
+ case X86::VFMADDSSr132r_Int: case X86::VFNMADDSSr132r_Int:
+ case X86::VFMADDSSr213r_Int: case X86::VFNMADDSSr213r_Int:
+ case X86::VFMADDSSr231r_Int: case X86::VFNMADDSSr231r_Int:
+ case X86::VFMSUBSSr132r_Int: case X86::VFNMSUBSSr132r_Int:
+ case X86::VFMSUBSSr213r_Int: case X86::VFNMSUBSSr213r_Int:
+ case X86::VFMSUBSSr231r_Int: case X86::VFNMSUBSSr231r_Int:
return false;
default:
return true;
@@ -5318,6 +6027,12 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int:
case X86::MULSDrr_Int: case X86::VMULSDrr_Int:
case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int:
+ case X86::VFMADDSDr132r_Int: case X86::VFNMADDSDr132r_Int:
+ case X86::VFMADDSDr213r_Int: case X86::VFNMADDSDr213r_Int:
+ case X86::VFMADDSDr231r_Int: case X86::VFNMADDSDr231r_Int:
+ case X86::VFMSUBSDr132r_Int: case X86::VFNMSUBSDr132r_Int:
+ case X86::VFMSUBSDr213r_Int: case X86::VFNMSUBSDr213r_Int:
+ case X86::VFMSUBSDr231r_Int: case X86::VFNMSUBSDr231r_Int:
return false;
default:
return true;
@@ -5342,10 +6057,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
// Check switch flag
if (NoFusing) return nullptr;
- // Unless optimizing for size, don't fold to avoid partial
- // register update stalls
- if (!MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) &&
- hasPartialRegUpdate(MI->getOpcode()))
+ // Avoid partial register update stalls unless optimizing for size.
+ if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode()))
return nullptr;
// Determine the alignment of the load.
@@ -5460,62 +6173,6 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
/*Size=*/0, Alignment, /*AllowCommute=*/true);
}
-bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
- ArrayRef<unsigned> Ops) const {
- // Check switch flag
- if (NoFusing) return 0;
-
- if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
- switch (MI->getOpcode()) {
- default: return false;
- case X86::TEST8rr:
- case X86::TEST16rr:
- case X86::TEST32rr:
- case X86::TEST64rr:
- return true;
- case X86::ADD32ri:
- // FIXME: AsmPrinter doesn't know how to handle
- // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
- if (MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
- return false;
- break;
- }
- }
-
- if (Ops.size() != 1)
- return false;
-
- unsigned OpNum = Ops[0];
- unsigned Opc = MI->getOpcode();
- unsigned NumOps = MI->getDesc().getNumOperands();
- bool isTwoAddr = NumOps > 1 &&
- MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;
-
- // Folding a memory location into the two-address part of a two-address
- // instruction is different than folding it other places. It requires
- // replacing the *two* registers with the memory location.
- const DenseMap<unsigned,
- std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
- if (isTwoAddr && NumOps >= 2 && OpNum < 2) {
- OpcodeTablePtr = &RegOp2MemOpTable2Addr;
- } else if (OpNum == 0) {
- if (Opc == X86::MOV32r0)
- return true;
-
- OpcodeTablePtr = &RegOp2MemOpTable0;
- } else if (OpNum == 1) {
- OpcodeTablePtr = &RegOp2MemOpTable1;
- } else if (OpNum == 2) {
- OpcodeTablePtr = &RegOp2MemOpTable2;
- } else if (OpNum == 3) {
- OpcodeTablePtr = &RegOp2MemOpTable3;
- }
-
- if (OpcodeTablePtr && OpcodeTablePtr->count(Opc))
- return true;
- return TargetInstrInfo::canFoldMemoryOperand(MI, Ops);
-}
-
bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
SmallVectorImpl<MachineInstr*> &NewMIs) const {
@@ -5536,9 +6193,10 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
const MCInstrDesc &MCID = get(Opc);
const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
+ // TODO: Check if 32-byte or greater accesses are slow too?
if (!MI->hasOneMemOperand() &&
RC == &X86::VR128RegClass &&
- !Subtarget.isUnalignedMemAccessFast())
+ Subtarget.isUnalignedMem16Slow())
// Without memoperands, loadRegFromAddr and storeRegToStackSlot will
// conservatively assume the address is unaligned. That's bad for
// performance.
@@ -5582,20 +6240,19 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
if (FoldedStore)
MIB.addReg(Reg, RegState::Define);
- for (unsigned i = 0, e = BeforeOps.size(); i != e; ++i)
- MIB.addOperand(BeforeOps[i]);
+ for (MachineOperand &BeforeOp : BeforeOps)
+ MIB.addOperand(BeforeOp);
if (FoldedLoad)
MIB.addReg(Reg);
- for (unsigned i = 0, e = AfterOps.size(); i != e; ++i)
- MIB.addOperand(AfterOps[i]);
- for (unsigned i = 0, e = ImpOps.size(); i != e; ++i) {
- MachineOperand &MO = ImpOps[i];
- MIB.addReg(MO.getReg(),
- getDefRegState(MO.isDef()) |
+ for (MachineOperand &AfterOp : AfterOps)
+ MIB.addOperand(AfterOp);
+ for (MachineOperand &ImpOp : ImpOps) {
+ MIB.addReg(ImpOp.getReg(),
+ getDefRegState(ImpOp.isDef()) |
RegState::Implicit |
- getKillRegState(MO.isKill()) |
- getDeadRegState(MO.isDead()) |
- getUndefRegState(MO.isUndef()));
+ getKillRegState(ImpOp.isKill()) |
+ getDeadRegState(ImpOp.isDead()) |
+ getUndefRegState(ImpOp.isUndef()));
}
// Change CMP32ri r, 0 back to TEST32rr r, r, etc.
switch (DataMI->getOpcode()) {
@@ -5686,9 +6343,11 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
cast<MachineSDNode>(N)->memoperands_end());
if (!(*MMOs.first) &&
RC == &X86::VR128RegClass &&
- !Subtarget.isUnalignedMemAccessFast())
+ Subtarget.isUnalignedMem16Slow())
// Do not introduce a slow unaligned load.
return false;
+ // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
+ // memory access is slow above.
unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
bool isAligned = (*MMOs.first) &&
(*MMOs.first)->getAlignment() >= Alignment;
@@ -5729,9 +6388,11 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
cast<MachineSDNode>(N)->memoperands_end());
if (!(*MMOs.first) &&
RC == &X86::VR128RegClass &&
- !Subtarget.isUnalignedMemAccessFast())
+ Subtarget.isUnalignedMem16Slow())
// Do not introduce a slow unaligned store.
return false;
+ // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
+ // memory access is slow above.
unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
bool isAligned = (*MMOs.first) &&
(*MMOs.first)->getAlignment() >= Alignment;
@@ -6192,16 +6853,16 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = {
// domains, but they require a bit more work than just switching opcodes.
static const uint16_t *lookup(unsigned opcode, unsigned domain) {
- for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i)
- if (ReplaceableInstrs[i][domain-1] == opcode)
- return ReplaceableInstrs[i];
+ for (const uint16_t (&Row)[3] : ReplaceableInstrs)
+ if (Row[domain-1] == opcode)
+ return Row;
return nullptr;
}
static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) {
- for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i)
- if (ReplaceableInstrsAVX2[i][domain-1] == opcode)
- return ReplaceableInstrsAVX2[i];
+ for (const uint16_t (&Row)[3] : ReplaceableInstrsAVX2)
+ if (Row[domain-1] == opcode)
+ return Row;
return nullptr;
}
@@ -6347,230 +7008,181 @@ hasHighOperandLatency(const TargetSchedModel &SchedModel,
return isHighLatencyDef(DefMI->getOpcode());
}
-static bool hasVirtualRegDefsInBasicBlock(const MachineInstr &Inst,
- const MachineBasicBlock *MBB) {
- assert(Inst.getNumOperands() == 3 && "Reassociation needs binary operators");
- const MachineOperand &Op1 = Inst.getOperand(1);
- const MachineOperand &Op2 = Inst.getOperand(2);
- const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
-
- // We need virtual register definitions.
- MachineInstr *MI1 = nullptr;
- MachineInstr *MI2 = nullptr;
- if (Op1.isReg() && TargetRegisterInfo::isVirtualRegister(Op1.getReg()))
- MI1 = MRI.getUniqueVRegDef(Op1.getReg());
- if (Op2.isReg() && TargetRegisterInfo::isVirtualRegister(Op2.getReg()))
- MI2 = MRI.getUniqueVRegDef(Op2.getReg());
-
- // And they need to be in the trace (otherwise, they won't have a depth).
- if (MI1 && MI2 && MI1->getParent() == MBB && MI2->getParent() == MBB)
- return true;
+bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst,
+ const MachineBasicBlock *MBB) const {
+ assert((Inst.getNumOperands() == 3 || Inst.getNumOperands() == 4) &&
+ "Reassociation needs binary operators");
+
+ // Integer binary math/logic instructions have a third source operand:
+ // the EFLAGS register. That operand must be both defined here and never
+ // used; i.e., it must be dead. If the EFLAGS operand is live, then we can
+ // not change anything because rearranging the operands could affect other
+ // instructions that depend on the exact status flags (zero, sign, etc.)
+ // that are set by using these particular operands with this operation.
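+ // For illustration (pseudo-MIR; virtual register names are hypothetical):
+ //   %2 = ADD32rr %0, %1, implicit-def dead %eflags
+ // may be reassociated, whereas the same ADD with a live EFLAGS def (e.g. one
+ // feeding a later SETcc or JCC) must be left alone.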
+ if (Inst.getNumOperands() == 4) {
+ assert(Inst.getOperand(3).isReg() &&
+ Inst.getOperand(3).getReg() == X86::EFLAGS &&
+ "Unexpected operand in reassociable instruction");
+ if (!Inst.getOperand(3).isDead())
+ return false;
+ }
- return false;
-}
-
-static bool hasReassocSibling(const MachineInstr &Inst, bool &Commuted) {
- const MachineBasicBlock *MBB = Inst.getParent();
- const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- MachineInstr *MI1 = MRI.getUniqueVRegDef(Inst.getOperand(1).getReg());
- MachineInstr *MI2 = MRI.getUniqueVRegDef(Inst.getOperand(2).getReg());
- unsigned AssocOpcode = Inst.getOpcode();
-
- // If only one operand has the same opcode and it's the second source operand,
- // the operands must be commuted.
- Commuted = MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode;
- if (Commuted)
- std::swap(MI1, MI2);
-
- // 1. The previous instruction must be the same type as Inst.
- // 2. The previous instruction must have virtual register definitions for its
- // operands in the same basic block as Inst.
- // 3. The previous instruction's result must only be used by Inst.
- if (MI1->getOpcode() == AssocOpcode &&
- hasVirtualRegDefsInBasicBlock(*MI1, MBB) &&
- MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg()))
- return true;
-
- return false;
+ return TargetInstrInfo::hasReassociableOperands(Inst, MBB);
}
// TODO: There are many more machine instruction opcodes to match:
// 1. Other data types (integer, vectors)
-// 2. Other math / logic operations (and, or)
-static bool isAssociativeAndCommutative(unsigned Opcode) {
- switch (Opcode) {
+// 2. Other math / logic operations (xor, or)
+// 3. Other forms of the same operation (intrinsics and other variants)
+bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
+ switch (Inst.getOpcode()) {
+ case X86::AND8rr:
+ case X86::AND16rr:
+ case X86::AND32rr:
+ case X86::AND64rr:
+ case X86::OR8rr:
+ case X86::OR16rr:
+ case X86::OR32rr:
+ case X86::OR64rr:
+ case X86::XOR8rr:
+ case X86::XOR16rr:
+ case X86::XOR32rr:
+ case X86::XOR64rr:
+ case X86::IMUL16rr:
+ case X86::IMUL32rr:
+ case X86::IMUL64rr:
+ case X86::PANDrr:
+ case X86::PORrr:
+ case X86::PXORrr:
+ case X86::VPANDrr:
+ case X86::VPANDYrr:
+ case X86::VPORrr:
+ case X86::VPORYrr:
+ case X86::VPXORrr:
+ case X86::VPXORYrr:
+ // Normal min/max instructions are not commutative because of NaN and signed
+ // zero semantics, but these are. Thus, there's no need to check for global
+ // relaxed math; the instructions themselves have the properties we need.
+ case X86::MAXCPDrr:
+ case X86::MAXCPSrr:
+ case X86::MAXCSDrr:
+ case X86::MAXCSSrr:
+ case X86::MINCPDrr:
+ case X86::MINCPSrr:
+ case X86::MINCSDrr:
+ case X86::MINCSSrr:
+ case X86::VMAXCPDrr:
+ case X86::VMAXCPSrr:
+ case X86::VMAXCPDYrr:
+ case X86::VMAXCPSYrr:
+ case X86::VMAXCSDrr:
+ case X86::VMAXCSSrr:
+ case X86::VMINCPDrr:
+ case X86::VMINCPSrr:
+ case X86::VMINCPDYrr:
+ case X86::VMINCPSYrr:
+ case X86::VMINCSDrr:
+ case X86::VMINCSSrr:
+ return true;
+ case X86::ADDPDrr:
+ case X86::ADDPSrr:
case X86::ADDSDrr:
case X86::ADDSSrr:
- case X86::VADDSDrr:
- case X86::VADDSSrr:
+ case X86::MULPDrr:
+ case X86::MULPSrr:
case X86::MULSDrr:
case X86::MULSSrr:
+ case X86::VADDPDrr:
+ case X86::VADDPSrr:
+ case X86::VADDPDYrr:
+ case X86::VADDPSYrr:
+ case X86::VADDSDrr:
+ case X86::VADDSSrr:
+ case X86::VMULPDrr:
+ case X86::VMULPSrr:
+ case X86::VMULPDYrr:
+ case X86::VMULPSYrr:
case X86::VMULSDrr:
case X86::VMULSSrr:
- return true;
+ return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
default:
return false;
}
}
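+
+// Illustrative payoff of the reassociation hooks above (names are
+// hypothetical): for the chain
+//   t1 = a * b; t2 = t1 * c; t3 = t2 * d
+// the machine combiner can rewrite the last two operations as
+//   t2 = c * d; t3 = t1 * t2
+// reducing the dependent chain from three multiplies deep to two; for the FP
+// opcodes this is only done under unsafe-fp-math, as checked above.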
-/// Return true if the input instruction is part of a chain of dependent ops
-/// that are suitable for reassociation, otherwise return false.
-/// If the instruction's operands must be commuted to have a previous
-/// instruction of the same type define the first source operand, Commuted will
-/// be set to true.
-static bool isReassocCandidate(const MachineInstr &Inst, bool &Commuted) {
- // 1. The operation must be associative and commutative.
- // 2. The instruction must have virtual register definitions for its
- // operands in the same basic block.
- // 3. The instruction must have a reassociable sibling.
- if (isAssociativeAndCommutative(Inst.getOpcode()) &&
- hasVirtualRegDefsInBasicBlock(Inst, Inst.getParent()) &&
- hasReassocSibling(Inst, Commuted))
- return true;
-
- return false;
-}
-
-// FIXME: This has the potential to be expensive (compile time) while not
-// improving the code at all. Some ways to limit the overhead:
-// 1. Track successful transforms; bail out if hit rate gets too low.
-// 2. Only enable at -O3 or some other non-default optimization level.
-// 3. Pre-screen pattern candidates here: if an operand of the previous
-// instruction is known to not increase the critical path, then don't match
-// that pattern.
-bool X86InstrInfo::getMachineCombinerPatterns(MachineInstr &Root,
- SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns) const {
- if (!Root.getParent()->getParent()->getTarget().Options.UnsafeFPMath)
- return false;
-
- // TODO: There is nothing x86-specific here except the instruction type.
- // This logic could be hoisted into the machine combiner pass itself.
-
- // Look for this reassociation pattern:
- // B = A op X (Prev)
- // C = B op Y (Root)
-
- bool Commute;
- if (isReassocCandidate(Root, Commute)) {
- // We found a sequence of instructions that may be suitable for a
- // reassociation of operands to increase ILP. Specify each commutation
- // possibility for the Prev instruction in the sequence and let the
- // machine combiner decide if changing the operands is worthwhile.
- if (Commute) {
- Patterns.push_back(MachineCombinerPattern::MC_REASSOC_AX_YB);
- Patterns.push_back(MachineCombinerPattern::MC_REASSOC_XA_YB);
- } else {
- Patterns.push_back(MachineCombinerPattern::MC_REASSOC_AX_BY);
- Patterns.push_back(MachineCombinerPattern::MC_REASSOC_XA_BY);
- }
- return true;
- }
+/// This is an architecture-specific helper function of reassociateOps.
+/// Set special operand attributes for new instructions after reassociation.
+void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1,
+ MachineInstr &OldMI2,
+ MachineInstr &NewMI1,
+ MachineInstr &NewMI2) const {
+ // Integer instructions define an implicit EFLAGS source register operand as
+ // the third source (fourth total) operand.
+ if (OldMI1.getNumOperands() != 4 || OldMI2.getNumOperands() != 4)
+ return;
- return false;
+ assert(NewMI1.getNumOperands() == 4 && NewMI2.getNumOperands() == 4 &&
+ "Unexpected instruction type for reassociation");
+
+ MachineOperand &OldOp1 = OldMI1.getOperand(3);
+ MachineOperand &OldOp2 = OldMI2.getOperand(3);
+ MachineOperand &NewOp1 = NewMI1.getOperand(3);
+ MachineOperand &NewOp2 = NewMI2.getOperand(3);
+
+ assert(OldOp1.isReg() && OldOp1.getReg() == X86::EFLAGS && OldOp1.isDead() &&
+ "Must have dead EFLAGS operand in reassociable instruction");
+ assert(OldOp2.isReg() && OldOp2.getReg() == X86::EFLAGS && OldOp2.isDead() &&
+ "Must have dead EFLAGS operand in reassociable instruction");
+
+ (void)OldOp1;
+ (void)OldOp2;
+
+ assert(NewOp1.isReg() && NewOp1.getReg() == X86::EFLAGS &&
+ "Unexpected operand in reassociable instruction");
+ assert(NewOp2.isReg() && NewOp2.getReg() == X86::EFLAGS &&
+ "Unexpected operand in reassociable instruction");
+
+ // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations
+ // of this pass or other passes. The EFLAGS operands must be dead in these new
+ // instructions because the EFLAGS operands in the original instructions must
+ // be dead in order for reassociation to occur.
+ NewOp1.setIsDead();
+ NewOp2.setIsDead();
}
-/// Attempt the following reassociation to reduce critical path length:
-/// B = A op X (Prev)
-/// C = B op Y (Root)
-/// ===>
-/// B = X op Y
-/// C = A op B
-static void reassociateOps(MachineInstr &Root, MachineInstr &Prev,
- MachineCombinerPattern::MC_PATTERN Pattern,
- SmallVectorImpl<MachineInstr *> &InsInstrs,
- SmallVectorImpl<MachineInstr *> &DelInstrs,
- DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
- MachineFunction *MF = Root.getParent()->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
- const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
- const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
- const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI);
-
- // This array encodes the operand index for each parameter because the
- // operands may be commuted. Each row corresponds to a pattern value,
- // and each column specifies the index of A, B, X, Y.
- unsigned OpIdx[4][4] = {
- { 1, 1, 2, 2 },
- { 1, 2, 2, 1 },
- { 2, 1, 1, 2 },
- { 2, 2, 1, 1 }
- };
-
- MachineOperand &OpA = Prev.getOperand(OpIdx[Pattern][0]);
- MachineOperand &OpB = Root.getOperand(OpIdx[Pattern][1]);
- MachineOperand &OpX = Prev.getOperand(OpIdx[Pattern][2]);
- MachineOperand &OpY = Root.getOperand(OpIdx[Pattern][3]);
- MachineOperand &OpC = Root.getOperand(0);
-
- unsigned RegA = OpA.getReg();
- unsigned RegB = OpB.getReg();
- unsigned RegX = OpX.getReg();
- unsigned RegY = OpY.getReg();
- unsigned RegC = OpC.getReg();
-
- if (TargetRegisterInfo::isVirtualRegister(RegA))
- MRI.constrainRegClass(RegA, RC);
- if (TargetRegisterInfo::isVirtualRegister(RegB))
- MRI.constrainRegClass(RegB, RC);
- if (TargetRegisterInfo::isVirtualRegister(RegX))
- MRI.constrainRegClass(RegX, RC);
- if (TargetRegisterInfo::isVirtualRegister(RegY))
- MRI.constrainRegClass(RegY, RC);
- if (TargetRegisterInfo::isVirtualRegister(RegC))
- MRI.constrainRegClass(RegC, RC);
-
- // Create a new virtual register for the result of (X op Y) instead of
- // recycling RegB because the MachineCombiner's computation of the critical
- // path requires a new register definition rather than an existing one.
- unsigned NewVR = MRI.createVirtualRegister(RC);
- InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
-
- unsigned Opcode = Root.getOpcode();
- bool KillA = OpA.isKill();
- bool KillX = OpX.isKill();
- bool KillY = OpY.isKill();
-
- // Create new instructions for insertion.
- MachineInstrBuilder MIB1 =
- BuildMI(*MF, Prev.getDebugLoc(), TII->get(Opcode), NewVR)
- .addReg(RegX, getKillRegState(KillX))
- .addReg(RegY, getKillRegState(KillY));
- InsInstrs.push_back(MIB1);
-
- MachineInstrBuilder MIB2 =
- BuildMI(*MF, Root.getDebugLoc(), TII->get(Opcode), RegC)
- .addReg(RegA, getKillRegState(KillA))
- .addReg(NewVR, getKillRegState(true));
- InsInstrs.push_back(MIB2);
-
- // Record old instructions for deletion.
- DelInstrs.push_back(&Prev);
- DelInstrs.push_back(&Root);
+std::pair<unsigned, unsigned>
+X86InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+ return std::make_pair(TF, 0u);
}
-void X86InstrInfo::genAlternativeCodeSequence(
- MachineInstr &Root,
- MachineCombinerPattern::MC_PATTERN Pattern,
- SmallVectorImpl<MachineInstr *> &InsInstrs,
- SmallVectorImpl<MachineInstr *> &DelInstrs,
- DenseMap<unsigned, unsigned> &InstIdxForVirtReg) const {
- MachineRegisterInfo &MRI = Root.getParent()->getParent()->getRegInfo();
-
- // Select the previous instruction in the sequence based on the input pattern.
- MachineInstr *Prev = nullptr;
- switch (Pattern) {
- case MachineCombinerPattern::MC_REASSOC_AX_BY:
- case MachineCombinerPattern::MC_REASSOC_XA_BY:
- Prev = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
- break;
- case MachineCombinerPattern::MC_REASSOC_AX_YB:
- case MachineCombinerPattern::MC_REASSOC_XA_YB:
- Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
- }
- assert(Prev && "Unknown pattern for machine combiner");
-
- reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, InstIdxForVirtReg);
- return;
+ArrayRef<std::pair<unsigned, const char *>>
+X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+ using namespace X86II;
+ static const std::pair<unsigned, const char *> TargetFlags[] = {
+ {MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"},
+ {MO_PIC_BASE_OFFSET, "x86-pic-base-offset"},
+ {MO_GOT, "x86-got"},
+ {MO_GOTOFF, "x86-gotoff"},
+ {MO_GOTPCREL, "x86-gotpcrel"},
+ {MO_PLT, "x86-plt"},
+ {MO_TLSGD, "x86-tlsgd"},
+ {MO_TLSLD, "x86-tlsld"},
+ {MO_TLSLDM, "x86-tlsldm"},
+ {MO_GOTTPOFF, "x86-gottpoff"},
+ {MO_INDNTPOFF, "x86-indntpoff"},
+ {MO_TPOFF, "x86-tpoff"},
+ {MO_DTPOFF, "x86-dtpoff"},
+ {MO_NTPOFF, "x86-ntpoff"},
+ {MO_GOTNTPOFF, "x86-gotntpoff"},
+ {MO_DLLIMPORT, "x86-dllimport"},
+ {MO_DARWIN_STUB, "x86-darwin-stub"},
+ {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"},
+ {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"},
+ {MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE, "x86-darwin-hidden-nonlazy-pic-base"},
+ {MO_TLVP, "x86-tlvp"},
+ {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"},
+ {MO_SECREL, "x86-secrel"}};
+ return makeArrayRef(TargetFlags);
}
namespace {
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index bf63336c7005..9d40334206b2 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -23,22 +23,10 @@
#include "X86GenInstrInfo.inc"
namespace llvm {
+ class MachineInstrBuilder;
class X86RegisterInfo;
class X86Subtarget;
- namespace MachineCombinerPattern {
- enum MC_PATTERN : int {
- // These are commutative variants for reassociating a computation chain
- // of the form:
- // B = A op X (Prev)
- // C = B op Y (Root)
- MC_REASSOC_AX_BY = 0,
- MC_REASSOC_AX_YB = 1,
- MC_REASSOC_XA_BY = 2,
- MC_REASSOC_XA_YB = 3,
- };
- } // end namespace MachineCombinerPattern
-
namespace X86 {
// X86 specific condition code. These correspond to X86_*_COND in
// X86InstrInfo.td. They must be kept in synch.
@@ -259,14 +247,64 @@ public:
MachineBasicBlock::iterator &MBBI,
LiveVariables *LV) const override;
- /// commuteInstruction - We have a few instructions that must be hacked on to
- /// commute them.
+ /// Returns true iff the routine could find two commutable operands in the
+ /// given machine instruction.
+ /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
+ /// input values can be re-defined by this method only if they are not
+ /// pre-defined, which is designated by assigning them the special value
+ /// 'CommuteAnyOperandIndex'.
+ /// If both indices are pre-defined and refer to some operands, then the
+ /// method simply returns true if the corresponding operands are commutable
+ /// and returns false otherwise.
///
- MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const override;
-
+ /// For example, calling this method this way:
+ /// unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
+ /// findCommutedOpIndices(MI, Op1, Op2);
+ /// can be interpreted as a query asking to find an operand that would be
+ /// commutable with operand #1.
bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const override;
+ /// Returns true if the routine could find two commutable operands
+ /// in the given FMA instruction. Otherwise, returns false.
+ ///
+ /// \p SrcOpIdx1 and \p SrcOpIdx2 are INPUT and OUTPUT arguments.
+ /// The output indices of the commuted operands are returned in these
+ /// arguments. Also, the input values of these arguments may be preset either
+ /// to indices of operands that must be commuted or to the special value
+ /// 'CommuteAnyOperandIndex', which means that the corresponding
+ /// operand index is not set and this method is free to pick any of the
+ /// available commutable operands.
+ ///
+ /// For example, calling this method this way:
+ /// unsigned Idx1 = 1, Idx2 = CommuteAnyOperandIndex;
+ /// findFMA3CommutedOpIndices(MI, Idx1, Idx2);
+ /// can be interpreted as a query asking whether operand #1 can be swapped
+ /// with any other available operand (e.g. operand #2, operand #3, etc.).
+ ///
+ /// The returned FMA opcode may differ from the opcode in the given MI.
+ /// For example, commuting the operands #1 and #3 in the following FMA
+ /// FMA213 #1, #2, #3
+ /// results in an instruction with an adjusted opcode:
+ /// FMA231 #3, #2, #1
+ bool findFMA3CommutedOpIndices(MachineInstr *MI,
+ unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const;
+
+ /// Returns an adjusted FMA opcode that must be used in FMA instruction that
+ /// performs the same computations as the given MI but which has the operands
+ /// \p SrcOpIdx1 and \p SrcOpIdx2 commuted.
+ /// It may return 0 if it is unsafe to commute the operands.
+ ///
+ /// The returned FMA opcode may differ from the opcode in the given \p MI.
+ /// For example, commuting the operands #1 and #3 in the following FMA
+ /// FMA213 #1, #2, #3
+ /// results in an instruction with an adjusted opcode:
+ /// FMA231 #3, #2, #1
+ unsigned getFMA3OpcodeToCommuteOperands(MachineInstr *MI,
+ unsigned SrcOpIdx1,
+ unsigned SrcOpIdx2) const;
+
// Branch analysis.
bool isUnpredicatedTerminator(const MachineInstr* MI) const override;
bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
@@ -342,11 +380,6 @@ public:
MachineBasicBlock::iterator InsertPt,
MachineInstr *LoadMI) const override;
- /// canFoldMemoryOperand - Returns true if the specified load / store is
- /// folding is possible.
- bool canFoldMemoryOperand(const MachineInstr *,
- ArrayRef<unsigned>) const override;
-
/// unfoldMemoryOperand - Separate a single instruction which folded a load or
/// a store or a load and a store into two or more instructions. If this is
/// possible, returns true as well as the new instructions by reference.
@@ -406,10 +439,9 @@ public:
bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const;
- static bool isX86_64ExtendedReg(const MachineOperand &MO) {
- if (!MO.isReg()) return false;
- return X86II::isX86_64ExtendedReg(MO.getReg());
- }
+ /// True if MI has a condition code def, e.g. EFLAGS, that is
+ /// not marked dead.
+ bool hasLiveCondCodeDef(MachineInstr *MI) const;
/// getGlobalBaseReg - Return a virtual register initialized with the
/// global base register value. Output instructions required to
@@ -452,26 +484,19 @@ public:
const MachineInstr *DefMI, unsigned DefIdx,
const MachineInstr *UseMI,
unsigned UseIdx) const override;
-
bool useMachineCombiner() const override {
return true;
}
-
- /// Return true when there is potentially a faster code sequence
- /// for an instruction chain ending in <Root>. All potential patterns are
- /// output in the <Pattern> array.
- bool getMachineCombinerPatterns(
- MachineInstr &Root,
- SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &P) const override;
-
- /// When getMachineCombinerPatterns() finds a pattern, this function generates
- /// the instructions that could replace the original code sequence.
- void genAlternativeCodeSequence(
- MachineInstr &Root, MachineCombinerPattern::MC_PATTERN P,
- SmallVectorImpl<MachineInstr *> &InsInstrs,
- SmallVectorImpl<MachineInstr *> &DelInstrs,
- DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
+
+ bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
+
+ bool hasReassociableOperands(const MachineInstr &Inst,
+ const MachineBasicBlock *MBB) const override;
+
+ void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2,
+ MachineInstr &NewMI1,
+ MachineInstr &NewMI2) const override;
/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2 if having two register operands, and the value it
@@ -500,16 +525,49 @@ public:
unsigned &FoldAsLoadDefReg,
MachineInstr *&DefMI) const override;
+ std::pair<unsigned, unsigned>
+ decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableDirectMachineOperandTargetFlags() const override;
+
+protected:
+ /// Commutes the operands in the given instruction by changing the operands
+ /// order and/or changing the instruction's opcode and/or the immediate value
+ /// operand.
+ ///
+ /// The arguments 'CommuteOpIdx1' and 'CommuteOpIdx2' specify the operands
+ /// to be commuted.
+ ///
+ /// Do not call this method for a non-commutable instruction or
+ /// non-commutable operands.
+ /// Even if the instruction is commutable, the method may still
+ /// fail to commute the operands; a null pointer is returned in such cases.
+ MachineInstr *commuteInstructionImpl(MachineInstr *MI, bool NewMI,
+ unsigned CommuteOpIdx1,
+ unsigned CommuteOpIdx2) const override;
+
private:
MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc,
MachineFunction::iterator &MFI,
MachineBasicBlock::iterator &MBBI,
LiveVariables *LV) const;
+ /// Handles memory folding for special case instructions, for instance those
+ /// requiring custom manipulation of the address.
+ MachineInstr *foldMemoryOperandCustom(MachineFunction &MF, MachineInstr *MI,
+ unsigned OpNum,
+ ArrayRef<MachineOperand> MOs,
+ MachineBasicBlock::iterator InsertPt,
+ unsigned Size, unsigned Align) const;
+
/// isFrameOperand - Return true and the FrameIndex if the specified
/// operand and follow operands form a reference to the stack frame.
bool isFrameOperand(const MachineInstr *MI, unsigned int Op,
int &FrameIndex) const;
+
+ /// Expand the MOVImmSExti8 pseudo-instructions.
+ bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const;
};
} // End llvm namespace
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 52bab9c79b45..f4ca2b880bad 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -106,8 +106,6 @@ def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
-def SDT_X86WIN_FTOL : SDTypeProfile<0, 1, [SDTCisFP<0>]>;
-
def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
@@ -158,6 +156,8 @@ def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair,
def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def X86iret : SDNode<"X86ISD::IRET", SDTX86Ret,
+ [SDNPHasChain, SDNPOptInGlue]>;
def X86vastart_save_xmm_regs :
SDNode<"X86ISD::VASTART_SAVE_XMM_REGS",
@@ -250,9 +250,6 @@ def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA,
def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
-def X86WinFTOL : SDNode<"X86ISD::WIN_FTOL", SDT_X86WIN_FTOL,
- [SDNPHasChain, SDNPOutGlue]>;
-
//===----------------------------------------------------------------------===//
// X86 Operand Definitions.
//
@@ -344,18 +341,21 @@ def vy64xmem : X86VMemOperand<VR256X, "printi64mem", X86MemVY64XOperand>;
def vz32mem : X86VMemOperand<VR512, "printi32mem", X86MemVZ32Operand>;
def vz64mem : X86VMemOperand<VR512, "printi64mem", X86MemVZ64Operand>;
-// A version of i8mem for use on x86-64 that uses GR64_NOREX instead of
-// plain GR64, so that it doesn't potentially require a REX prefix.
-def i8mem_NOREX : Operand<i64> {
+// A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead
+// of a plain GPR, so that it doesn't potentially require a REX prefix.
+def ptr_rc_norex : PointerLikeRegClass<2>;
+def ptr_rc_norex_nosp : PointerLikeRegClass<3>;
+
+def i8mem_NOREX : Operand<iPTR> {
let PrintMethod = "printi8mem";
- let MIOperandInfo = (ops GR64_NOREX, i8imm, GR64_NOREX_NOSP, i32imm, i8imm);
+ let MIOperandInfo = (ops ptr_rc_norex, i8imm, ptr_rc_norex_nosp, i32imm, i8imm);
let ParserMatchClass = X86Mem8AsmOperand;
let OperandType = "OPERAND_MEMORY";
}
// GPRs available for tailcall.
// It represents GR32_TC, GR64_TC or GR64_TCW64.
-def ptr_rc_tailcall : PointerLikeRegClass<2>;
+def ptr_rc_tailcall : PointerLikeRegClass<4>;
// Special i32mem for addresses of load folding tail calls. These are not
// allowed to use callee-saved registers since they must be scheduled
@@ -697,34 +697,34 @@ def lea64mem : Operand<i64> {
// X86 Complex Pattern Definitions.
//
-// Define X86 specific addressing mode.
-def addr : ComplexPattern<iPTR, 5, "SelectAddr", [], [SDNPWantParent]>;
-def lea32addr : ComplexPattern<i32, 5, "SelectLEAAddr",
+// Define X86-specific addressing mode.
+def addr : ComplexPattern<iPTR, 5, "selectAddr", [], [SDNPWantParent]>;
+def lea32addr : ComplexPattern<i32, 5, "selectLEAAddr",
[add, sub, mul, X86mul_imm, shl, or, frameindex],
[]>;
// In 64-bit mode 32-bit LEAs can use RIP-relative addressing.
-def lea64_32addr : ComplexPattern<i32, 5, "SelectLEA64_32Addr",
+def lea64_32addr : ComplexPattern<i32, 5, "selectLEA64_32Addr",
[add, sub, mul, X86mul_imm, shl, or,
frameindex, X86WrapperRIP],
[]>;
-def tls32addr : ComplexPattern<i32, 5, "SelectTLSADDRAddr",
+def tls32addr : ComplexPattern<i32, 5, "selectTLSADDRAddr",
[tglobaltlsaddr], []>;
-def tls32baseaddr : ComplexPattern<i32, 5, "SelectTLSADDRAddr",
+def tls32baseaddr : ComplexPattern<i32, 5, "selectTLSADDRAddr",
[tglobaltlsaddr], []>;
-def lea64addr : ComplexPattern<i64, 5, "SelectLEAAddr",
+def lea64addr : ComplexPattern<i64, 5, "selectLEAAddr",
[add, sub, mul, X86mul_imm, shl, or, frameindex,
X86WrapperRIP], []>;
-def tls64addr : ComplexPattern<i64, 5, "SelectTLSADDRAddr",
+def tls64addr : ComplexPattern<i64, 5, "selectTLSADDRAddr",
[tglobaltlsaddr], []>;
-def tls64baseaddr : ComplexPattern<i64, 5, "SelectTLSADDRAddr",
+def tls64baseaddr : ComplexPattern<i64, 5, "selectTLSADDRAddr",
[tglobaltlsaddr], []>;
-def vectoraddr : ComplexPattern<iPTR, 5, "SelectVectorAddr", [],[SDNPWantParent]>;
+def vectoraddr : ComplexPattern<iPTR, 5, "selectVectorAddr", [],[SDNPWantParent]>;
//===----------------------------------------------------------------------===//
// X86 Instruction Predicate Definitions.
@@ -767,12 +767,21 @@ def HasDQI : Predicate<"Subtarget->hasDQI()">,
def NoDQI : Predicate<"!Subtarget->hasDQI()">;
def HasBWI : Predicate<"Subtarget->hasBWI()">,
AssemblerPredicate<"FeatureBWI", "AVX-512 BW ISA">;
+def NoBWI : Predicate<"!Subtarget->hasBWI()">;
def HasVLX : Predicate<"Subtarget->hasVLX()">,
AssemblerPredicate<"FeatureVLX", "AVX-512 VL ISA">;
def NoVLX : Predicate<"!Subtarget->hasVLX()">;
+def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">;
+def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">;
+def PKU : Predicate<"!Subtarget->hasPKU()">;
def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
def HasAES : Predicate<"Subtarget->hasAES()">;
+def HasFXSR : Predicate<"Subtarget->hasFXSR()">;
+def HasXSAVE : Predicate<"Subtarget->hasXSAVE()">;
+def HasXSAVEOPT : Predicate<"Subtarget->hasXSAVEOPT()">;
+def HasXSAVEC : Predicate<"Subtarget->hasXSAVEC()">;
+def HasXSAVES : Predicate<"Subtarget->hasXSAVES()">;
def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">;
def HasFMA : Predicate<"Subtarget->hasFMA()">;
def UseFMAOnAVX : Predicate<"Subtarget->hasFMA() && !Subtarget->hasAVX512()">;
@@ -794,6 +803,7 @@ def HasSHA : Predicate<"Subtarget->hasSHA()">;
def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">;
def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">;
def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">;
+def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">;
def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
def HasMPX : Predicate<"Subtarget->hasMPX()">;
@@ -812,6 +822,8 @@ def In32BitMode : Predicate<"Subtarget->is32Bit()">,
AssemblerPredicate<"Mode32Bit", "32-bit mode">;
def IsWin64 : Predicate<"Subtarget->isTargetWin64()">;
def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">;
+def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||"
+ "Subtarget->getFrameLowering()->hasFP(*MF)">;
def IsPS4 : Predicate<"Subtarget->isTargetPS4()">;
def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">;
def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
@@ -825,6 +837,7 @@ def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||"
def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">;
def IsNotPIC : Predicate<"TM.getRelocationModel() != Reloc::PIC_">;
def OptForSize : Predicate<"OptForSize">;
+def OptForMinSize : Predicate<"OptForMinSize">;
def OptForSpeed : Predicate<"!OptForSize">;
def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">;
def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
@@ -867,20 +880,54 @@ def X86_COND_E_OR_NE : ImmLeaf<i8, [{
}]>;
-def i16immSExt8 : ImmLeaf<i16, [{ return Imm == (int8_t)Imm; }]>;
-def i32immSExt8 : ImmLeaf<i32, [{ return Imm == (int8_t)Imm; }]>;
-def i64immSExt8 : ImmLeaf<i64, [{ return Imm == (int8_t)Imm; }]>;
+def i16immSExt8 : ImmLeaf<i16, [{ return isInt<8>(Imm); }]>;
+def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>;
+def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>;
+
+// If we have multiple users of an immediate, it's much smaller to reuse
+// the register, rather than encode the immediate in every instruction.
+// This has the risk of increasing register pressure from stretched live
+// ranges; however, the immediates should be trivial to rematerialize by
+// the RA in the event of high register pressure.
+// TODO : This is currently enabled for stores and binary ops. There are more
+// cases for which this can be enabled, though this catches the bulk of the
+// issues.
+// TODO2 : This should really also be enabled under O2, but there's currently
+// an issue with RA where we don't pull the constants into their users
+// when we rematerialize them. I'll follow up on enabling O2 after we fix that
+// issue.
+// TODO3 : This is currently limited to single basic blocks (DAG creation
+// pulls block immediates to the top and merges them if necessary).
+// Eventually, it would be nice to allow ConstantHoisting to merge constants
+// globally for potentially added savings.
+//
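+// For illustration (AT&T syntax; registers and the constant are examples
+// only), when optimizing for size the two stores
+//   movl $0x12345678, (%rdi)
+//   movl $0x12345678, (%rsi)
+// encode smaller when the immediate is materialized once:
+//   movl $0x12345678, %eax
+//   movl %eax, (%rdi)
+//   movl %eax, (%rsi)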
+def imm8_su : PatLeaf<(i8 imm), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def imm16_su : PatLeaf<(i16 imm), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def imm32_su : PatLeaf<(i32 imm), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+
+def i16immSExt8_su : PatLeaf<(i16immSExt8), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i32immSExt8_su : PatLeaf<(i32immSExt8), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
-def i64immSExt32 : ImmLeaf<i64, [{ return Imm == (int32_t)Imm; }]>;
+def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>;
// i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
// unsigned field.
-def i64immZExt32 : ImmLeaf<i64, [{ return (uint64_t)Imm == (uint32_t)Imm; }]>;
+def i64immZExt32 : ImmLeaf<i64, [{ return isUInt<32>(Imm); }]>;
def i64immZExt32SExt8 : ImmLeaf<i64, [{
- return (uint64_t)Imm == (uint32_t)Imm && (int32_t)Imm == (int8_t)Imm;
+ return isUInt<32>(Imm) && isInt<8>(static_cast<int32_t>(Imm));
}]>;
// Helper fragments for loads.
@@ -914,11 +961,12 @@ def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
return false;
}]>;
-def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>;
-def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>;
-def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>;
-def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>;
-def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>;
+def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>;
+def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>;
+def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>;
+def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>;
+def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>;
+def loadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr))>;
def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>;
def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>;
@@ -1020,12 +1068,8 @@ def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>;
def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
IIC_PUSH_REG>, OpSize16;
-def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[],
- IIC_PUSH_MEM>, OpSize16;
def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>;
-def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[],
- IIC_PUSH_MEM>, OpSize32, Requires<[Not64BitMode]>;
def PUSH16i8 : Ii8<0x6a, RawFrm, (outs), (ins i16i8imm:$imm),
"push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16;
@@ -1039,6 +1083,14 @@ def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm),
"push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
Requires<[Not64BitMode]>;
} // mayStore, SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in {
+def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[],
+ IIC_PUSH_MEM>, OpSize16;
+def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[],
+ IIC_PUSH_MEM>, OpSize32, Requires<[Not64BitMode]>;
+} // mayLoad, mayStore, SchedRW
+
}
let Defs = [ESP, EFLAGS], Uses = [ESP], mayLoad = 1, hasSideEffects=0,
@@ -1071,9 +1123,11 @@ def PUSH64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", [],
IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>;
def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", [],
IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>;
+} // mayStore, SchedRW
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in {
def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [],
IIC_PUSH_MEM>, OpSize32, Requires<[In64BitMode]>;
-} // mayStore, SchedRW
+} // mayLoad, mayStore, SchedRW
}
let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1,
@@ -1275,13 +1329,13 @@ def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src),
let SchedRW = [WriteStore] in {
def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
- [(store (i8 imm:$src), addr:$dst)], IIC_MOV_MEM>;
+ [(store (i8 imm8_su:$src), addr:$dst)], IIC_MOV_MEM>;
def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
- [(store (i16 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize16;
+ [(store (i16 imm16_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize16;
def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(store (i32 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32;
+ [(store (i32 imm32_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32;
def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
[(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>;
@@ -1457,10 +1511,12 @@ def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
let SchedRW = [WriteALU] in {
let Defs = [EFLAGS], Uses = [AH] in
def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf",
- [(set EFLAGS, (X86sahf AH))], IIC_AHF>;
+ [(set EFLAGS, (X86sahf AH))], IIC_AHF>,
+ Requires<[HasLAHFSAHF]>;
let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in
def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [],
- IIC_AHF>; // AH = flags
+ IIC_AHF>, // AH = flags
+ Requires<[HasLAHFSAHF]>;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -1894,37 +1950,38 @@ def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB;
}
// Table lookup instructions
+let Uses = [AL,EBX], Defs = [AL], hasSideEffects = 0, mayLoad = 1 in
def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>,
Sched<[WriteLoad]>;
let SchedRW = [WriteMicrocoded] in {
// ASCII Adjust After Addition
-// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS
+let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in
def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", [], IIC_AAA>,
Requires<[Not64BitMode]>;
// ASCII Adjust AX Before Division
-// sets AL, AH and EFLAGS and uses AL and AH
+let Uses = [AX], Defs = [AX,EFLAGS], hasSideEffects = 0 in
def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src),
"aad\t$src", [], IIC_AAD>, Requires<[Not64BitMode]>;
// ASCII Adjust AX After Multiply
-// sets AL, AH and EFLAGS and uses AL
+let Uses = [AL], Defs = [AX,EFLAGS], hasSideEffects = 0 in
def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src),
"aam\t$src", [], IIC_AAM>, Requires<[Not64BitMode]>;
// ASCII Adjust AL After Subtraction - sets
-// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS
+let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in
def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", [], IIC_AAS>,
Requires<[Not64BitMode]>;
// Decimal Adjust AL after Addition
-// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS
+let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in
def DAA : I<0x27, RawFrm, (outs), (ins), "daa", [], IIC_DAA>,
Requires<[Not64BitMode]>;
// Decimal Adjust AL after Subtraction
-// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS
+let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in
def DAS : I<0x2F, RawFrm, (outs), (ins), "das", [], IIC_DAS>,
Requires<[Not64BitMode]>;
} // SchedRW
@@ -2357,6 +2414,32 @@ defm TZMSK : tbm_binary_intr<0x01, "tzmsk", MRM4r, MRM4m>;
} // HasTBM, EFLAGS
//===----------------------------------------------------------------------===//
+// MONITORX/MWAITX Instructions
+//
+let SchedRW = [WriteSystem] in {
+let Uses = [EAX, ECX, EDX] in
+def MONITORXrrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", [],
+ IIC_SSE_MONITOR>, TB;
+let Uses = [ECX, EAX, EBX] in
+def MWAITXrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx", [], IIC_SSE_MWAIT>,
+ TB;
+} // SchedRW
+
+def : InstAlias<"mwaitx\t{%eax, %ecx, %ebx|ebx, ecx, eax}", (MWAITXrr)>, Requires<[Not64BitMode]>;
+def : InstAlias<"mwaitx\t{%rax, %rcx, %rbx|rbx, rcx, rax}", (MWAITXrr)>, Requires<[In64BitMode]>;
+
+def : InstAlias<"monitorx\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORXrrr)>,
+ Requires<[Not64BitMode]>;
+def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORXrrr)>,
+ Requires<[In64BitMode]>;
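// (Informal note, following AMD's description: monitorx takes the address to monitor
// in rAX/EAX with extensions in ECX and hints in EDX, while mwaitx takes hints in EAX,
// extensions in ECX and an optional timer value in EBX, matching the Uses lists above.)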
+
+//===----------------------------------------------------------------------===//
+// CLZERO Instruction
+//
+let Uses = [EAX] in
+def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", []>, TB;
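// (Informal note: per AMD's description, clzero zeroes the cache line addressed by
// rAX/EAX, hence the Uses = [EAX] above.)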
+
+//===----------------------------------------------------------------------===//
// Pattern fragments to auto generate TBM instructions.
//===----------------------------------------------------------------------===//
@@ -2498,8 +2581,8 @@ def : MnemonicAlias<"lret", "lretl", "att">, Requires<[Not16BitMode]>;
def : MnemonicAlias<"leavel", "leave", "att">, Requires<[Not64BitMode]>;
def : MnemonicAlias<"leaveq", "leave", "att">, Requires<[In64BitMode]>;
-def : MnemonicAlias<"loopz", "loope", "att">;
-def : MnemonicAlias<"loopnz", "loopne", "att">;
+def : MnemonicAlias<"loopz", "loope">;
+def : MnemonicAlias<"loopnz", "loopne">;
def : MnemonicAlias<"pop", "popw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"pop", "popl", "att">, Requires<[In32BitMode]>;
@@ -2532,14 +2615,15 @@ def : MnemonicAlias<"pusha", "pushaw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"popa", "popal", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"pusha", "pushal", "att">, Requires<[In32BitMode]>;
-def : MnemonicAlias<"repe", "rep", "att">;
-def : MnemonicAlias<"repz", "rep", "att">;
-def : MnemonicAlias<"repnz", "repne", "att">;
+def : MnemonicAlias<"repe", "rep">;
+def : MnemonicAlias<"repz", "rep">;
+def : MnemonicAlias<"repnz", "repne">;
def : MnemonicAlias<"ret", "retw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"ret", "retl", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"ret", "retq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"sal", "shl", "intel">;
def : MnemonicAlias<"salb", "shlb", "att">;
def : MnemonicAlias<"salw", "shlw", "att">;
def : MnemonicAlias<"sall", "shll", "att">;
@@ -2579,14 +2663,14 @@ def : MnemonicAlias<"fcmova", "fcmovnbe", "att">;
def : MnemonicAlias<"fcmovnae", "fcmovb", "att">;
def : MnemonicAlias<"fcmovna", "fcmovbe", "att">;
def : MnemonicAlias<"fcmovae", "fcmovnb", "att">;
-def : MnemonicAlias<"fcomip", "fcompi", "att">;
+def : MnemonicAlias<"fcomip", "fcompi">;
def : MnemonicAlias<"fildq", "fildll", "att">;
def : MnemonicAlias<"fistpq", "fistpll", "att">;
def : MnemonicAlias<"fisttpq", "fisttpll", "att">;
def : MnemonicAlias<"fldcww", "fldcw", "att">;
def : MnemonicAlias<"fnstcww", "fnstcw", "att">;
def : MnemonicAlias<"fnstsww", "fnstsw", "att">;
-def : MnemonicAlias<"fucomip", "fucompi", "att">;
+def : MnemonicAlias<"fucomip", "fucompi">;
def : MnemonicAlias<"fwait", "wait">;
def : MnemonicAlias<"fxsaveq", "fxsave64", "att">;
@@ -2594,7 +2678,9 @@ def : MnemonicAlias<"fxrstorq", "fxrstor64", "att">;
def : MnemonicAlias<"xsaveq", "xsave64", "att">;
def : MnemonicAlias<"xrstorq", "xrstor64", "att">;
def : MnemonicAlias<"xsaveoptq", "xsaveopt64", "att">;
-
+def : MnemonicAlias<"xrstorsq", "xrstors64", "att">;
+def : MnemonicAlias<"xsavecq", "xsavec64", "att">;
+def : MnemonicAlias<"xsavesq", "xsaves64", "att">;
class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond,
string VariantName>
@@ -2640,8 +2726,8 @@ defm : IntegerCondCodeMnemonicAlias<"cmov", "", "intel">;
//===----------------------------------------------------------------------===//
// aad/aam default to base 10 if no operand is specified.
-def : InstAlias<"aad", (AAD8i8 10)>;
-def : InstAlias<"aam", (AAM8i8 10)>;
+def : InstAlias<"aad", (AAD8i8 10)>, Requires<[Not64BitMode]>;
+def : InstAlias<"aam", (AAM8i8 10)>, Requires<[Not64BitMode]>;
// Disambiguate the mem/imm form of bt-without-a-suffix as btl.
// Likewise for btc/btr/bts.
@@ -2719,8 +2805,10 @@ def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64m i64mem:$src)>;
// Various unary fpstack operations default to operating on ST1.
// For example, "fxch" -> "fxch %st(1)"
def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>;
+def : InstAlias<"fadd", (ADD_FPrST0 ST1), 0>;
def : InstAlias<"fsub{|r}p", (SUBR_FPrST0 ST1), 0>;
def : InstAlias<"fsub{r|}p", (SUB_FPrST0 ST1), 0>;
+def : InstAlias<"fmul", (MUL_FPrST0 ST1), 0>;
def : InstAlias<"fmulp", (MUL_FPrST0 ST1), 0>;
def : InstAlias<"fdiv{|r}p", (DIVR_FPrST0 ST1), 0>;
def : InstAlias<"fdiv{r|}p", (DIV_FPrST0 ST1), 0>;
@@ -2798,20 +2886,20 @@ def : InstAlias<"jmp {*}$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16Bit
// "imul <imm>, B" is an alias for "imul <imm>, B, B".
-def : InstAlias<"imulw {$imm, $r|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>;
-def : InstAlias<"imulw {$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>;
-def : InstAlias<"imull {$imm, $r|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>;
-def : InstAlias<"imull {$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>;
-def : InstAlias<"imulq {$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>;
-def : InstAlias<"imulq {$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>;
+def : InstAlias<"imul{w} {$imm, $r|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>;
+def : InstAlias<"imul{w} {$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>;
+def : InstAlias<"imul{l} {$imm, $r|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>;
+def : InstAlias<"imul{l} {$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>;
+def : InstAlias<"imul{q} {$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>;
+def : InstAlias<"imul{q} {$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>;
// inb %dx -> inb %al, %dx
def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>;
def : InstAlias<"inw\t{%dx|dx}", (IN16rr), 0>;
def : InstAlias<"inl\t{%dx|dx}", (IN32rr), 0>;
-def : InstAlias<"inb\t$port", (IN8ri i8imm:$port), 0>;
-def : InstAlias<"inw\t$port", (IN16ri i8imm:$port), 0>;
-def : InstAlias<"inl\t$port", (IN32ri i8imm:$port), 0>;
+def : InstAlias<"inb\t$port", (IN8ri u8imm:$port), 0>;
+def : InstAlias<"inw\t$port", (IN16ri u8imm:$port), 0>;
+def : InstAlias<"inl\t$port", (IN32ri u8imm:$port), 0>;
// jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp
@@ -2861,9 +2949,9 @@ def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX64rr16_Q GR64:$dst, GR16:
def : InstAlias<"outb\t{%dx|dx}", (OUT8rr), 0>;
def : InstAlias<"outw\t{%dx|dx}", (OUT16rr), 0>;
def : InstAlias<"outl\t{%dx|dx}", (OUT32rr), 0>;
-def : InstAlias<"outb\t$port", (OUT8ir i8imm:$port), 0>;
-def : InstAlias<"outw\t$port", (OUT16ir i8imm:$port), 0>;
-def : InstAlias<"outl\t$port", (OUT32ir i8imm:$port), 0>;
+def : InstAlias<"outb\t$port", (OUT8ir u8imm:$port), 0>;
+def : InstAlias<"outw\t$port", (OUT16ir u8imm:$port), 0>;
+def : InstAlias<"outl\t$port", (OUT32ir u8imm:$port), 0>;
// 'sldt <mem>' can be encoded with either sldtw or sldtq with the same
// effect (both store to a 16-bit mem). Force to sldtw to avoid ambiguity
@@ -2940,3 +3028,34 @@ def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}",
def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}",
(XCHG32ar64 GR32_NOAX:$src), 0>, Requires<[In64BitMode]>;
def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src), 0>;
+
+// These aliases exist to get the parser to prioritize matching 8-bit
+// immediate encodings over matching the implicit ax/eax/rax encodings. By
+// explicitly mentioning the A register here, these entries will be ordered
+// first due to the more explicit immediate type.
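+// (Rough illustration, AT&T syntax: without these entries "add $4, %ax" can match the
+// AX-specific form with a 16-bit immediate; with them it matches ADD16ri8 and emits
+// the shorter sign-extended 8-bit immediate encoding.)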
+def : InstAlias<"adc{w}\t{$imm, %ax|ax, $imm}", (ADC16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"add{w}\t{$imm, %ax|ax, $imm}", (ADD16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"and{w}\t{$imm, %ax|ax, $imm}", (AND16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"cmp{w}\t{$imm, %ax|ax, $imm}", (CMP16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"or{w}\t{$imm, %ax|ax, $imm}", (OR16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"sbb{w}\t{$imm, %ax|ax, $imm}", (SBB16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"sub{w}\t{$imm, %ax|ax, $imm}", (SUB16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"xor{w}\t{$imm, %ax|ax, $imm}", (XOR16ri8 AX, i16i8imm:$imm), 0>;
+
+def : InstAlias<"adc{l}\t{$imm, %eax|eax, $imm}", (ADC32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"add{l}\t{$imm, %eax|eax, $imm}", (ADD32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"and{l}\t{$imm, %eax|eax, $imm}", (AND32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"cmp{l}\t{$imm, %eax|eax, $imm}", (CMP32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"or{l}\t{$imm, %eax|eax, $imm}", (OR32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"sbb{l}\t{$imm, %eax|eax, $imm}", (SBB32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"sub{l}\t{$imm, %eax|eax, $imm}", (SUB32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"xor{l}\t{$imm, %eax|eax, $imm}", (XOR32ri8 EAX, i32i8imm:$imm), 0>;
+
+def : InstAlias<"adc{q}\t{$imm, %rax|rax, $imm}", (ADC64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"add{q}\t{$imm, %rax|rax, $imm}", (ADD64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"and{q}\t{$imm, %rax|rax, $imm}", (AND64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"cmp{q}\t{$imm, %rax|rax, $imm}", (CMP64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"or{q}\t{$imm, %rax|rax, $imm}", (OR64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"sbb{q}\t{$imm, %rax|rax, $imm}", (SBB64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"sub{q}\t{$imm, %rax|rax, $imm}", (SUB64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"xor{q}\t{$imm, %rax|rax, $imm}", (XOR64ri8 RAX, i64i8imm:$imm), 0>;
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index eaa7894004cb..11dc1e7d466b 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -249,6 +249,7 @@ def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src),
(MMX_X86movd2w (x86mmx VR64:$src)))],
IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>;
+let isBitcast = 1 in
def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR64:$dst, (bitconvert GR64:$src))],
@@ -262,7 +263,7 @@ def MMX_MOVD64to64rm : MMXRI<0x6E, MRMSrcMem, (outs VR64:$dst),
// These are 64 bit moves, but since the OS X assembler doesn't
// recognize a register-register movq, we write them as
// movd.
-let SchedRW = [WriteMove] in {
+let SchedRW = [WriteMove], isBitcast = 1 in {
def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg,
(outs GR64:$dst), (ins VR64:$src),
"movd\t{$src, $dst|$dst, $src}",
@@ -303,7 +304,7 @@ def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
(ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
(x86mmx (bitconvert
- (i64 (vector_extract (v2i64 VR128:$src),
+ (i64 (extractelt (v2i64 VR128:$src),
(iPTR 0))))))],
IIC_MMX_MOVQ_RR>;
@@ -326,6 +327,7 @@ def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
}
} // SchedRW
+let Predicates = [HasSSE1] in
def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
"movntq\t{$src, $dst|$dst, $src}",
[(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)],
@@ -355,6 +357,7 @@ defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w,
MMX_INTALU_ITINS, 1>;
defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d,
MMX_INTALU_ITINS, 1>;
+let Predicates = [HasSSE2] in
defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q,
MMX_INTALUQ_ITINS, 1>;
defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b,
@@ -382,6 +385,7 @@ defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w,
MMX_INTALU_ITINS>;
defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d,
MMX_INTALU_ITINS>;
+let Predicates = [HasSSE2] in
defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q,
MMX_INTALUQ_ITINS>;
@@ -408,8 +412,10 @@ defm MMX_PMULLW : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w,
defm MMX_PMULHW : MMXI_binop_rm_int<0xE5, "pmulhw", int_x86_mmx_pmulh_w,
MMX_PMUL_ITINS, 1>;
+let Predicates = [HasSSE1] in
defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w,
MMX_PMUL_ITINS, 1>;
+let Predicates = [HasSSE2] in
defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq,
MMX_PMUL_ITINS, 1>;
let isCommutable = 1 in
@@ -422,6 +428,7 @@ defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd,
defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw",
int_x86_ssse3_pmadd_ub_sw, MMX_PMUL_ITINS>;
+let Predicates = [HasSSE1] in {
defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b,
MMX_MISC_FUNC_ITINS, 1>;
defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w,
@@ -439,6 +446,7 @@ defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w,
defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw,
MMX_PSADBW_ITINS, 1>;
+}
defm MMX_PSIGNB : SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b,
MMX_MISC_FUNC_ITINS>;
@@ -594,6 +602,7 @@ let Constraints = "$src1 = $dst" in {
}
// Extract / Insert
+let Predicates = [HasSSE1] in
def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg,
(outs GR32orGR64:$dst), (ins VR64:$src1, i32u8imm:$src2),
"pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -601,6 +610,7 @@ def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg,
imm:$src2))],
IIC_MMX_PEXTR>, Sched<[WriteShuffle]>;
let Constraints = "$src1 = $dst" in {
+let Predicates = [HasSSE1] in {
def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg,
(outs VR64:$dst),
(ins VR64:$src1, GR32orGR64:$src2, i32u8imm:$src3),
@@ -618,8 +628,10 @@ let Constraints = "$src1 = $dst" in {
imm:$src3))],
IIC_MMX_PINSRW>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}
+}
// Mask creation
+let Predicates = [HasSSE1] in
def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
(ins VR64:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
@@ -639,12 +651,12 @@ def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))),
// Misc.
let SchedRW = [WriteShuffle] in {
-let Uses = [EDI] in
+let Uses = [EDI], Predicates = [HasSSE1,In32BitMode] in
def MMX_MASKMOVQ : MMXI32<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
"maskmovq\t{$mask, $src|$src, $mask}",
[(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)],
IIC_MMX_MASKMOV>;
-let Uses = [RDI] in
+let Uses = [RDI], Predicates = [HasSSE1,In64BitMode] in
def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
"maskmovq\t{$mask, $src|$src, $mask}",
[(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)],
@@ -653,10 +665,6 @@ def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
// 64-bit bit convert.
let Predicates = [HasSSE2] in {
-def : Pat<(x86mmx (bitconvert (i64 GR64:$src))),
- (MMX_MOVD64to64rr GR64:$src)>;
-def : Pat<(i64 (bitconvert (x86mmx VR64:$src))),
- (MMX_MOVD64from64rr VR64:$src)>;
def : Pat<(f64 (bitconvert (x86mmx VR64:$src))),
(MMX_MOVQ2FR64rr VR64:$src)>;
def : Pat<(x86mmx (bitconvert (f64 FR64:$src))),
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 99386b0658ad..7a44212bd829 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -330,9 +330,9 @@ multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
//===----------------------------------------------------------------------===//
// A vector extract of the first f32/f64 position is a subregister copy
-def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
(COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
-def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
(COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;
// A 128-bit subvector extract from the first 256-bit vector position
@@ -413,6 +413,8 @@ let Predicates = [HasSSE2] in {
def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(f128 (bitconvert (i128 FR128:$src))), (f128 FR128:$src)>;
+ def : Pat<(i128 (bitconvert (f128 FR128:$src))), (i128 FR128:$src)>;
}
// Bitcasts between 256-bit vector types. Return the original type since
@@ -650,10 +652,10 @@ let Predicates = [UseAVX] in {
}
// Extract and store.
- def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+ def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
addr:$dst),
(VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
- def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+ def : Pat<(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
addr:$dst),
(VMOVSDmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64))>;
@@ -736,7 +738,7 @@ let Predicates = [UseSSE1] in {
}
// Extract and store.
- def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+ def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
addr:$dst),
(MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;
@@ -770,7 +772,7 @@ let Predicates = [UseSSE2] in {
}
// Extract and store.
- def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+ def : Pat<(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
addr:$dst),
(MOVSDmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR64))>;
@@ -935,22 +937,6 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
}
-let Predicates = [HasAVX] in {
-def : Pat<(v8i32 (X86vzmovl
- (insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
-def : Pat<(v4i64 (X86vzmovl
- (insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
-def : Pat<(v8f32 (X86vzmovl
- (insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
-def : Pat<(v4f64 (X86vzmovl
- (insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
-}
-
-
def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src),
(VMOVUPSYmr addr:$dst, VR256:$src)>;
def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src),
@@ -1172,12 +1158,13 @@ multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode,
multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode,
string base_opc, InstrItinClass itin> {
- defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
+ let Predicates = [UseAVX] in
+ defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
itin>, VEX_4V;
-let Constraints = "$src1 = $dst" in
- defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
+ let Constraints = "$src1 = $dst" in
+ defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
"\t{$src2, $dst|$dst, $src2}",
itin>;
}
@@ -1188,29 +1175,31 @@ let AddedComplexity = 20 in {
}
let SchedRW = [WriteStore] in {
+let Predicates = [UseAVX] in {
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlps\t{$src, $dst|$dst, $src}",
- [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
+ [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
(iPTR 0))), addr:$dst)],
IIC_SSE_MOV_LH>, VEX;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlpd\t{$src, $dst|$dst, $src}",
- [(store (f64 (vector_extract (v2f64 VR128:$src),
+ [(store (f64 (extractelt (v2f64 VR128:$src),
(iPTR 0))), addr:$dst)],
IIC_SSE_MOV_LH>, VEX;
+}// UseAVX
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlps\t{$src, $dst|$dst, $src}",
- [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
+ [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
(iPTR 0))), addr:$dst)],
IIC_SSE_MOV_LH>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlpd\t{$src, $dst|$dst, $src}",
- [(store (f64 (vector_extract (v2f64 VR128:$src),
+ [(store (f64 (extractelt (v2f64 VR128:$src),
(iPTR 0))), addr:$dst)],
IIC_SSE_MOV_LH>;
} // SchedRW
-let Predicates = [HasAVX] in {
+let Predicates = [UseAVX] in {
// Shuffle with VMOVLPS
def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
(VMOVLPSrm VR128:$src1, addr:$src2)>;
@@ -1243,7 +1232,7 @@ let Predicates = [HasAVX] in {
let Predicates = [UseSSE1] in {
// (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
- def : Pat<(store (i64 (vector_extract (bc_v2i64 (v4f32 VR128:$src2)),
+ def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)),
(iPTR 0))), addr:$src1),
(MOVLPSmr addr:$src1, VR128:$src2)>;
@@ -1297,31 +1286,33 @@ let AddedComplexity = 20 in {
let SchedRW = [WriteStore] in {
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0 so the non-store version isn't too horrible.
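// (That is, the patterns below store element 0 of (X86Unpckh $src, $src), which has the
// original high element duplicated into the low position.)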
+let Predicates = [UseAVX] in {
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhps\t{$src, $dst|$dst, $src}",
- [(store (f64 (vector_extract
+ [(store (f64 (extractelt
(X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
(bc_v2f64 (v4f32 VR128:$src))),
(iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhpd\t{$src, $dst|$dst, $src}",
- [(store (f64 (vector_extract
+ [(store (f64 (extractelt
(v2f64 (X86Unpckh VR128:$src, VR128:$src)),
(iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
+} // UseAVX
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhps\t{$src, $dst|$dst, $src}",
- [(store (f64 (vector_extract
+ [(store (f64 (extractelt
(X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
(bc_v2f64 (v4f32 VR128:$src))),
(iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhpd\t{$src, $dst|$dst, $src}",
- [(store (f64 (vector_extract
+ [(store (f64 (extractelt
(v2f64 (X86Unpckh VR128:$src, VR128:$src)),
(iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
} // SchedRW
-let Predicates = [HasAVX] in {
+let Predicates = [UseAVX] in {
// VMOVHPS patterns
def : Pat<(X86Movlhps VR128:$src1,
(bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
@@ -1345,7 +1336,7 @@ let Predicates = [HasAVX] in {
(bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
(VMOVHPDrm VR128:$src1, addr:$src2)>;
- def : Pat<(store (f64 (vector_extract
+ def : Pat<(store (f64 (extractelt
(v2f64 (X86VPermilpi VR128:$src, (i8 1))),
(iPTR 0))), addr:$dst),
(VMOVHPDmr addr:$dst, VR128:$src)>;
@@ -1377,7 +1368,7 @@ let Predicates = [UseSSE2] in {
(bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
(MOVHPDrm VR128:$src1, addr:$src2)>;
- def : Pat<(store (f64 (vector_extract
+ def : Pat<(store (f64 (extractelt
(v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
(iPTR 0))), addr:$dst),
(MOVHPDmr addr:$dst, VR128:$src)>;
@@ -2073,14 +2064,16 @@ def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
let Predicates = [HasAVX] in {
- def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
+ def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
(VCVTDQ2PSrr VR128:$src)>;
- def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
+ def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))),
(VCVTDQ2PSrm addr:$src)>;
+}
- def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
(VCVTDQ2PSrr VR128:$src)>;
- def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))),
+ def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
(VCVTDQ2PSrm addr:$src)>;
def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
@@ -2149,7 +2142,7 @@ def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
(VCVTTPD2DQYrr VR256:$src)>;
def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
@@ -2306,7 +2299,9 @@ let Predicates = [HasAVX] in {
(VCVTDQ2PSYrr VR256:$src)>;
def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (loadv4i64 addr:$src))),
(VCVTDQ2PSYrm addr:$src)>;
+}
+let Predicates = [HasAVX, NoVLX] in {
// Match fround and fextend for 128/256-bit conversions
def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
(VCVTPD2PSrr VR128:$src)>;
@@ -2452,9 +2447,9 @@ let Defs = [EFLAGS] in {
defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
"ucomisd">, PD, VEX, VEX_LIG;
let Pattern = []<dag> in {
- defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
+ defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
"comiss">, PS, VEX, VEX_LIG;
- defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
+ defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
"comisd">, PD, VEX, VEX_LIG;
}
@@ -2475,9 +2470,9 @@ let Defs = [EFLAGS] in {
"ucomisd">, PD;
let Pattern = []<dag> in {
- defm COMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
+ defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
"comiss">, PS;
- defm COMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
+ defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
"comisd">, PD;
}
@@ -2605,19 +2600,20 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
Sched<[WriteFShuffle]>;
}
-defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
+let Predicates = [HasAVX, NoVLX] in {
+ defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
"shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
loadv4f32, SSEPackedSingle>, PS, VEX_4V;
-defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
+ defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
"shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L;
-defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
+ defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
"shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
loadv2f64, SSEPackedDouble>, PD, VEX_4V;
-defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
+ defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
"shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L;
-
+}
let Constraints = "$src1 = $dst" in {
defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
"shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
@@ -2627,7 +2623,7 @@ let Constraints = "$src1 = $dst" in {
memopv2f64, SSEPackedDouble>, PD;
}
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4i32 (X86Shufp VR128:$src1,
(bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))),
(VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
@@ -2694,6 +2690,7 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
Sched<[WriteFShuffleLd, ReadAfterLd]>;
}
+let Predicates = [HasAVX, NoVLX] in {
defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
SSEPackedSingle>, PS, VEX_4V;
@@ -2719,7 +2716,7 @@ defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
SSEPackedDouble>, PD, VEX_4V, VEX_L;
-
+}// Predicates = [HasAVX, NoVLX]
let Constraints = "$src1 = $dst" in {
defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
@@ -2845,8 +2842,8 @@ multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
ValueType OpVT128, ValueType OpVT256,
- OpndItins itins, bit IsCommutable = 0> {
-let Predicates = [HasAVX, NoVLX] in
+ OpndItins itins, bit IsCommutable = 0, Predicate prd> {
+let Predicates = [HasAVX, prd] in
defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V;
@@ -2854,7 +2851,7 @@ let Constraints = "$src1 = $dst" in
defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
memopv2i64, i128mem, itins, IsCommutable, 1>;
-let Predicates = [HasAVX2, NoVLX] in
+let Predicates = [HasAVX2, prd] in
defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
OpVT256, VR256, loadv4i64, i256mem, itins,
IsCommutable, 0>, VEX_4V, VEX_L;
@@ -2863,13 +2860,13 @@ let Predicates = [HasAVX2, NoVLX] in
// These are ordered here for pattern ordering requirements with the fp versions
defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
- SSE_VEC_BIT_ITINS_P, 1>;
+ SSE_VEC_BIT_ITINS_P, 1, NoVLX>;
defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
- SSE_VEC_BIT_ITINS_P, 1>;
+ SSE_VEC_BIT_ITINS_P, 1, NoVLX>;
defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
- SSE_VEC_BIT_ITINS_P, 1>;
+ SSE_VEC_BIT_ITINS_P, 1, NoVLX>;
defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
- SSE_VEC_BIT_ITINS_P, 0>;
+ SSE_VEC_BIT_ITINS_P, 0, NoVLX>;
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Logical Instructions
@@ -2911,7 +2908,7 @@ let isCodeGenOnly = 1 in {
// Multiclass for vectors using the X86 logical operation aliases for FP.
multiclass sse12_fp_packed_vector_logical_alias<
bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> {
- let Predicates = [HasAVX, NoVLX] in {
+ let Predicates = [HasAVX, NoVLX_Or_NoDQI] in {
defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
VR128, v4f32, f128mem, loadv4f32, SSEPackedSingle, itins, 0>,
PS, VEX_4V;
@@ -2923,7 +2920,7 @@ multiclass sse12_fp_packed_vector_logical_alias<
defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
VR256, v8f32, f256mem, loadv8f32, SSEPackedSingle, itins, 0>,
PS, VEX_4V, VEX_L;
-
+
defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, itins, 0>,
PD, VEX_4V, VEX_L;
@@ -3183,7 +3180,7 @@ multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
let Predicates = [UseSSE1] in {
// extracted scalar math op with insert via movss
def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
- (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
FR32:$src))))),
(!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
(COPY_TO_REGCLASS FR32:$src, VR128))>;
@@ -3198,7 +3195,7 @@ multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
let Predicates = [UseSSE41] in {
// extracted scalar math op with insert via blend
def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
- (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
FR32:$src))), (i8 1))),
(!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
(COPY_TO_REGCLASS FR32:$src, VR128))>;
@@ -3215,7 +3212,7 @@ multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
let Predicates = [HasAVX] in {
// extracted scalar math op with insert via blend
def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
- (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
FR32:$src))), (i8 1))),
(!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
(COPY_TO_REGCLASS FR32:$src, VR128))>;
@@ -3241,7 +3238,7 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
let Predicates = [UseSSE2] in {
// extracted scalar math op with insert via movsd
def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
- (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
FR64:$src))))),
(!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
(COPY_TO_REGCLASS FR64:$src, VR128))>;
@@ -3256,7 +3253,7 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
let Predicates = [UseSSE41] in {
// extracted scalar math op with insert via blend
def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
- (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
FR64:$src))), (i8 1))),
(!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
(COPY_TO_REGCLASS FR64:$src, VR128))>;
@@ -3271,14 +3268,14 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
let Predicates = [HasAVX] in {
// extracted scalar math op with insert via movsd
def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
- (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
FR64:$src))))),
(!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
(COPY_TO_REGCLASS FR64:$src, VR128))>;
// extracted scalar math op with insert via blend
def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
- (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
FR64:$src))), (i8 1))),
(!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
(COPY_TO_REGCLASS FR64:$src, VR128))>;
@@ -3449,8 +3446,8 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
/// sse1_fp_unop_p - SSE1 unops in packed form.
multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins> {
-let Predicates = [HasAVX] in {
+ OpndItins itins, list<Predicate> prds> {
+let Predicates = prds in {
def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat("v", OpcodeStr,
"ps\t{$src, $dst|$dst, $src}"),
@@ -3546,16 +3543,16 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Square root.
defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>,
- sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS>,
+ sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX]>,
sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>,
sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>;
// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
- sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>;
+ sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX, NoVLX] >;
defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>,
- sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>;
+ sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX, NoVLX]>;
// There is no f64 version of the reciprocal approximation instructions.
@@ -4018,39 +4015,43 @@ multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
} // ExeDomain = SSEPackedInt
defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 1>;
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 1>;
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
- SSE_INTALU_ITINS_P, 1>;
+ SSE_INTALU_ITINS_P, 1, NoVLX>;
defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
- SSE_INTALUQ_ITINS_P, 1>;
+ SSE_INTALUQ_ITINS_P, 1, NoVLX>;
defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
- SSE_INTMUL_ITINS_P, 1>;
+ SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
- SSE_INTMUL_ITINS_P, 1>;
+ SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
- SSE_INTMUL_ITINS_P, 1>;
+ SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 0>;
+ SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 0>;
+ SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
- SSE_INTALU_ITINS_P, 0>;
+ SSE_INTALU_ITINS_P, 0, NoVLX>;
defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
- SSE_INTALUQ_ITINS_P, 0>;
+ SSE_INTALUQ_ITINS_P, 0, NoVLX>;
defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 0>;
+ SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 0>;
+ SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 1>;
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 1>;
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 1>;
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 1>;
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
// Intrinsic forms
defm PSUBSB : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b,
@@ -4067,26 +4068,18 @@ defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w,
int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>;
defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
-defm PAVGB : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b,
- int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>;
-defm PAVGW : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w,
- int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>;
-defm PSADBW : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw,
- int_x86_avx2_psad_bw, SSE_PMADD, 1>;
-
-let Predicates = [HasAVX2] in
- def : Pat<(v32i8 (X86psadbw (v32i8 VR256:$src1),
- (v32i8 VR256:$src2))),
- (VPSADBWYrr VR256:$src2, VR256:$src1)>;
let Predicates = [HasAVX] in
- def : Pat<(v16i8 (X86psadbw (v16i8 VR128:$src1),
- (v16i8 VR128:$src2))),
- (VPSADBWrr VR128:$src2, VR128:$src1)>;
-
-def : Pat<(v16i8 (X86psadbw (v16i8 VR128:$src1),
- (v16i8 VR128:$src2))),
- (PSADBWrr VR128:$src2, VR128:$src1)>;
+defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
+ loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
+ VEX_4V;
+let Predicates = [HasAVX2] in
+defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
+ loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 1, 0>,
+ VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in
+defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
+ memopv2i64, i128mem, SSE_INTALU_ITINS_P, 1>;
let Predicates = [HasAVX] in
defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
@@ -4105,9 +4098,6 @@ defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
//===---------------------------------------------------------------------===//
let Predicates = [HasAVX, NoVLX] in {
-defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
- VR128, v8i16, v8i16, bc_v8i16, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
VR128, v4i32, v4i32, bc_v4i32, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
@@ -4115,9 +4105,6 @@ defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
VR128, v2i64, v2i64, bc_v2i64, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
-defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
- VR128, v8i16, v8i16, bc_v8i16, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
VR128, v4i32, v4i32, bc_v4i32, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
@@ -4125,14 +4112,26 @@ defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
VR128, v2i64, v2i64, bc_v2i64, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
-defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
- VR128, v8i16, v8i16, bc_v8i16, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
VR128, v4i32, v4i32, bc_v4i32, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+} // Predicates = [HasAVX, NoVLX]
-let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
+ VR128, v8i16, v8i16, bc_v8i16, loadv2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
+ VR128, v8i16, v8i16, bc_v8i16, loadv2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
+ VR128, v8i16, v8i16, bc_v8i16, loadv2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+} // Predicates = [HasAVX, NoVLX_Or_NoBWI]
+
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift],
+    Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
// 128-bit logical shifts.
def VPSLLDQri : PDIi8<0x73, MRM7r,
(outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
@@ -4147,13 +4146,9 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
(v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))]>,
VEX_4V;
// PSRADQri doesn't exist in SSE[1-3].
-}
-} // Predicates = [HasAVX]
+} // Predicates = [HasAVX, NoVLX_Or_NoBWI]
let Predicates = [HasAVX2, NoVLX] in {
-defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
- VR256, v16i16, v8i16, bc_v8i16, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
VR256, v8i32, v4i32, bc_v4i32, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
@@ -4161,9 +4156,6 @@ defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
VR256, v4i64, v2i64, bc_v2i64, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
-defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
- VR256, v16i16, v8i16, bc_v8i16, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
VR256, v8i32, v4i32, bc_v4i32, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
@@ -4171,14 +4163,25 @@ defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
VR256, v4i64, v2i64, bc_v2i64, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
-defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
- VR256, v16i16, v8i16, bc_v8i16, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
VR256, v8i32, v4i32, bc_v4i32, loadv2i64,
SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+}// Predicates = [HasAVX2, NoVLX]
-let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in {
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
+ VR256, v16i16, v8i16, bc_v8i16, loadv2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
+ VR256, v16i16, v8i16, bc_v8i16, loadv2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
+ VR256, v16i16, v8i16, bc_v8i16, loadv2i64,
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+}// Predicates = [HasAVX2, NoVLX_Or_NoBWI]
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0,
+ Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
// 256-bit logical shifts.
def VPSLLDQYri : PDIi8<0x73, MRM7r,
(outs VR256:$dst), (ins VR256:$src1, u8imm:$src2),
@@ -4193,8 +4196,7 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in {
(v4i64 (X86vshrdq VR256:$src1, (i8 imm:$src2))))]>,
VEX_4V, VEX_L;
// PSRADQYri doesn't exist in SSE[1-3].
-}
-} // Predicates = [HasAVX2]
+} // Predicates = [HasAVX2, NoVLX_Or_NoBWI]
let Constraints = "$src1 = $dst" in {
defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
@@ -4247,17 +4249,17 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in {
//===---------------------------------------------------------------------===//
defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 1>;
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 1>;
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
- SSE_INTALU_ITINS_P, 1>;
+ SSE_INTALU_ITINS_P, 1, NoVLX>;
defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 0>;
+ SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 0>;
+ SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
- SSE_INTALU_ITINS_P, 0>;
+ SSE_INTALU_ITINS_P, 0, NoVLX>;
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Shuffle Instructions
@@ -4511,40 +4513,43 @@ multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
Sched<[WriteShuffleLd, ReadAfterLd]>;
}
-let Predicates = [HasAVX] in {
+
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl,
bc_v16i8, loadv2i64, 0>, VEX_4V;
defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl,
bc_v8i16, loadv2i64, 0>, VEX_4V;
- defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl,
- bc_v4i32, loadv2i64, 0>, VEX_4V;
- defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl,
- bc_v2i64, loadv2i64, 0>, VEX_4V;
-
defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh,
bc_v16i8, loadv2i64, 0>, VEX_4V;
defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh,
bc_v8i16, loadv2i64, 0>, VEX_4V;
+}
+let Predicates = [HasAVX, NoVLX] in {
+ defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl,
+ bc_v4i32, loadv2i64, 0>, VEX_4V;
+ defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl,
+ bc_v2i64, loadv2i64, 0>, VEX_4V;
defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh,
bc_v4i32, loadv2i64, 0>, VEX_4V;
defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh,
bc_v2i64, loadv2i64, 0>, VEX_4V;
}
-let Predicates = [HasAVX2] in {
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl,
bc_v32i8>, VEX_4V, VEX_L;
defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl,
bc_v16i16>, VEX_4V, VEX_L;
- defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl,
- bc_v8i32>, VEX_4V, VEX_L;
- defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl,
- bc_v4i64>, VEX_4V, VEX_L;
-
defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh,
bc_v32i8>, VEX_4V, VEX_L;
defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh,
bc_v16i16>, VEX_4V, VEX_L;
+}
+let Predicates = [HasAVX2, NoVLX] in {
+ defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl,
+ bc_v8i32>, VEX_4V, VEX_L;
+ defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl,
+ bc_v4i64>, VEX_4V, VEX_L;
defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh,
bc_v8i32>, VEX_4V, VEX_L;
defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh,
@@ -4600,7 +4605,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> {
}
// Extract
-let Predicates = [HasAVX] in
+let Predicates = [HasAVX, NoBWI] in
def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
(outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
"vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -4615,7 +4620,7 @@ def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
Sched<[WriteShuffleLd, ReadAfterLd]>;
// Insert
-let Predicates = [HasAVX] in
+let Predicates = [HasAVX, NoBWI] in
defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V;
let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
@@ -4683,7 +4688,7 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
} // ExeDomain = SSEPackedInt
//===---------------------------------------------------------------------===//
-// SSE2 - Move Doubleword
+// SSE2 - Move Doubleword/Quadword
//===---------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//
@@ -4770,23 +4775,23 @@ let isCodeGenOnly = 1 in {
//
def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
+ [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
(iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX,
Sched<[WriteMove]>;
def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs),
(ins i32mem:$dst, VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(store (i32 (vector_extract (v4i32 VR128:$src),
+ [(store (i32 (extractelt (v4i32 VR128:$src),
(iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
VEX, Sched<[WriteStore]>;
def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
+ [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
(iPTR 0)))], IIC_SSE_MOVD_ToGP>,
Sched<[WriteMove]>;
def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
- [(store (i32 (vector_extract (v4i32 VR128:$src),
+ [(store (i32 (extractelt (v4i32 VR128:$src),
(iPTR 0))), addr:$dst)],
IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
@@ -4808,24 +4813,25 @@ def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
let SchedRW = [WriteMove] in {
def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
- (iPTR 0)))],
+ [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
+ (iPTR 0)))],
IIC_SSE_MOVD_ToGP>,
VEX;
def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
+ [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
(iPTR 0)))],
IIC_SSE_MOVD_ToGP>;
} //SchedRW
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
-def VMOVPQIto64rm : VRS2I<0x7E, MRMDestMem, (outs i64mem:$dst),
- (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}",
+def VMOVPQIto64rm : VRS2I<0x7E, MRMDestMem, (outs),
+ (ins i64mem:$dst, VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
[], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
-def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs i64mem:$dst), (ins VR128:$src),
+def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[], IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
@@ -4883,30 +4889,18 @@ let isCodeGenOnly = 1 in {
IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
}
-//===---------------------------------------------------------------------===//
-// Patterns and instructions to describe movd/movq to XMM register zero-extends
-//
-let isCodeGenOnly = 1, SchedRW = [WriteMove] in {
-let AddedComplexity = 15 in {
-def VMOVZQI2PQIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
- "movq\t{$src, $dst|$dst, $src}", // X86-64 only
- [(set VR128:$dst, (v2i64 (X86vzmovl
- (v2i64 (scalar_to_vector GR64:$src)))))],
- IIC_SSE_MOVDQ>,
- VEX, VEX_W;
-def MOVZQI2PQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
- "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
- [(set VR128:$dst, (v2i64 (X86vzmovl
- (v2i64 (scalar_to_vector GR64:$src)))))],
- IIC_SSE_MOVDQ>;
-}
-} // isCodeGenOnly, SchedRW
-
let Predicates = [UseAVX] in {
- let AddedComplexity = 15 in
+ let AddedComplexity = 15 in {
def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
(VMOVDI2PDIrr GR32:$src)>;
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+ (VMOV64toPQIrr GR64:$src)>;
+
+ def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+ (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (VMOV64toPQIrr GR64:$src), sub_xmm)>;
+ }
// AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
// These instructions also write zeros in the high part of a 256-bit register.
let AddedComplexity = 20 in {
@@ -4924,16 +4918,16 @@ let Predicates = [UseAVX] in {
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>;
- def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>;
}
let Predicates = [UseSSE2] in {
- let AddedComplexity = 15 in
+ let AddedComplexity = 15 in {
def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
(MOVDI2PDIrr GR32:$src)>;
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+ (MOV64toPQIrr GR64:$src)>;
+ }
let AddedComplexity = 20 in {
def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
(MOVDI2PDIrm addr:$src)>;
@@ -4985,12 +4979,12 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(store (i64 (vector_extract (v2i64 VR128:$src),
+ [(store (i64 (extractelt (v2i64 VR128:$src),
(iPTR 0))), addr:$dst)],
IIC_SSE_MOVDQ>, VEX;
def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
- [(store (i64 (vector_extract (v2i64 VR128:$src),
+ [(store (i64 (extractelt (v2i64 VR128:$src),
(iPTR 0))), addr:$dst)],
IIC_SSE_MOVDQ>;
} // ExeDomain, SchedRW
@@ -5119,7 +5113,7 @@ def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
}
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
v4f32, VR128, loadv4f32, f128mem>, VEX;
defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
@@ -5134,7 +5128,7 @@ defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
memopv4f32, f128mem>;
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4i32 (X86Movshdup VR128:$src)),
(VMOVSHDUPrr VR128:$src)>;
def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))),
@@ -5190,21 +5184,30 @@ def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst,
- (v4f64 (X86Movddup
- (scalar_to_vector (loadf64 addr:$src)))))]>,
+ (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
Sched<[WriteLoad]>;
}
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX;
defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L;
}
defm MOVDDUP : sse3_replicate_dfp<"movddup">;
-let Predicates = [HasAVX] in {
+
+let Predicates = [HasAVX, NoVLX] in {
def : Pat<(X86Movddup (loadv2f64 addr:$src)),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+
+ // 256-bit version
+ def : Pat<(X86Movddup (loadv4i64 addr:$src)),
+ (VMOVDDUPYrm addr:$src)>;
+ def : Pat<(X86Movddup (v4i64 VR256:$src)),
+ (VMOVDDUPYrr VR256:$src)>;
+}
+
+let Predicates = [HasAVX] in {
def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))),
@@ -5212,16 +5215,6 @@ let Predicates = [HasAVX] in {
def : Pat<(X86Movddup (bc_v2f64
(v2i64 (scalar_to_vector (loadi64 addr:$src))))),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
-
- // 256-bit version
- def : Pat<(X86Movddup (loadv4f64 addr:$src)),
- (VMOVDDUPYrm addr:$src)>;
- def : Pat<(X86Movddup (loadv4i64 addr:$src)),
- (VMOVDDUPYrm addr:$src)>;
- def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))),
- (VMOVDDUPYrm addr:$src)>;
- def : Pat<(X86Movddup (v4i64 VR256:$src)),
- (VMOVDDUPYrr VR256:$src)>;
}
let Predicates = [UseAVX, OptForSize] in {
@@ -5791,37 +5784,37 @@ let Predicates = [HasAVX2] in
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
defm PALIGN : ssse3_palignr<"palignr">;
-let Predicates = [HasAVX2] in {
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
+ (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
+ (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
+ (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
+ (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>;
}
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+ (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+ (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+ (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+ (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
}
let Predicates = [UseSSSE3] in {
def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+ (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+ (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+ (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+ (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>;
}
//===---------------------------------------------------------------------===//
@@ -6145,7 +6138,7 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
imm:$src2)))), addr:$dst)]>;
}
-let Predicates = [HasAVX] in
+let Predicates = [HasAVX, NoBWI] in
defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;
defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
@@ -6170,7 +6163,7 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
imm:$src2)))), addr:$dst)]>;
}
-let Predicates = [HasAVX] in
+let Predicates = [HasAVX, NoBWI] in
defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;
defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
@@ -6194,7 +6187,7 @@ multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
addr:$dst)]>;
}
-let Predicates = [HasAVX] in
+let Predicates = [HasAVX, NoDQI] in
defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
defm PEXTRD : SS41I_extract32<0x16, "pextrd">;
@@ -6217,7 +6210,7 @@ multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
addr:$dst)]>, REX_W;
}
-let Predicates = [HasAVX] in
+let Predicates = [HasAVX, NoDQI] in
defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
defm PEXTRQ : SS41I_extract64<0x16, "pextrq">;
@@ -6285,7 +6278,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
imm:$src3))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}
-let Predicates = [HasAVX] in
+let Predicates = [HasAVX, NoBWI] in
defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
defm PINSRB : SS41I_insert8<0x20, "pinsrb">;
@@ -6311,7 +6304,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}
-let Predicates = [HasAVX] in
+let Predicates = [HasAVX, NoDQI] in
defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
@@ -6337,7 +6330,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}
-let Predicates = [HasAVX] in
+let Predicates = [HasAVX, NoDQI] in
defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
let Constraints = "$src1 = $dst" in
defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
@@ -6543,71 +6536,71 @@ let Predicates = [HasAVX] in {
let Predicates = [UseAVX] in {
def : Pat<(ffloor FR32:$src),
- (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
+ (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
def : Pat<(f64 (ffloor FR64:$src)),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
def : Pat<(f32 (fnearbyint FR32:$src)),
(VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
def : Pat<(f64 (fnearbyint FR64:$src)),
(VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
def : Pat<(f32 (fceil FR32:$src)),
- (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
+ (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
def : Pat<(f64 (fceil FR64:$src)),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
def : Pat<(f32 (frint FR32:$src)),
(VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
def : Pat<(f64 (frint FR64:$src)),
(VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
def : Pat<(f32 (ftrunc FR32:$src)),
- (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
+ (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
def : Pat<(f64 (ftrunc FR64:$src)),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;
}
let Predicates = [HasAVX] in {
def : Pat<(v4f32 (ffloor VR128:$src)),
- (VROUNDPSr VR128:$src, (i32 0x1))>;
+ (VROUNDPSr VR128:$src, (i32 0x9))>;
def : Pat<(v4f32 (fnearbyint VR128:$src)),
(VROUNDPSr VR128:$src, (i32 0xC))>;
def : Pat<(v4f32 (fceil VR128:$src)),
- (VROUNDPSr VR128:$src, (i32 0x2))>;
+ (VROUNDPSr VR128:$src, (i32 0xA))>;
def : Pat<(v4f32 (frint VR128:$src)),
(VROUNDPSr VR128:$src, (i32 0x4))>;
def : Pat<(v4f32 (ftrunc VR128:$src)),
- (VROUNDPSr VR128:$src, (i32 0x3))>;
+ (VROUNDPSr VR128:$src, (i32 0xB))>;
def : Pat<(v2f64 (ffloor VR128:$src)),
- (VROUNDPDr VR128:$src, (i32 0x1))>;
+ (VROUNDPDr VR128:$src, (i32 0x9))>;
def : Pat<(v2f64 (fnearbyint VR128:$src)),
(VROUNDPDr VR128:$src, (i32 0xC))>;
def : Pat<(v2f64 (fceil VR128:$src)),
- (VROUNDPDr VR128:$src, (i32 0x2))>;
+ (VROUNDPDr VR128:$src, (i32 0xA))>;
def : Pat<(v2f64 (frint VR128:$src)),
(VROUNDPDr VR128:$src, (i32 0x4))>;
def : Pat<(v2f64 (ftrunc VR128:$src)),
- (VROUNDPDr VR128:$src, (i32 0x3))>;
+ (VROUNDPDr VR128:$src, (i32 0xB))>;
def : Pat<(v8f32 (ffloor VR256:$src)),
- (VROUNDYPSr VR256:$src, (i32 0x1))>;
+ (VROUNDYPSr VR256:$src, (i32 0x9))>;
def : Pat<(v8f32 (fnearbyint VR256:$src)),
(VROUNDYPSr VR256:$src, (i32 0xC))>;
def : Pat<(v8f32 (fceil VR256:$src)),
- (VROUNDYPSr VR256:$src, (i32 0x2))>;
+ (VROUNDYPSr VR256:$src, (i32 0xA))>;
def : Pat<(v8f32 (frint VR256:$src)),
(VROUNDYPSr VR256:$src, (i32 0x4))>;
def : Pat<(v8f32 (ftrunc VR256:$src)),
- (VROUNDYPSr VR256:$src, (i32 0x3))>;
+ (VROUNDYPSr VR256:$src, (i32 0xB))>;
def : Pat<(v4f64 (ffloor VR256:$src)),
- (VROUNDYPDr VR256:$src, (i32 0x1))>;
+ (VROUNDYPDr VR256:$src, (i32 0x9))>;
def : Pat<(v4f64 (fnearbyint VR256:$src)),
(VROUNDYPDr VR256:$src, (i32 0xC))>;
def : Pat<(v4f64 (fceil VR256:$src)),
- (VROUNDYPDr VR256:$src, (i32 0x2))>;
+ (VROUNDYPDr VR256:$src, (i32 0xA))>;
def : Pat<(v4f64 (frint VR256:$src)),
(VROUNDYPDr VR256:$src, (i32 0x4))>;
def : Pat<(v4f64 (ftrunc VR256:$src)),
- (VROUNDYPDr VR256:$src, (i32 0x3))>;
+ (VROUNDYPDr VR256:$src, (i32 0xB))>;
}
defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128,
@@ -6619,47 +6612,47 @@ defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round",
let Predicates = [UseSSE41] in {
def : Pat<(ffloor FR32:$src),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
+ (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
def : Pat<(f64 (ffloor FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
+ (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
def : Pat<(f32 (fnearbyint FR32:$src)),
(ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
def : Pat<(f64 (fnearbyint FR64:$src)),
(ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
def : Pat<(f32 (fceil FR32:$src)),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
+ (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
def : Pat<(f64 (fceil FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
+ (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
def : Pat<(f32 (frint FR32:$src)),
(ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
def : Pat<(f64 (frint FR64:$src)),
(ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
def : Pat<(f32 (ftrunc FR32:$src)),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
+ (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
def : Pat<(f64 (ftrunc FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;
+ (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;
def : Pat<(v4f32 (ffloor VR128:$src)),
- (ROUNDPSr VR128:$src, (i32 0x1))>;
+ (ROUNDPSr VR128:$src, (i32 0x9))>;
def : Pat<(v4f32 (fnearbyint VR128:$src)),
(ROUNDPSr VR128:$src, (i32 0xC))>;
def : Pat<(v4f32 (fceil VR128:$src)),
- (ROUNDPSr VR128:$src, (i32 0x2))>;
+ (ROUNDPSr VR128:$src, (i32 0xA))>;
def : Pat<(v4f32 (frint VR128:$src)),
(ROUNDPSr VR128:$src, (i32 0x4))>;
def : Pat<(v4f32 (ftrunc VR128:$src)),
- (ROUNDPSr VR128:$src, (i32 0x3))>;
+ (ROUNDPSr VR128:$src, (i32 0xB))>;
def : Pat<(v2f64 (ffloor VR128:$src)),
- (ROUNDPDr VR128:$src, (i32 0x1))>;
+ (ROUNDPDr VR128:$src, (i32 0x9))>;
def : Pat<(v2f64 (fnearbyint VR128:$src)),
(ROUNDPDr VR128:$src, (i32 0xC))>;
def : Pat<(v2f64 (fceil VR128:$src)),
- (ROUNDPDr VR128:$src, (i32 0x2))>;
+ (ROUNDPDr VR128:$src, (i32 0xA))>;
def : Pat<(v2f64 (frint VR128:$src)),
(ROUNDPDr VR128:$src, (i32 0x4))>;
def : Pat<(v2f64 (ftrunc VR128:$src)),
- (ROUNDPDr VR128:$src, (i32 0x3))>;
+ (ROUNDPDr VR128:$src, (i32 0xB))>;
}
//===----------------------------------------------------------------------===//
@@ -7815,13 +7808,7 @@ def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
// VBROADCAST - Load from memory and broadcast to all elements of the
// destination operand
//
-class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC,
- X86MemOperand x86memop, Intrinsic Int, SchedWrite Sched> :
- AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (Int addr:$src))]>, Sched<[Sched]>, VEX;
-
-class avx_broadcast_no_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
+class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
X86MemOperand x86memop, ValueType VT,
PatFrag ld_frag, SchedWrite Sched> :
AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
@@ -7832,38 +7819,33 @@ class avx_broadcast_no_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
}
// AVX2 adds register forms
-class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC,
- Intrinsic Int, SchedWrite Sched> :
+class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (Int VR128:$src))]>, Sched<[Sched]>, VEX;
+ [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
+ Sched<[Sched]>, VEX;
let ExeDomain = SSEPackedSingle in {
- def VBROADCASTSSrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR128,
+ def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
f32mem, v4f32, loadf32, WriteLoad>;
- def VBROADCASTSSYrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR256,
+ def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
f32mem, v8f32, loadf32,
WriteFShuffleLd>, VEX_L;
}
let ExeDomain = SSEPackedDouble in
-def VBROADCASTSDYrm : avx_broadcast_no_int<0x19, "vbroadcastsd", VR256, f64mem,
+def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
v4f64, loadf64, WriteFShuffleLd>, VEX_L;
-def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
- int_x86_avx_vbroadcastf128_pd_256,
- WriteFShuffleLd>, VEX_L;
let ExeDomain = SSEPackedSingle in {
- def VBROADCASTSSrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR128,
- int_x86_avx2_vbroadcast_ss_ps,
- WriteFShuffle>;
- def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256,
- int_x86_avx2_vbroadcast_ss_ps_256,
- WriteFShuffle256>, VEX_L;
+ def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
+ v4f32, v4f32, WriteFShuffle>;
+ def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
+ v8f32, v4f32, WriteFShuffle256>, VEX_L;
}
let ExeDomain = SSEPackedDouble in
-def VBROADCASTSDYrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256,
- int_x86_avx2_vbroadcast_sd_pd_256,
- WriteFShuffle256>, VEX_L;
+def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
+ v4f64, v2f64, WriteFShuffle256>, VEX_L;
let mayLoad = 1, Predicates = [HasAVX2] in
def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
@@ -7871,6 +7853,13 @@ def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
"vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
Sched<[WriteLoad]>, VEX, VEX_L;
+def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
+ (ins f128mem:$src),
+ "vbroadcastf128\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst,
+ (int_x86_avx_vbroadcastf128_pd_256 addr:$src))]>,
+ Sched<[WriteFShuffleLd]>, VEX, VEX_L;
+
let Predicates = [HasAVX] in
def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
(VBROADCASTF128 addr:$src)>;
@@ -7891,7 +7880,7 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
[]>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
}
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
(iPTR imm)),
(VINSERTF128rr VR256:$src1, VR128:$src2,
@@ -8080,17 +8069,19 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
(bitconvert (i_frag addr:$src2))))]>, VEX_4V,
Sched<[WriteFShuffleLd, ReadAfterLd]>;
- def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
+ let Predicates = [HasAVX, NoVLX] in {
+ def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
Sched<[WriteFShuffle]>;
- def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
+ def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
(ins x86memop_f:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
(vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
Sched<[WriteFShuffleLd]>;
+ }// Predicates = [HasAVX, NoVLX]
}
let ExeDomain = SSEPackedSingle in {
@@ -8106,7 +8097,7 @@ let ExeDomain = SSEPackedDouble in {
loadv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L;
}
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))),
(VPERMILPSYrr VR256:$src1, VR256:$src2)>;
def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
@@ -8245,11 +8236,11 @@ let Predicates = [HasF16C] in {
def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)),
(VCVTPH2PSrm addr:$src)>;
- def : Pat<(store (f64 (vector_extract (bc_v2f64 (v8i16
+ def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16
(int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
addr:$dst),
(VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
- def : Pat<(store (i64 (vector_extract (bc_v2i64 (v8i16
+ def : Pat<(store (i64 (extractelt (bc_v2i64 (v8i16
(int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
addr:$dst),
(VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
@@ -8309,97 +8300,62 @@ defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32,
//
multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
X86MemOperand x86memop, PatFrag ld_frag,
- Intrinsic Int128, Intrinsic Int256> {
- def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ ValueType OpVT128, ValueType OpVT256, Predicate prd> {
+ let Predicates = [HasAVX2, prd] in {
+ def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (Int128 VR128:$src))]>,
+ [(set VR128:$dst,
+ (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
Sched<[WriteShuffle]>, VEX;
- def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
+ def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
- (Int128 (scalar_to_vector (ld_frag addr:$src))))]>,
+ (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>,
Sched<[WriteLoad]>, VEX;
- def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+ def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (Int256 VR128:$src))]>,
+ [(set VR256:$dst,
+ (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
Sched<[WriteShuffle256]>, VEX, VEX_L;
- def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
+ def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst,
- (Int256 (scalar_to_vector (ld_frag addr:$src))))]>,
+ (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>,
Sched<[WriteLoad]>, VEX, VEX_L;
+
+ // Provide aliases for broadcast from the same register class that
+ // automatically does the extract.
+ def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
+ (!cast<Instruction>(NAME#"Yrr")
+ (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>;
+ }
}
defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
- int_x86_avx2_pbroadcastb_128,
- int_x86_avx2_pbroadcastb_256>;
+ v16i8, v32i8, NoVLX_Or_NoBWI>;
defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
- int_x86_avx2_pbroadcastw_128,
- int_x86_avx2_pbroadcastw_256>;
+ v8i16, v16i16, NoVLX_Or_NoBWI>;
defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
- int_x86_avx2_pbroadcastd_128,
- int_x86_avx2_pbroadcastd_256>;
+ v4i32, v8i32, NoVLX>;
defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
- int_x86_avx2_pbroadcastq_128,
- int_x86_avx2_pbroadcastq_256>;
+ v2i64, v4i64, NoVLX>;
let Predicates = [HasAVX2] in {
- def : Pat<(v16i8 (X86VBroadcast (loadi8 addr:$src))),
- (VPBROADCASTBrm addr:$src)>;
- def : Pat<(v32i8 (X86VBroadcast (loadi8 addr:$src))),
- (VPBROADCASTBYrm addr:$src)>;
- def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))),
- (VPBROADCASTWrm addr:$src)>;
- def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))),
- (VPBROADCASTWYrm addr:$src)>;
- def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
- (VPBROADCASTDrm addr:$src)>;
- def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
- (VPBROADCASTDYrm addr:$src)>;
- def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
- (VPBROADCASTQrm addr:$src)>;
- def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
- (VPBROADCASTQYrm addr:$src)>;
-
- def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))),
- (VPBROADCASTBrr VR128:$src)>;
- def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))),
- (VPBROADCASTBYrr VR128:$src)>;
- def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))),
- (VPBROADCASTWrr VR128:$src)>;
- def : Pat<(v16i16 (X86VBroadcast (v8i16 VR128:$src))),
- (VPBROADCASTWYrr VR128:$src)>;
- def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))),
- (VPBROADCASTDrr VR128:$src)>;
- def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))),
- (VPBROADCASTDYrr VR128:$src)>;
- def : Pat<(v2i64 (X86VBroadcast (v2i64 VR128:$src))),
- (VPBROADCASTQrr VR128:$src)>;
- def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))),
- (VPBROADCASTQYrr VR128:$src)>;
- def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))),
- (VBROADCASTSSrr VR128:$src)>;
- def : Pat<(v8f32 (X86VBroadcast (v4f32 VR128:$src))),
- (VBROADCASTSSYrr VR128:$src)>;
- def : Pat<(v2f64 (X86VBroadcast (v2f64 VR128:$src))),
- (VPBROADCASTQrr VR128:$src)>;
- def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))),
- (VBROADCASTSDYrr VR128:$src)>;
+ // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
+ // This means we'll encounter truncated i32 loads; match that here.
+ def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
+ (VPBROADCASTWrm addr:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
+ (VPBROADCASTWYrm addr:$src)>;
+ def : Pat<(v8i16 (X86VBroadcast
+ (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
+ (VPBROADCASTWrm addr:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast
+ (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
+ (VPBROADCASTWYrm addr:$src)>;
// Provide aliases for broadcast from the same register class that
// automatically does the extract.
- def : Pat<(v32i8 (X86VBroadcast (v32i8 VR256:$src))),
- (VPBROADCASTBYrr (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src),
- sub_xmm)))>;
- def : Pat<(v16i16 (X86VBroadcast (v16i16 VR256:$src))),
- (VPBROADCASTWYrr (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src),
- sub_xmm)))>;
- def : Pat<(v8i32 (X86VBroadcast (v8i32 VR256:$src))),
- (VPBROADCASTDYrr (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src),
- sub_xmm)))>;
- def : Pat<(v4i64 (X86VBroadcast (v4i64 VR256:$src))),
- (VPBROADCASTQYrr (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src),
- sub_xmm)))>;
def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
(VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src),
sub_xmm)))>;
@@ -8598,7 +8554,7 @@ def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
[]>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
}
-let Predicates = [HasAVX2] in {
+let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
(iPTR imm)),
(VINSERTI128rr VR256:$src1, VR128:$src2,
@@ -8722,16 +8678,16 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
int_x86_avx2_maskstore_q,
int_x86_avx2_maskstore_q_256>, VEX_W;
-def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)),
+def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)),
(VMASKMOVPSYmr addr:$ptr, VR256:$mask, VR256:$src)>;
-def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)),
+def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)),
(VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>;
-def: Pat<(masked_store addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)),
+def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)),
(VMASKMOVPSmr addr:$ptr, VR128:$mask, VR128:$src)>;
-def: Pat<(masked_store addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src)),
+def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src)),
(VPMASKMOVDmr addr:$ptr, VR128:$mask, VR128:$src)>;
def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)),
@@ -8776,10 +8732,10 @@ def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src0)
(VBLENDVPSrr VR128:$src0, (VPMASKMOVDrm VR128:$mask, addr:$ptr),
VR128:$mask)>;
-def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)),
+def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)),
(VMASKMOVPDYmr addr:$ptr, VR256:$mask, VR256:$src)>;
-def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)),
+def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)),
(VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>;
def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)),
@@ -8804,10 +8760,10 @@ def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src0)
(VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr),
VR256:$mask)>;
-def: Pat<(masked_store addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src)),
+def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src)),
(VMASKMOVPDmr addr:$ptr, VR128:$mask, VR128:$src)>;
-def: Pat<(masked_store addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src)),
+def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src)),
(VPMASKMOVQmr addr:$ptr, VR128:$mask, VR128:$src)>;
def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)),
@@ -8865,12 +8821,13 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
VEX_4V, VEX_L, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
}
-defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
-defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
-defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
-defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
-defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
-
+let Predicates = [HasAVX2, NoVLX] in {
+ defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
+ defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
+ defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
+ defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
+ defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
+}
//===----------------------------------------------------------------------===//
// VGATHER - GATHER Operations
multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
@@ -8905,3 +8862,59 @@ let mayLoad = 1, Constraints
defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>;
}
}
+
+//===----------------------------------------------------------------------===//
+// Extra selection patterns for FR128, f128, f128mem
+
+// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2.
+def : Pat<(store (f128 FR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 FR128:$src), VR128))>;
+
+def : Pat<(loadf128 addr:$src),
+ (COPY_TO_REGCLASS (MOVAPSrm addr:$src), FR128)>;
+
+// andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2
+def : Pat<(X86fand FR128:$src1, (loadf128 addr:$src2)),
+ (COPY_TO_REGCLASS
+ (ANDPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
+ FR128)>;
+
+def : Pat<(X86fand FR128:$src1, FR128:$src2),
+ (COPY_TO_REGCLASS
+ (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+ (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+def : Pat<(and FR128:$src1, FR128:$src2),
+ (COPY_TO_REGCLASS
+ (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+ (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+def : Pat<(X86for FR128:$src1, (loadf128 addr:$src2)),
+ (COPY_TO_REGCLASS
+ (ORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
+ FR128)>;
+
+def : Pat<(X86for FR128:$src1, FR128:$src2),
+ (COPY_TO_REGCLASS
+ (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+ (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+def : Pat<(or FR128:$src1, FR128:$src2),
+ (COPY_TO_REGCLASS
+ (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+ (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+def : Pat<(X86fxor FR128:$src1, (loadf128 addr:$src2)),
+ (COPY_TO_REGCLASS
+ (XORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
+ FR128)>;
+
+def : Pat<(X86fxor FR128:$src1, FR128:$src2),
+ (COPY_TO_REGCLASS
+ (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+ (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+def : Pat<(xor FR128:$src1, FR128:$src2),
+ (COPY_TO_REGCLASS
+ (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+ (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td
index caecf7001ef5..c1df9780a0e0 100644
--- a/lib/Target/X86/X86InstrShiftRotate.td
+++ b/lib/Target/X86/X86InstrShiftRotate.td
@@ -31,21 +31,21 @@ def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
[(set GR64:$dst, (shl GR64:$src1, CL))], IIC_SR>;
} // Uses = [CL]
-def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
"shl{b}\t{$src2, $dst|$dst, $src2}",
[(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
-def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
"shl{w}\t{$src2, $dst|$dst, $src2}",
[(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))], IIC_SR>,
OpSize16;
-def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
"shl{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))], IIC_SR>,
OpSize32;
def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst),
- (ins GR64:$src1, i8imm:$src2),
+ (ins GR64:$src1, u8imm:$src2),
"shl{q}\t{$src2, $dst|$dst, $src2}",
[(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))],
IIC_SR>;
@@ -85,19 +85,19 @@ def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst),
"shl{q}\t{%cl, $dst|$dst, cl}",
[(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
}
-def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, i8imm:$src),
+def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, u8imm:$src),
"shl{b}\t{$src, $dst|$dst, $src}",
[(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
IIC_SR>;
-def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, i8imm:$src),
+def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, u8imm:$src),
"shl{w}\t{$src, $dst|$dst, $src}",
[(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
IIC_SR>, OpSize16;
-def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, i8imm:$src),
+def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, u8imm:$src),
"shl{l}\t{$src, $dst|$dst, $src}",
[(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
IIC_SR>, OpSize32;
-def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, i8imm:$src),
+def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, u8imm:$src),
"shl{q}\t{$src, $dst|$dst, $src}",
[(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
IIC_SR>;
@@ -137,18 +137,18 @@ def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
[(set GR64:$dst, (srl GR64:$src1, CL))], IIC_SR>;
}
-def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$src2),
"shr{b}\t{$src2, $dst|$dst, $src2}",
[(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
-def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
"shr{w}\t{$src2, $dst|$dst, $src2}",
[(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))],
IIC_SR>, OpSize16;
-def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
"shr{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))],
IIC_SR>, OpSize32;
-def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2),
+def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$src2),
"shr{q}\t{$src2, $dst|$dst, $src2}",
[(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))], IIC_SR>;
@@ -185,19 +185,19 @@ def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst),
"shr{q}\t{%cl, $dst|$dst, cl}",
[(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
}
-def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src),
+def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, u8imm:$src),
"shr{b}\t{$src, $dst|$dst, $src}",
[(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
IIC_SR>;
-def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, i8imm:$src),
+def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, u8imm:$src),
"shr{w}\t{$src, $dst|$dst, $src}",
[(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
IIC_SR>, OpSize16;
-def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, i8imm:$src),
+def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, u8imm:$src),
"shr{l}\t{$src, $dst|$dst, $src}",
[(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
IIC_SR>, OpSize32;
-def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, i8imm:$src),
+def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, u8imm:$src),
"shr{q}\t{$src, $dst|$dst, $src}",
[(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
IIC_SR>;
@@ -241,20 +241,20 @@ def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
IIC_SR>;
}
-def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
"sar{b}\t{$src2, $dst|$dst, $src2}",
[(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))],
IIC_SR>;
-def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
"sar{w}\t{$src2, $dst|$dst, $src2}",
[(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))],
IIC_SR>, OpSize16;
-def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
"sar{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))],
IIC_SR>, OpSize32;
def SAR64ri : RIi8<0xC1, MRM7r, (outs GR64:$dst),
- (ins GR64:$src1, i8imm:$src2),
+ (ins GR64:$src1, u8imm:$src2),
"sar{q}\t{$src2, $dst|$dst, $src2}",
[(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))],
IIC_SR>;
@@ -298,19 +298,19 @@ def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst),
[(store (sra (loadi64 addr:$dst), CL), addr:$dst)],
IIC_SR>;
}
-def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, i8imm:$src),
+def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, u8imm:$src),
"sar{b}\t{$src, $dst|$dst, $src}",
[(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
IIC_SR>;
-def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, i8imm:$src),
+def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, u8imm:$src),
"sar{w}\t{$src, $dst|$dst, $src}",
[(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
IIC_SR>, OpSize16;
-def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, i8imm:$src),
+def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, u8imm:$src),
"sar{l}\t{$src, $dst|$dst, $src}",
[(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
IIC_SR>, OpSize32;
-def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, i8imm:$src),
+def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, u8imm:$src),
"sar{q}\t{$src, $dst|$dst, $src}",
[(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
IIC_SR>;
@@ -342,7 +342,7 @@ let hasSideEffects = 0 in {
let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
"rcl{b}\t$dst", [], IIC_SR>;
-def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt),
+def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt),
"rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
let Uses = [CL] in
def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
@@ -350,7 +350,7 @@ def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
"rcl{w}\t$dst", [], IIC_SR>, OpSize16;
-def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt),
+def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt),
"rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
let Uses = [CL] in
def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
@@ -358,7 +358,7 @@ def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
"rcl{l}\t$dst", [], IIC_SR>, OpSize32;
-def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt),
+def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt),
"rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
let Uses = [CL] in
def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
@@ -367,7 +367,7 @@ def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
"rcl{q}\t$dst", [], IIC_SR>;
-def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt),
+def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt),
"rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
let Uses = [CL] in
def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
@@ -376,7 +376,7 @@ def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
"rcr{b}\t$dst", [], IIC_SR>;
-def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt),
+def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt),
"rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
let Uses = [CL] in
def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
@@ -384,7 +384,7 @@ def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
"rcr{w}\t$dst", [], IIC_SR>, OpSize16;
-def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt),
+def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt),
"rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
let Uses = [CL] in
def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
@@ -392,7 +392,7 @@ def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
"rcr{l}\t$dst", [], IIC_SR>, OpSize32;
-def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt),
+def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt),
"rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
let Uses = [CL] in
def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
@@ -400,7 +400,7 @@ def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
"rcr{q}\t$dst", [], IIC_SR>;
-def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt),
+def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt),
"rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
let Uses = [CL] in
def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
@@ -411,36 +411,36 @@ def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
let SchedRW = [WriteShiftLd, WriteRMW] in {
def RCL8m1 : I<0xD0, MRM2m, (outs), (ins i8mem:$dst),
"rcl{b}\t$dst", [], IIC_SR>;
-def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, i8imm:$cnt),
+def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, u8imm:$cnt),
"rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
def RCL16m1 : I<0xD1, MRM2m, (outs), (ins i16mem:$dst),
"rcl{w}\t$dst", [], IIC_SR>, OpSize16;
-def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, i8imm:$cnt),
+def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, u8imm:$cnt),
"rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
def RCL32m1 : I<0xD1, MRM2m, (outs), (ins i32mem:$dst),
"rcl{l}\t$dst", [], IIC_SR>, OpSize32;
-def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, i8imm:$cnt),
+def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, u8imm:$cnt),
"rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst),
"rcl{q}\t$dst", [], IIC_SR>;
-def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, i8imm:$cnt),
+def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, u8imm:$cnt),
"rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
def RCR8m1 : I<0xD0, MRM3m, (outs), (ins i8mem:$dst),
"rcr{b}\t$dst", [], IIC_SR>;
-def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, i8imm:$cnt),
+def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, u8imm:$cnt),
"rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
def RCR16m1 : I<0xD1, MRM3m, (outs), (ins i16mem:$dst),
"rcr{w}\t$dst", [], IIC_SR>, OpSize16;
-def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, i8imm:$cnt),
+def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, u8imm:$cnt),
"rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
def RCR32m1 : I<0xD1, MRM3m, (outs), (ins i32mem:$dst),
"rcr{l}\t$dst", [], IIC_SR>, OpSize32;
-def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, i8imm:$cnt),
+def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, u8imm:$cnt),
"rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst),
"rcr{q}\t$dst", [], IIC_SR>;
-def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, i8imm:$cnt),
+def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, u8imm:$cnt),
"rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
let Uses = [CL] in {
@@ -482,19 +482,19 @@ def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
[(set GR64:$dst, (rotl GR64:$src1, CL))], IIC_SR>;
}
-def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
"rol{b}\t{$src2, $dst|$dst, $src2}",
[(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
-def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
"rol{w}\t{$src2, $dst|$dst, $src2}",
[(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))],
IIC_SR>, OpSize16;
-def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
"rol{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))],
IIC_SR>, OpSize32;
def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst),
- (ins GR64:$src1, i8imm:$src2),
+ (ins GR64:$src1, u8imm:$src2),
"rol{q}\t{$src2, $dst|$dst, $src2}",
[(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))],
IIC_SR>;
@@ -537,19 +537,19 @@ def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst),
[(store (rotl (loadi64 addr:$dst), CL), addr:$dst)],
IIC_SR>;
}
-def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, i8imm:$src1),
+def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, u8imm:$src1),
"rol{b}\t{$src1, $dst|$dst, $src1}",
[(store (rotl (loadi8 addr:$dst), (i8 imm:$src1)), addr:$dst)],
IIC_SR>;
-def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, i8imm:$src1),
+def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, u8imm:$src1),
"rol{w}\t{$src1, $dst|$dst, $src1}",
[(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)],
IIC_SR>, OpSize16;
-def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, i8imm:$src1),
+def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, u8imm:$src1),
"rol{l}\t{$src1, $dst|$dst, $src1}",
[(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)],
IIC_SR>, OpSize32;
-def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, i8imm:$src1),
+def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, u8imm:$src1),
"rol{q}\t{$src1, $dst|$dst, $src1}",
[(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)],
IIC_SR>;
@@ -589,19 +589,19 @@ def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
[(set GR64:$dst, (rotr GR64:$src1, CL))], IIC_SR>;
}
-def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
"ror{b}\t{$src2, $dst|$dst, $src2}",
[(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
-def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
"ror{w}\t{$src2, $dst|$dst, $src2}",
[(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))],
IIC_SR>, OpSize16;
-def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
"ror{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))],
IIC_SR>, OpSize32;
def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst),
- (ins GR64:$src1, i8imm:$src2),
+ (ins GR64:$src1, u8imm:$src2),
"ror{q}\t{$src2, $dst|$dst, $src2}",
[(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))],
IIC_SR>;
@@ -644,19 +644,19 @@ def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst),
[(store (rotr (loadi64 addr:$dst), CL), addr:$dst)],
IIC_SR>;
}
-def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, i8imm:$src),
+def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, u8imm:$src),
"ror{b}\t{$src, $dst|$dst, $src}",
[(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
IIC_SR>;
-def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, i8imm:$src),
+def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, u8imm:$src),
"ror{w}\t{$src, $dst|$dst, $src}",
[(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
IIC_SR>, OpSize16;
-def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, i8imm:$src),
+def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, u8imm:$src),
"ror{l}\t{$src, $dst|$dst, $src}",
[(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
IIC_SR>, OpSize32;
-def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, i8imm:$src),
+def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, u8imm:$src),
"ror{q}\t{$src, $dst|$dst, $src}",
[(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
IIC_SR>;
@@ -727,42 +727,42 @@ def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst),
let isCommutable = 1 in { // These instructions commute to each other.
def SHLD16rri8 : Ii8<0xA4, MRMDestReg,
(outs GR16:$dst),
- (ins GR16:$src1, GR16:$src2, i8imm:$src3),
+ (ins GR16:$src1, GR16:$src2, u8imm:$src3),
"shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2,
(i8 imm:$src3)))], IIC_SHD16_REG_IM>,
TB, OpSize16;
def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
(outs GR16:$dst),
- (ins GR16:$src1, GR16:$src2, i8imm:$src3),
+ (ins GR16:$src1, GR16:$src2, u8imm:$src3),
"shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2,
(i8 imm:$src3)))], IIC_SHD16_REG_IM>,
TB, OpSize16;
def SHLD32rri8 : Ii8<0xA4, MRMDestReg,
(outs GR32:$dst),
- (ins GR32:$src1, GR32:$src2, i8imm:$src3),
+ (ins GR32:$src1, GR32:$src2, u8imm:$src3),
"shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2,
(i8 imm:$src3)))], IIC_SHD32_REG_IM>,
TB, OpSize32;
def SHRD32rri8 : Ii8<0xAC, MRMDestReg,
(outs GR32:$dst),
- (ins GR32:$src1, GR32:$src2, i8imm:$src3),
+ (ins GR32:$src1, GR32:$src2, u8imm:$src3),
"shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2,
(i8 imm:$src3)))], IIC_SHD32_REG_IM>,
TB, OpSize32;
def SHLD64rri8 : RIi8<0xA4, MRMDestReg,
(outs GR64:$dst),
- (ins GR64:$src1, GR64:$src2, i8imm:$src3),
+ (ins GR64:$src1, GR64:$src2, u8imm:$src3),
"shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2,
(i8 imm:$src3)))], IIC_SHD64_REG_IM>,
TB;
def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
(outs GR64:$dst),
- (ins GR64:$src1, GR64:$src2, i8imm:$src3),
+ (ins GR64:$src1, GR64:$src2, u8imm:$src3),
"shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2,
(i8 imm:$src3)))], IIC_SHD64_REG_IM>,
@@ -801,14 +801,14 @@ def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
}
def SHLD16mri8 : Ii8<0xA4, MRMDestMem,
- (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3),
+ (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3),
"shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shld (loadi16 addr:$dst), GR16:$src2,
(i8 imm:$src3)), addr:$dst)],
IIC_SHD16_MEM_IM>,
TB, OpSize16;
def SHRD16mri8 : Ii8<0xAC, MRMDestMem,
- (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3),
+ (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3),
"shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shrd (loadi16 addr:$dst), GR16:$src2,
(i8 imm:$src3)), addr:$dst)],
@@ -816,14 +816,14 @@ def SHRD16mri8 : Ii8<0xAC, MRMDestMem,
TB, OpSize16;
def SHLD32mri8 : Ii8<0xA4, MRMDestMem,
- (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3),
+ (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3),
"shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shld (loadi32 addr:$dst), GR32:$src2,
(i8 imm:$src3)), addr:$dst)],
IIC_SHD32_MEM_IM>,
TB, OpSize32;
def SHRD32mri8 : Ii8<0xAC, MRMDestMem,
- (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3),
+ (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3),
"shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shrd (loadi32 addr:$dst), GR32:$src2,
(i8 imm:$src3)), addr:$dst)],
@@ -831,14 +831,14 @@ def SHRD32mri8 : Ii8<0xAC, MRMDestMem,
TB, OpSize32;
def SHLD64mri8 : RIi8<0xA4, MRMDestMem,
- (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3),
+ (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3),
"shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shld (loadi64 addr:$dst), GR64:$src2,
(i8 imm:$src3)), addr:$dst)],
IIC_SHD64_MEM_IM>,
TB;
def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
- (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3),
+ (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3),
"shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shrd (loadi64 addr:$dst), GR64:$src2,
(i8 imm:$src3)), addr:$dst)],
@@ -860,12 +860,12 @@ def ROT64L2R_imm8 : SDNodeXForm<imm, [{
multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> {
let hasSideEffects = 0 in {
- def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i8imm:$src2),
+ def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>, TAXD, VEX, Sched<[WriteShift]>;
let mayLoad = 1 in
def mi : Ii8<0xF0, MRMSrcMem, (outs RC:$dst),
- (ins x86memop:$src1, i8imm:$src2),
+ (ins x86memop:$src1, u8imm:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>, TAXD, VEX, Sched<[WriteShiftLd]>;
}
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index 0350566f8b9b..85e17f516f91 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -44,7 +44,7 @@ def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3",
let SchedRW = [WriteSystem] in {
-def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap",
+def INT : Ii8<0xcd, RawFrm, (outs), (ins u8imm:$trap), "int\t$trap",
[(int_x86_int imm:$trap)], IIC_INT>;
@@ -60,12 +60,6 @@ def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", [],
IIC_SYS_ENTER_EXIT>, TB;
def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexit{q}", [],
IIC_SYS_ENTER_EXIT>, TB, Requires<[In64BitMode]>;
-
-def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>, OpSize16;
-def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l|d}", [], IIC_IRET>,
- OpSize32;
-def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", [], IIC_IRET>,
- Requires<[In64BitMode]>;
} // SchedRW
def : Pat<(debugtrap),
@@ -88,13 +82,13 @@ def IN32rr : I<0xED, RawFrm, (outs), (ins),
"in{l}\t{%dx, %eax|eax, dx}", [], IIC_IN_RR>, OpSize32;
let Defs = [AL] in
-def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins i8imm:$port),
+def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins u8imm:$port),
"in{b}\t{$port, %al|al, $port}", [], IIC_IN_RI>;
let Defs = [AX] in
-def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port),
+def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port),
"in{w}\t{$port, %ax|ax, $port}", [], IIC_IN_RI>, OpSize16;
let Defs = [EAX] in
-def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port),
+def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port),
"in{l}\t{$port, %eax|eax, $port}", [], IIC_IN_RI>, OpSize32;
let Uses = [DX, AL] in
@@ -108,13 +102,13 @@ def OUT32rr : I<0xEF, RawFrm, (outs), (ins),
"out{l}\t{%eax, %dx|dx, eax}", [], IIC_OUT_RR>, OpSize32;
let Uses = [AL] in
-def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins i8imm:$port),
+def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins u8imm:$port),
"out{b}\t{%al, $port|$port, al}", [], IIC_OUT_IR>;
let Uses = [AX] in
-def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port),
+def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port),
"out{w}\t{%ax, $port|$port, ax}", [], IIC_OUT_IR>, OpSize16;
let Uses = [EAX] in
-def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port),
+def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port),
"out{l}\t{%eax, $port|$port, eax}", [], IIC_OUT_IR>, OpSize32;
} // SchedRW
@@ -478,39 +472,60 @@ def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB;
//===----------------------------------------------------------------------===//
// XSAVE instructions
let SchedRW = [WriteSystem] in {
+let Predicates = [HasXSAVE] in {
let Defs = [EDX, EAX], Uses = [ECX] in
def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB;
let Uses = [EDX, EAX, ECX] in
def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", []>, TB;
+}
-let Uses = [RDX, RAX] in {
- def XSAVE : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins),
- "xsave\t$dst", []>, TB;
- def XSAVE64 : RI<0xAE, MRM4m, (outs opaque512mem:$dst), (ins),
- "xsave64\t$dst", []>, TB, Requires<[In64BitMode]>;
+let Uses = [EDX, EAX] in {
+let Predicates = [HasXSAVE] in {
+ def XSAVE : I<0xAE, MRM4m, (outs), (ins opaque512mem:$dst),
+ "xsave\t$dst",
+ [(int_x86_xsave addr:$dst, EDX, EAX)]>, TB;
+ def XSAVE64 : RI<0xAE, MRM4m, (outs), (ins opaque512mem:$dst),
+ "xsave64\t$dst",
+ [(int_x86_xsave64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
- "xrstor\t$dst", []>, TB;
+ "xrstor\t$dst",
+ [(int_x86_xrstor addr:$dst, EDX, EAX)]>, TB;
def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
- "xrstor64\t$dst", []>, TB, Requires<[In64BitMode]>;
- def XSAVEOPT : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins),
- "xsaveopt\t$dst", []>, PS;
- def XSAVEOPT64 : RI<0xAE, MRM6m, (outs opaque512mem:$dst), (ins),
- "xsaveopt64\t$dst", []>, PS, Requires<[In64BitMode]>;
-
+ "xrstor64\t$dst",
+ [(int_x86_xrstor64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
+}
+let Predicates = [HasXSAVEOPT] in {
+ def XSAVEOPT : I<0xAE, MRM6m, (outs), (ins opaque512mem:$dst),
+ "xsaveopt\t$dst",
+ [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, TB;
+ def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaque512mem:$dst),
+ "xsaveopt64\t$dst",
+ [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
+}
+let Predicates = [HasXSAVEC] in {
+ def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaque512mem:$dst),
+ "xsavec\t$dst",
+ [(int_x86_xsavec addr:$dst, EDX, EAX)]>, TB;
+ def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaque512mem:$dst),
+ "xsavec64\t$dst",
+ [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
+}
+let Predicates = [HasXSAVES] in {
+ def XSAVES : I<0xC7, MRM5m, (outs), (ins opaque512mem:$dst),
+ "xsaves\t$dst",
+ [(int_x86_xsaves addr:$dst, EDX, EAX)]>, TB;
+ def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaque512mem:$dst),
+ "xsaves64\t$dst",
+ [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaque512mem:$dst),
- "xrstors\t$dst", []>, TB;
+ "xrstors\t$dst",
+ [(int_x86_xrstors addr:$dst, EDX, EAX)]>, TB;
def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaque512mem:$dst),
- "xrstors64\t$dst", []>, TB, Requires<[In64BitMode]>;
- def XSAVEC : I<0xC7, MRM4m, (outs opaque512mem:$dst), (ins),
- "xsavec\t$dst", []>, TB;
- def XSAVEC64 : RI<0xC7, MRM4m, (outs opaque512mem:$dst), (ins),
- "xsavec64\t$dst", []>, TB, Requires<[In64BitMode]>;
- def XSAVES : I<0xC7, MRM5m, (outs opaque512mem:$dst), (ins),
- "xsaves\t$dst", []>, TB;
- def XSAVES64 : RI<0xC7, MRM5m, (outs opaque512mem:$dst), (ins),
- "xsaves64\t$dst", []>, TB, Requires<[In64BitMode]>;
+ "xrstors64\t$dst",
+ [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
}
+} // Uses
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -534,6 +549,12 @@ let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in {
}
let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in
def MONTMUL : I<0xa6, MRM_C0, (outs), (ins), "montmul", []>, TB;
+//==-----------------------------------------------------------------------===//
+// PKU - enable protection key
+let Defs = [EAX, EDX], Uses = [ECX] in
+ def RDPKRU : I<0x01, MRM_EE, (outs), (ins), "rdpkru", []>, TB;
+let Uses = [EAX, ECX, EDX] in
+ def WRPKRU : I<0x01, MRM_EF, (outs), (ins), "wrpkru", []>, TB;
//===----------------------------------------------------------------------===//
// FS/GS Base Instructions
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index 8455b8d8467c..4cb2304e464d 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -83,57 +83,64 @@ let ExeDomain = SSEPackedDouble in {
defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>;
}
-multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType vt128> {
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128:$dst, (Int VR128:$src1, VR128:$src2))]>, XOP_4VOp3;
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2))))]>,
+ XOP_4VOp3, Sched<[WriteVarVecShift]>;
def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
- (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2))))]>,
- XOP_4V, VEX_W;
+ (vt128 (OpNode (vt128 VR128:$src1),
+ (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
+ XOP_4V, VEX_W, Sched<[WriteVarVecShift, ReadAfterLd]>;
def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
- (Int (bitconvert (loadv2i64 addr:$src1)), VR128:$src2))]>,
- XOP_4VOp3;
+ (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))),
+ (vt128 VR128:$src2))))]>,
+ XOP_4VOp3, Sched<[WriteVarVecShift, ReadAfterLd]>;
}
let ExeDomain = SSEPackedInt in {
- defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>;
- defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>;
- defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>;
- defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>;
- defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>;
- defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>;
- defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>;
- defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>;
- defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>;
- defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>;
- defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>;
- defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>;
+ defm VPROTB : xop3op<0x90, "vprotb", X86vprot, v16i8>;
+ defm VPROTD : xop3op<0x92, "vprotd", X86vprot, v4i32>;
+ defm VPROTQ : xop3op<0x93, "vprotq", X86vprot, v2i64>;
+ defm VPROTW : xop3op<0x91, "vprotw", X86vprot, v8i16>;
+ defm VPSHAB : xop3op<0x98, "vpshab", X86vpsha, v16i8>;
+ defm VPSHAD : xop3op<0x9A, "vpshad", X86vpsha, v4i32>;
+ defm VPSHAQ : xop3op<0x9B, "vpshaq", X86vpsha, v2i64>;
+ defm VPSHAW : xop3op<0x99, "vpshaw", X86vpsha, v8i16>;
+ defm VPSHLB : xop3op<0x94, "vpshlb", X86vpshl, v16i8>;
+ defm VPSHLD : xop3op<0x96, "vpshld", X86vpshl, v4i32>;
+ defm VPSHLQ : xop3op<0x97, "vpshlq", X86vpshl, v2i64>;
+ defm VPSHLW : xop3op<0x95, "vpshlw", X86vpshl, v8i16>;
}
-multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+multiclass xop3opimm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType vt128> {
def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, i8imm:$src2),
+ (ins VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128:$dst, (Int VR128:$src1, imm:$src2))]>, XOP;
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1), imm:$src2)))]>, XOP;
def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
- (ins i128mem:$src1, i8imm:$src2),
+ (ins i128mem:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
- (Int (bitconvert (loadv2i64 addr:$src1)), imm:$src2))]>, XOP;
+ (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), imm:$src2)))]>, XOP;
}
let ExeDomain = SSEPackedInt in {
- defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>;
- defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>;
- defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>;
- defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>;
+ defm VPROTB : xop3opimm<0xC0, "vprotb", X86vproti, v16i8>;
+ defm VPROTD : xop3opimm<0xC2, "vprotd", X86vproti, v4i32>;
+ defm VPROTQ : xop3opimm<0xC3, "vprotq", X86vproti, v2i64>;
+ defm VPROTW : xop3opimm<0xC1, "vprotw", X86vproti, v8i16>;
}
// Instruction where second source can be memory, but third must be register
@@ -170,30 +177,34 @@ let ExeDomain = SSEPackedInt in {
}
// Instruction where second source can be memory, third must be imm8
-multiclass xopvpcom<bits<8> opc, string Suffix, Intrinsic Int> {
+multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128> {
let isCommutable = 1 in
def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, XOPCC:$cc),
!strconcat("vpcom${cc}", Suffix,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, i8immZExt3:$cc))]>,
+ [(set VR128:$dst,
+ (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
+ i8immZExt3:$cc)))]>,
XOP_4V;
def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, XOPCC:$cc),
!strconcat("vpcom${cc}", Suffix,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
- (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)),
- i8immZExt3:$cc))]>, XOP_4V;
+ (vt128 (OpNode (vt128 VR128:$src1),
+ (vt128 (bitconvert (loadv2i64 addr:$src2))),
+ i8immZExt3:$cc)))]>,
+ XOP_4V;
let isAsmParserOnly = 1, hasSideEffects = 0 in {
def ri_alt : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
!strconcat("vpcom", Suffix,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>, XOP_4V;
let mayLoad = 1 in
def mi_alt : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
!strconcat("vpcom", Suffix,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>, XOP_4V;
@@ -201,14 +212,14 @@ multiclass xopvpcom<bits<8> opc, string Suffix, Intrinsic Int> {
}
let ExeDomain = SSEPackedInt in { // SSE integer instructions
- defm VPCOMB : xopvpcom<0xCC, "b", int_x86_xop_vpcomb>;
- defm VPCOMW : xopvpcom<0xCD, "w", int_x86_xop_vpcomw>;
- defm VPCOMD : xopvpcom<0xCE, "d", int_x86_xop_vpcomd>;
- defm VPCOMQ : xopvpcom<0xCF, "q", int_x86_xop_vpcomq>;
- defm VPCOMUB : xopvpcom<0xEC, "ub", int_x86_xop_vpcomub>;
- defm VPCOMUW : xopvpcom<0xED, "uw", int_x86_xop_vpcomuw>;
- defm VPCOMUD : xopvpcom<0xEE, "ud", int_x86_xop_vpcomud>;
- defm VPCOMUQ : xopvpcom<0xEF, "uq", int_x86_xop_vpcomuq>;
+ defm VPCOMB : xopvpcom<0xCC, "b", X86vpcom, v16i8>;
+ defm VPCOMW : xopvpcom<0xCD, "w", X86vpcom, v8i16>;
+ defm VPCOMD : xopvpcom<0xCE, "d", X86vpcom, v4i32>;
+ defm VPCOMQ : xopvpcom<0xCF, "q", X86vpcom, v2i64>;
+ defm VPCOMUB : xopvpcom<0xEC, "ub", X86vpcomu, v16i8>;
+ defm VPCOMUW : xopvpcom<0xED, "uw", X86vpcomu, v8i16>;
+ defm VPCOMUD : xopvpcom<0xEE, "ud", X86vpcomu, v4i32>;
+ defm VPCOMUQ : xopvpcom<0xEF, "uq", X86vpcomu, v2i64>;
}
// Instruction where either second or third source can be memory
@@ -270,42 +281,52 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
let ExeDomain = SSEPackedInt in
defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>;
+let Predicates = [HasXOP] in {
+ def : Pat<(v2i64 (or (and VR128:$src3, VR128:$src1),
+ (X86andnp VR128:$src3, VR128:$src2))),
+ (VPCMOVrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+
+ def : Pat<(v4i64 (or (and VR256:$src3, VR256:$src1),
+ (X86andnp VR256:$src3, VR256:$src2))),
+ (VPCMOVrrY VR256:$src1, VR256:$src2, VR256:$src3)>;
+}
+
multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128,
Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> {
def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, VR128:$src3, i8imm:$src4),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set VR128:$dst,
(Int128 VR128:$src1, VR128:$src2, VR128:$src3, imm:$src4))]>;
def rm : IXOP5<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, f128mem:$src3, i8imm:$src4),
+ (ins VR128:$src1, VR128:$src2, f128mem:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set VR128:$dst,
(Int128 VR128:$src1, VR128:$src2, (ld_128 addr:$src3), imm:$src4))]>,
VEX_W, MemOp4;
def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, f128mem:$src2, VR128:$src3, i8imm:$src4),
+ (ins VR128:$src1, f128mem:$src2, VR128:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set VR128:$dst,
(Int128 VR128:$src1, (ld_128 addr:$src2), VR128:$src3, imm:$src4))]>;
def rrY : IXOP5<opc, MRMSrcReg, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, VR256:$src3, i8imm:$src4),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set VR256:$dst,
(Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>, VEX_L;
def rmY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, f256mem:$src3, i8imm:$src4),
+ (ins VR256:$src1, VR256:$src2, f256mem:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set VR256:$dst,
(Int256 VR256:$src1, VR256:$src2, (ld_256 addr:$src3), imm:$src4))]>,
VEX_W, MemOp4, VEX_L;
def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, f256mem:$src2, VR256:$src3, i8imm:$src4),
+ (ins VR256:$src1, f256mem:$src2, VR256:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set VR256:$dst,
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 2c8b95bcba22..dc6d85d582c8 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -18,14 +18,19 @@ namespace llvm {
enum IntrinsicType {
INTR_NO_TYPE,
- GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX,
- INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP,
- CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI,
- INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM,
- INTR_TYPE_3OP_MASK, FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK,
- VPERM_3OP_MASKZ,
- INTR_TYPE_SCALAR_MASK_RM, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM,
- EXPAND_FROM_MEM, BLEND
+ GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, FPCLASS, FPCLASSS,
+ INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_2OP_IMM8, INTR_TYPE_3OP, INTR_TYPE_4OP,
+ CMP_MASK, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, VSHIFT_MASK, COMI, COMI_RM,
+ INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM,
+ INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_2OP_IMM8_MASK,
+ INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK,
+ FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK,
+ VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK,
+ INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM,
+ COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC,
+ TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
+ EXPAND_FROM_MEM, BLEND, INSERT_SUBVEC,
+ TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK
};
struct IntrinsicData {
@@ -138,6 +143,42 @@ static const IntrinsicData IntrinsicsWithChain[] = {
EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512,
EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_128, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_256, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_512, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_128, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_256, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_512, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_128, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_256, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_512, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0),
X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0),
X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0),
@@ -209,6 +250,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0),
@@ -241,6 +284,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
+ X86_INTRINSIC_DATA(avx2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
X86_INTRINSIC_DATA(avx2_psign_b, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
X86_INTRINSIC_DATA(avx2_psign_d, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
@@ -274,16 +318,56 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
+ X86_INTRINSIC_DATA(avx512_broadcastmb_128, BROADCASTM, X86ISD::VBROADCASTM, 0),
+ X86_INTRINSIC_DATA(avx512_broadcastmb_256, BROADCASTM, X86ISD::VBROADCASTM, 0),
+ X86_INTRINSIC_DATA(avx512_broadcastmb_512, BROADCASTM, X86ISD::VBROADCASTM, 0),
+ X86_INTRINSIC_DATA(avx512_broadcastmw_128, BROADCASTM, X86ISD::VBROADCASTM, 0),
+ X86_INTRINSIC_DATA(avx512_broadcastmw_256, BROADCASTM, X86ISD::VBROADCASTM, 0),
+ X86_INTRINSIC_DATA(avx512_broadcastmw_512, BROADCASTM, X86ISD::VBROADCASTM, 0),
+ X86_INTRINSIC_DATA(avx512_cvtb2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtb2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtb2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtd2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtd2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtd2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2b_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2b_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2b_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2d_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2d_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2d_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2q_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2q_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2q_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2w_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2w_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtmask2w_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+ X86_INTRINSIC_DATA(avx512_cvtq2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtq2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtq2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_cvtsi2sd32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0),
X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0),
X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0),
X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttsd2si, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttsd2si64, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttsd2usi, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttsd2usi64, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttss2si, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttss2si64, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttss2usi, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttss2usi64, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0),
X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0),
X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0),
X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvtw2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtw2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+ X86_INTRINSIC_DATA(avx512_cvtw2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
-
+ X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0),
+ X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0),
+ X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, X86ISD::FMADD, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, X86ISD::FMADD, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, X86ISD::FMADD,
@@ -371,6 +455,50 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_blend_w_128, BLEND, X86ISD::SELECT, 0),
X86_INTRINSIC_DATA(avx512_mask_blend_w_256, BLEND, X86ISD::SELECT, 0),
X86_INTRINSIC_DATA(avx512_mask_blend_w_512, BLEND, X86ISD::SELECT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcast_sd_pd_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcast_sd_pd_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_256, INTR_TYPE_1OP_MASK,
+ X86ISD::SUBV_BROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_512, INTR_TYPE_1OP_MASK,
+ X86ISD::SUBV_BROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_256, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_512, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf32x8_512, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_256, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_512, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcastf64x4_512, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_128, INTR_TYPE_1OP_MASK,
+ X86ISD::SUBV_BROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_256, INTR_TYPE_1OP_MASK,
+ X86ISD::SUBV_BROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_512, INTR_TYPE_1OP_MASK,
+ X86ISD::SUBV_BROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_256, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_512, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti32x8_512, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_256, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_512, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_broadcasti64x4_512, BRCST_SUBVEC_TO_VEC,
+ X86ISD::SHUF128, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_b_128, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_b_256, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_b_512, CMP_MASK_CC, X86ISD::CMPM, 0),
@@ -388,6 +516,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_cmp_q_128, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_q_256, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_q_512, CMP_MASK_CC, X86ISD::CMPM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC, X86ISD::FSETCC,
+ X86ISD::FSETCC),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC, X86ISD::FSETCC,
+ X86ISD::FSETCC),
X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0),
@@ -415,7 +547,184 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_EXPAND_IN_REG,
X86ISD::COMPRESS, 0),
-
+ X86_INTRINSIC_DATA(avx512_mask_conflict_d_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_conflict_d_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_conflict_d_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_conflict_q_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_conflict_q_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_conflict_q_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CONFLICT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTDQ2PD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_256, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_512, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, 0), // no rm
+ X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_128, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_256, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_512, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, ISD::SINT_TO_FP), //er
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps, INTR_TYPE_1OP_MASK,
+ X86ISD::VFPROUND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_256, INTR_TYPE_1OP_MASK_RM,
+ ISD::FP_ROUND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, INTR_TYPE_1OP_MASK_RM,
+ ISD::FP_ROUND, X86ISD::VFPROUND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VFPEXT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_EXTEND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_EXTEND, X86ISD::VFPEXT),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_SINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_128, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_256, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_512, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, ISD::SINT_TO_FP),
+ X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_128, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_256, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_512, INTR_TYPE_1OP_MASK,
+ ISD::SINT_TO_FP, ISD::SINT_TO_FP),
+ X86_INTRINSIC_DATA(avx512_mask_cvtsd2ss_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::VFPROUND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::VFPEXT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, ISD::FP_TO_SINT),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_128, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, ISD::FP_TO_SINT),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_128, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, ISD::FP_TO_UINT),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_128, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, ISD::FP_TO_UINT),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_128, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, ISD::FP_TO_SINT),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_128, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_SINT, ISD::FP_TO_SINT),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_128, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, ISD::FP_TO_UINT),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_128, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_256, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_512, INTR_TYPE_1OP_MASK,
+ ISD::FP_TO_UINT, ISD::FP_TO_UINT),
+ X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTUDQ2PD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_256, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_512, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, 0), // no rm
+ X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_128, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_256, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_512, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, ISD::UINT_TO_FP),
+ X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_128, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_256, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_512, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, ISD::UINT_TO_FP),
+ X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_128, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_256, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_512, INTR_TYPE_1OP_MASK,
+ ISD::UINT_TO_FP, ISD::UINT_TO_FP),
+ X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_128, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::DBPSADBW, 0),
+ X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_256, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::DBPSADBW, 0),
+ X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_512, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::DBPSADBW, 0),
X86_INTRINSIC_DATA(avx512_mask_div_pd_128, INTR_TYPE_2OP_MASK, ISD::FDIV, 0),
X86_INTRINSIC_DATA(avx512_mask_div_pd_256, INTR_TYPE_2OP_MASK, ISD::FDIV, 0),
X86_INTRINSIC_DATA(avx512_mask_div_pd_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
@@ -452,6 +761,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG,
X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_128, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_256, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_512, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_128, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_256, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_512, FPCLASS, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_sd, FPCLASSS, X86ISD::VFPCLASSS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_fpclass_ss, FPCLASSS, X86ISD::VFPCLASSS, 0),
X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK_RM,
X86ISD::FGETEXP_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_getexp_pd_256, INTR_TYPE_1OP_MASK_RM,
@@ -464,6 +781,62 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::FGETEXP_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_RM,
X86ISD::FGETEXP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_sd, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FGETEXP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getexp_ss, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FGETEXP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_pd_512, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ps_128, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ps_256, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK_RM,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK_RM,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_RM,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512_mask_insertf32x4_256, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_insertf32x4_512, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_insertf32x8_512, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_insertf64x2_256, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_insertf64x2_512, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_insertf64x4_512, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_inserti32x4_256, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_inserti32x4_512, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_inserti32x8_512, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_inserti64x2_256, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_inserti64x2_512, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_inserti64x4_512, INSERT_SUBVEC,
+ ISD::INSERT_SUBVECTOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_128, INTR_TYPE_1OP_MASK,
+ ISD::CTLZ, 0),
+ X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_256, INTR_TYPE_1OP_MASK,
+ ISD::CTLZ, 0),
+ X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_512, INTR_TYPE_1OP_MASK,
+ ISD::CTLZ, 0),
+ X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_128, INTR_TYPE_1OP_MASK,
+ ISD::CTLZ, 0),
+ X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_256, INTR_TYPE_1OP_MASK,
+ ISD::CTLZ, 0),
+ X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_512, INTR_TYPE_1OP_MASK,
+ ISD::CTLZ, 0),
X86_INTRINSIC_DATA(avx512_mask_max_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
X86_INTRINSIC_DATA(avx512_mask_max_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
@@ -472,10 +845,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_max_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
X86ISD::FMAX_RND),
- X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMAX,
- X86ISD::FMAX_RND),
- X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMAX,
- X86ISD::FMAX_RND),
+ X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FMAX, X86ISD::FMAX_RND),
+ X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FMAX, X86ISD::FMAX_RND),
X86_INTRINSIC_DATA(avx512_mask_min_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
X86_INTRINSIC_DATA(avx512_mask_min_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
@@ -484,10 +857,32 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_min_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
X86ISD::FMIN_RND),
- X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMIN,
- X86ISD::FMIN_RND),
- X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMIN,
- X86ISD::FMIN_RND),
+ X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FMIN, X86ISD::FMIN_RND),
+ X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FMIN, X86ISD::FMIN_RND),
+ X86_INTRINSIC_DATA(avx512_mask_movddup_128, INTR_TYPE_1OP_MASK,
+ X86ISD::MOVDDUP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_movddup_256, INTR_TYPE_1OP_MASK,
+ X86ISD::MOVDDUP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_movddup_512, INTR_TYPE_1OP_MASK,
+ X86ISD::MOVDDUP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_move_sd, INTR_TYPE_SCALAR_MASK,
+ X86ISD::MOVSD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_move_ss, INTR_TYPE_SCALAR_MASK,
+ X86ISD::MOVSS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_movshdup_128, INTR_TYPE_1OP_MASK,
+ X86ISD::MOVSHDUP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_movshdup_256, INTR_TYPE_1OP_MASK,
+ X86ISD::MOVSHDUP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_movshdup_512, INTR_TYPE_1OP_MASK,
+ X86ISD::MOVSHDUP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_movsldup_128, INTR_TYPE_1OP_MASK,
+ X86ISD::MOVSLDUP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_movsldup_256, INTR_TYPE_1OP_MASK,
+ X86ISD::MOVSLDUP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_movsldup_512, INTR_TYPE_1OP_MASK,
+ X86ISD::MOVSLDUP, 0),
X86_INTRINSIC_DATA(avx512_mask_mul_pd_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0),
X86_INTRINSIC_DATA(avx512_mask_mul_pd_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0),
X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
@@ -554,6 +949,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_palignr_128, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::PALIGNR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_palignr_256, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::PALIGNR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_palignr_512, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::PALIGNR, 0),
X86_INTRINSIC_DATA(avx512_mask_pand_d_128, INTR_TYPE_2OP_MASK, ISD::AND, 0),
X86_INTRINSIC_DATA(avx512_mask_pand_d_256, INTR_TYPE_2OP_MASK, ISD::AND, 0),
X86_INTRINSIC_DATA(avx512_mask_pand_d_512, INTR_TYPE_2OP_MASK, ISD::AND, 0),
@@ -596,6 +997,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_128, CMP_MASK, X86ISD::PCMPGTM, 0),
X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_256, CMP_MASK, X86ISD::PCMPGTM, 0),
X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_512, CMP_MASK, X86ISD::PCMPGTM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_128, INTR_TYPE_2OP_MASK,
+ X86ISD::VPMADDUBSW, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_256, INTR_TYPE_2OP_MASK,
+ X86ISD::VPMADDUBSW, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_512, INTR_TYPE_2OP_MASK,
+ X86ISD::VPMADDUBSW, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_128, INTR_TYPE_2OP_MASK,
+ X86ISD::VPMADDWD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_256, INTR_TYPE_2OP_MASK,
+ X86ISD::VPMADDWD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_512, INTR_TYPE_2OP_MASK,
+ X86ISD::VPMADDWD, 0),
X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
@@ -644,6 +1057,114 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_pminu_w_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
X86_INTRINSIC_DATA(avx512_mask_pminu_w_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
X86_INTRINSIC_DATA(avx512_mask_pminu_w_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_db_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_db_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_db_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_dw_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_dw_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_dw_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qb_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qb_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qb_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qd_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qd_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qd_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qw_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qw_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_qw_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_wb_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_wb_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmov_wb_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_db_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_db_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_db_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_db_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_db_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_db_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VTRUNCUS, 0),
X86_INTRINSIC_DATA(avx512_mask_pmul_dq_128, INTR_TYPE_2OP_MASK,
X86ISD::PMULDQ, 0),
X86_INTRINSIC_DATA(avx512_mask_pmul_dq_256, INTR_TYPE_2OP_MASK,
@@ -700,6 +1221,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_psrav_q, INTR_TYPE_2OP_MASK, ISD::SRA, 0),
X86_INTRINSIC_DATA(avx512_mask_psrl_d, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
X86_INTRINSIC_DATA(avx512_mask_psrl_q, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_wi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_wi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
X86_INTRINSIC_DATA(avx512_mask_psrli_d, VSHIFT_MASK, X86ISD::VSRLI, 0),
X86_INTRINSIC_DATA(avx512_mask_psrli_q, VSHIFT_MASK, X86ISD::VSRLI, 0),
X86_INTRINSIC_DATA(avx512_mask_psrlv_d, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
@@ -728,16 +1255,98 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_psubus_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
X86_INTRINSIC_DATA(avx512_mask_psubus_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
X86_INTRINSIC_DATA(avx512_mask_psubus_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pternlog_d_128, TERLOG_OP_MASK,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pternlog_d_256, TERLOG_OP_MASK,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pternlog_d_512, TERLOG_OP_MASK,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pternlog_q_128, TERLOG_OP_MASK,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pternlog_q_256, TERLOG_OP_MASK,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pternlog_q_512, TERLOG_OP_MASK,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_128, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_256, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_512, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_128, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_256, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_512, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_128, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_256, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_512, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_128, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_256, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_512, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_128, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_256, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_512, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckld_q_128, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckld_q_256, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpckld_q_512, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_128, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_256, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_512, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_128, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_256, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_512, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
X86_INTRINSIC_DATA(avx512_mask_pxor_d_128, INTR_TYPE_2OP_MASK, ISD::XOR, 0),
X86_INTRINSIC_DATA(avx512_mask_pxor_d_256, INTR_TYPE_2OP_MASK, ISD::XOR, 0),
X86_INTRINSIC_DATA(avx512_mask_pxor_d_512, INTR_TYPE_2OP_MASK, ISD::XOR, 0),
X86_INTRINSIC_DATA(avx512_mask_pxor_q_128, INTR_TYPE_2OP_MASK, ISD::XOR, 0),
X86_INTRINSIC_DATA(avx512_mask_pxor_q_256, INTR_TYPE_2OP_MASK, ISD::XOR, 0),
X86_INTRINSIC_DATA(avx512_mask_pxor_q_512, INTR_TYPE_2OP_MASK, ISD::XOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_ps_128, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_ps_256, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_ps_512, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(avx512_mask_rndscale_sd, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::RNDSCALE, 0),
+ X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::RNDSCALE, 0),
+ X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(avx512_mask_scalef_pd_128, INTR_TYPE_2OP_MASK_RM,
X86ISD::SCALEF, 0),
X86_INTRINSIC_DATA(avx512_mask_scalef_pd_256, INTR_TYPE_2OP_MASK_RM,
@@ -750,6 +1359,38 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::SCALEF, 0),
X86_INTRINSIC_DATA(avx512_mask_scalef_ps_512, INTR_TYPE_2OP_MASK_RM,
X86ISD::SCALEF, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_sd, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::SCALEF, 0),
+ X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::SCALEF, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4_256, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2_256, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4_256, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2_256, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUF128, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_pd_128, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUFP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_pd_256, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUFP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_pd_512, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUFP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_ps_128, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUFP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_ps_256, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUFP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_shuf_ps_512, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::SHUFP, 0),
X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK_RM, ISD::FSQRT,
@@ -758,6 +1399,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK_RM, ISD::FSQRT,
X86ISD::FSQRT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FSQRT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FSQRT_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_sub_pd_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_sub_pd_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
@@ -782,9 +1427,54 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_valign_d_512, INTR_TYPE_3OP_MASK, X86ISD::VALIGN, 0),
- X86_INTRINSIC_DATA(avx512_mask_valign_q_512, INTR_TYPE_3OP_MASK, X86ISD::VALIGN, 0),
-
+ X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_128, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_256, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_512, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_128, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_256, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_512, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_128, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_256, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_512, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_128, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_256, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_512, INTR_TYPE_2OP_MASK,
+ X86ISD::UNPCKL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_valign_d_128, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::VALIGN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_valign_d_256, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::VALIGN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_valign_d_512, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::VALIGN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_valign_q_128, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::VALIGN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_valign_q_256, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::VALIGN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_valign_q_512, INTR_TYPE_3OP_IMM8_MASK,
+ X86ISD::VALIGN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK_RM,
+ ISD::FP16_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK_RM,
+ ISD::FP16_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_RM,
+ ISD::FP16_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, INTR_TYPE_2OP_MASK_RM,
+ ISD::FP_TO_FP16, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, INTR_TYPE_2OP_MASK_RM,
+ ISD::FP_TO_FP16, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, INTR_TYPE_2OP_MASK_RM,
+ ISD::FP_TO_FP16, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_128, FMA_OP_MASK, X86ISD::FMADD, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_256, FMA_OP_MASK, X86ISD::FMADD, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_512, FMA_OP_MASK, X86ISD::FMADD,
@@ -821,7 +1511,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_512, FMA_OP_MASK, X86ISD::FNMSUB,
X86ISD::FNMSUB_RND),
-
X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_128, VPERM_3OP_MASK,
X86ISD::VPERMIV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_256, VPERM_3OP_MASK,
@@ -852,54 +1541,56 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VPERMIV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_512, VPERM_3OP_MASK,
X86ISD::VPERMIV3, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_128, INTR_TYPE_2OP_IMM8_MASK,
+ X86ISD::VPERMILPI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_256, INTR_TYPE_2OP_IMM8_MASK,
+ X86ISD::VPERMILPI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_512, INTR_TYPE_2OP_IMM8_MASK,
+ X86ISD::VPERMILPI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_128, INTR_TYPE_2OP_IMM8_MASK,
+ X86ISD::VPERMILPI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_256, INTR_TYPE_2OP_IMM8_MASK,
+ X86ISD::VPERMILPI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_512, INTR_TYPE_2OP_IMM8_MASK,
+ X86ISD::VPERMILPI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_128, INTR_TYPE_2OP_MASK,
+ X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_256, INTR_TYPE_2OP_MASK,
+ X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_512, INTR_TYPE_2OP_MASK,
+ X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_128, INTR_TYPE_2OP_MASK,
+ X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_256, INTR_TYPE_2OP_MASK,
+ X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_512, INTR_TYPE_2OP_MASK,
+ X86ISD::VPERMILPV, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_128, VPERM_3OP_MASK,
X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_128, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_256, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_256, VPERM_3OP_MASK,
X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_512, VPERM_3OP_MASK,
X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_128, VPERM_3OP_MASK,
X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_128, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_256, VPERM_3OP_MASK,
X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_256, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_512, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_512, VPERM_3OP_MASK,
X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_128, VPERM_3OP_MASK,
X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_128, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_256, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_256, VPERM_3OP_MASK,
X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_512, VPERM_3OP_MASK,
X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_128, VPERM_3OP_MASK,
X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_128, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_256, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_256, VPERM_3OP_MASK,
X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_512, VPERM_3OP_MASK,
X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_128, VPERM_3OP_MASK,
X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_128, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_256, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_256, VPERM_3OP_MASK,
X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_512, VPERM_3OP_MASK,
@@ -910,7 +1601,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_xor_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
X86_INTRINSIC_DATA(avx512_mask_xor_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
X86_INTRINSIC_DATA(avx512_mask_xor_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
-
+ X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_128, TERLOG_OP_MASKZ,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_256, TERLOG_OP_MASKZ,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_512, TERLOG_OP_MASKZ,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_128, TERLOG_OP_MASKZ,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_256, TERLOG_OP_MASKZ,
+ X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_512, TERLOG_OP_MASKZ,
+ X86ISD::VPTERNLOG, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_128, FMA_OP_MASKZ, X86ISD::FMADD, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_256, FMA_OP_MASKZ, X86ISD::FMADD, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_512, FMA_OP_MASKZ, X86ISD::FMADD,
@@ -959,14 +1661,59 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_512, VPERM_3OP_MASKZ,
X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0),
- X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0),
- X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0),
- X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0),
+ X86_INTRINSIC_DATA(avx512_pbroadcastb_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_pbroadcastb_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_pbroadcastb_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_pbroadcastd_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_pbroadcastd_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_pbroadcastd_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_pbroadcastq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_pbroadcastq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_pbroadcastq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_pbroadcastw_128, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_pbroadcastw_256, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_pbroadcastw_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VBROADCAST, 0),
+ X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
+ X86_INTRINSIC_DATA(avx512_psll_dq_512, INTR_TYPE_2OP_IMM8, X86ISD::VSHLDQ, 0),
+ X86_INTRINSIC_DATA(avx512_psrl_dq_512, INTR_TYPE_2OP_IMM8, X86ISD::VSRLDQ, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
+ X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
+ X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0),
+ X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRT, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRT, 0),
X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0),
X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0),
+ X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
+ X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
@@ -1017,6 +1764,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(sse2_pmaxs_w, INTR_TYPE_2OP, ISD::SMAX, 0),
X86_INTRINSIC_DATA(sse2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0),
X86_INTRINSIC_DATA(sse2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0),
@@ -1024,6 +1773,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
+ X86_INTRINSIC_DATA(sse2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
X86_INTRINSIC_DATA(sse2_pshuf_d, INTR_TYPE_2OP, X86ISD::PSHUFD, 0),
X86_INTRINSIC_DATA(sse2_pshufh_w, INTR_TYPE_2OP, X86ISD::PSHUFHW, 0),
X86_INTRINSIC_DATA(sse2_pshufl_w, INTR_TYPE_2OP, X86ISD::PSHUFLW, 0),
@@ -1066,12 +1816,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse41_pminsd, INTR_TYPE_2OP, ISD::SMIN, 0),
X86_INTRINSIC_DATA(sse41_pminud, INTR_TYPE_2OP, ISD::UMIN, 0),
X86_INTRINSIC_DATA(sse41_pminuw, INTR_TYPE_2OP, ISD::UMIN, 0),
- X86_INTRINSIC_DATA(sse41_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(sse41_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(sse41_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(sse41_pmovsxdq, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(sse41_pmovsxwd, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(sse41_pmovsxwq, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
X86_INTRINSIC_DATA(sse41_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
X86_INTRINSIC_DATA(sse41_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
X86_INTRINSIC_DATA(sse41_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
@@ -1105,7 +1849,31 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
X86_INTRINSIC_DATA(ssse3_psign_b_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
X86_INTRINSIC_DATA(ssse3_psign_d_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
- X86_INTRINSIC_DATA(ssse3_psign_w_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0)
+ X86_INTRINSIC_DATA(ssse3_psign_w_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
+ X86_INTRINSIC_DATA(xop_vpcomb, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
+ X86_INTRINSIC_DATA(xop_vpcomd, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
+ X86_INTRINSIC_DATA(xop_vpcomq, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
+ X86_INTRINSIC_DATA(xop_vpcomub, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
+ X86_INTRINSIC_DATA(xop_vpcomud, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
+ X86_INTRINSIC_DATA(xop_vpcomuq, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
+ X86_INTRINSIC_DATA(xop_vpcomuw, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
+ X86_INTRINSIC_DATA(xop_vpcomw, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
+ X86_INTRINSIC_DATA(xop_vprotb, INTR_TYPE_2OP, X86ISD::VPROT, 0),
+ X86_INTRINSIC_DATA(xop_vprotbi, INTR_TYPE_2OP, X86ISD::VPROTI, 0),
+ X86_INTRINSIC_DATA(xop_vprotd, INTR_TYPE_2OP, X86ISD::VPROT, 0),
+ X86_INTRINSIC_DATA(xop_vprotdi, INTR_TYPE_2OP, X86ISD::VPROTI, 0),
+ X86_INTRINSIC_DATA(xop_vprotq, INTR_TYPE_2OP, X86ISD::VPROT, 0),
+ X86_INTRINSIC_DATA(xop_vprotqi, INTR_TYPE_2OP, X86ISD::VPROTI, 0),
+ X86_INTRINSIC_DATA(xop_vprotw, INTR_TYPE_2OP, X86ISD::VPROT, 0),
+ X86_INTRINSIC_DATA(xop_vprotwi, INTR_TYPE_2OP, X86ISD::VPROTI, 0),
+ X86_INTRINSIC_DATA(xop_vpshab, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+ X86_INTRINSIC_DATA(xop_vpshad, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+ X86_INTRINSIC_DATA(xop_vpshaq, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+ X86_INTRINSIC_DATA(xop_vpshaw, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+ X86_INTRINSIC_DATA(xop_vpshlb, INTR_TYPE_2OP, X86ISD::VPSHL, 0),
+ X86_INTRINSIC_DATA(xop_vpshld, INTR_TYPE_2OP, X86ISD::VPSHL, 0),
+ X86_INTRINSIC_DATA(xop_vpshlq, INTR_TYPE_2OP, X86ISD::VPSHL, 0),
+ X86_INTRINSIC_DATA(xop_vpshlw, INTR_TYPE_2OP, X86ISD::VPSHL, 0)
};
/*
@@ -1128,6 +1896,102 @@ static void verifyIntrinsicTables() {
std::is_sorted(std::begin(IntrinsicsWithChain),
std::end(IntrinsicsWithChain)) &&
"Intrinsic data tables should be sorted by Intrinsic ID");
+ assert((std::adjacent_find(std::begin(IntrinsicsWithoutChain),
+ std::end(IntrinsicsWithoutChain)) ==
+ std::end(IntrinsicsWithoutChain)) &&
+ (std::adjacent_find(std::begin(IntrinsicsWithChain),
+ std::end(IntrinsicsWithChain)) ==
+ std::end(IntrinsicsWithChain)) &&
+ "Intrinsic data tables should have unique entries");
+}
+
+// X86-specific compare constants.
+// They must be kept in sync with avxintrin.h.
+#define _X86_CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
+#define _X86_CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */
+#define _X86_CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */
+#define _X86_CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */
+#define _X86_CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */
+#define _X86_CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
+#define _X86_CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
+#define _X86_CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */
+#define _X86_CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */
+#define _X86_CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */
+#define _X86_CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */
+#define _X86_CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
+#define _X86_CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */
+#define _X86_CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */
+#define _X86_CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */
+#define _X86_CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */
+#define _X86_CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */
+#define _X86_CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */
+#define _X86_CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */
+#define _X86_CMP_UNORD_S 0x13 /* Unordered (signaling) */
+#define _X86_CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */
+#define _X86_CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */
+#define _X86_CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */
+#define _X86_CMP_ORD_S 0x17 /* Ordered (signaling) */
+#define _X86_CMP_EQ_US 0x18 /* Equal (unordered, signaling) */
+#define _X86_CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unord, non-sign) */
+#define _X86_CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */
+#define _X86_CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
+#define _X86_CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */
+#define _X86_CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */
+#define _X86_CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */
+#define _X86_CMP_TRUE_US 0x1f /* True (unordered, signaling) */
+
+/*
+* Get the comparison modifier from the _mm_comi_round_sd/ss intrinsic and
+* return the tuple <isOrdered, X86 condcode>.
+*/
+static std::tuple<bool,unsigned> TranslateX86ConstCondToX86CC(SDValue &imm) {
+ ConstantSDNode *CImm = dyn_cast<ConstantSDNode>(imm);
+ unsigned IntImm = CImm->getZExtValue();
+ // On a floating point condition, the flags are set as follows:
+ // ZF PF CF op
+ // 0 | 0 | 0 | X > Y
+ // 0 | 0 | 1 | X < Y
+ // 1 | 0 | 0 | X == Y
+ // 1 | 1 | 1 | unordered
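+  // The bool in the returned tuple reports whether the predicate is an
+  // *ordered* comparison; presumably the caller uses it to choose between the
+  // COMI (signals on QNaN) and UCOMI (quiet) forms when lowering the
+  // intrinsic.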
+ switch (IntImm) {
+ default: llvm_unreachable("Invalid floating point compare value for Comi!");
+ case _X86_CMP_EQ_OQ: // 0x00 - Equal (ordered, nonsignaling)
+ case _X86_CMP_EQ_OS: // 0x10 - Equal (ordered, signaling)
+ return std::make_tuple(true, X86::COND_E);
+ case _X86_CMP_EQ_UQ: // 0x08 - Equal (unordered, non-signaling)
+ case _X86_CMP_EQ_US: // 0x18 - Equal (unordered, signaling)
+ return std::make_tuple(false , X86::COND_E);
+ case _X86_CMP_LT_OS: // 0x01 - Less-than (ordered, signaling)
+ case _X86_CMP_LT_OQ: // 0x11 - Less-than (ordered, nonsignaling)
+ return std::make_tuple(true, X86::COND_B);
+ case _X86_CMP_NGE_US: // 0x09 - Not-greater-than-or-equal (unordered, signaling)
+ case _X86_CMP_NGE_UQ: // 0x19 - Not-greater-than-or-equal (unordered, nonsignaling)
+ return std::make_tuple(false , X86::COND_B);
+ case _X86_CMP_LE_OS: // 0x02 - Less-than-or-equal (ordered, signaling)
+ case _X86_CMP_LE_OQ: // 0x12 - Less-than-or-equal (ordered, nonsignaling)
+ return std::make_tuple(true, X86::COND_BE);
+ case _X86_CMP_NGT_US: // 0x0A - Not-greater-than (unordered, signaling)
+ case _X86_CMP_NGT_UQ: // 0x1A - Not-greater-than (unordered, nonsignaling)
+ return std::make_tuple(false, X86::COND_BE);
+ case _X86_CMP_GT_OS: // 0x0E - Greater-than (ordered, signaling)
+ case _X86_CMP_GT_OQ: // 0x1E - Greater-than (ordered, nonsignaling)
+ return std::make_tuple(true, X86::COND_A);
+ case _X86_CMP_NLE_US: // 0x06 - Not-less-than-or-equal (unordered,signaling)
+ case _X86_CMP_NLE_UQ: // 0x16 - Not-less-than-or-equal (unordered, nonsignaling)
+ return std::make_tuple(false, X86::COND_A);
+ case _X86_CMP_GE_OS: // 0x0D - Greater-than-or-equal (ordered, signaling)
+ case _X86_CMP_GE_OQ: // 0x1D - Greater-than-or-equal (ordered, nonsignaling)
+ return std::make_tuple(true, X86::COND_AE);
+ case _X86_CMP_NLT_US: // 0x05 - Not-less-than (unordered, signaling)
+ case _X86_CMP_NLT_UQ: // 0x15 - Not-less-than (unordered, nonsignaling)
+ return std::make_tuple(false, X86::COND_AE);
+ case _X86_CMP_NEQ_OQ: // 0x0C - Not-equal (ordered, non-signaling)
+ case _X86_CMP_NEQ_OS: // 0x1C - Not-equal (ordered, signaling)
+ return std::make_tuple(true, X86::COND_NE);
+ case _X86_CMP_NEQ_UQ: // 0x04 - Not-equal (unordered, nonsignaling)
+ case _X86_CMP_NEQ_US: // 0x14 - Not-equal (unordered, signaling)
+ return std::make_tuple(false, X86::COND_NE);
+ }
}
} // End llvm namespace
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 3415cedc6fea..e186f7039b43 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -92,7 +92,6 @@ namespace llvm {
SmallVector<MCFixup, 4> Fixups;
raw_svector_ostream VecOS(Code);
CodeEmitter->encodeInstruction(Inst, VecOS, Fixups, STI);
- VecOS.flush();
CurrentShadowSize += Code.size();
if (CurrentShadowSize >= RequiredShadowSize)
InShadow = false; // The shadow is big enough. Stop counting.
@@ -128,7 +127,7 @@ MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
/// operand to an MCSymbol.
MCSymbol *X86MCInstLower::
GetSymbolFromOperand(const MachineOperand &MO) const {
- const DataLayout *DL = TM.getDataLayout();
+ const DataLayout &DL = MF.getDataLayout();
assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference");
MCSymbol *Sym = nullptr;
@@ -151,7 +150,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
}
if (!Suffix.empty())
- Name += DL->getPrivateGlobalPrefix();
+ Name += DL.getPrivateGlobalPrefix();
unsigned PrefixLen = Name.size();
@@ -159,7 +158,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
const GlobalValue *GV = MO.getGlobal();
AsmPrinter.getNameWithPrefix(Name, GV);
} else if (MO.isSymbol()) {
- Mangler::getNameWithPrefix(Name, MO.getSymbolName(), *DL);
+ Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL);
} else if (MO.isMBB()) {
assert(Suffix.empty());
Sym = MO.getMBB()->getSymbol();
@@ -461,6 +460,7 @@ ReSimplify:
// Commute operands to get a smaller encoding by using VEX.R instead of VEX.B
// if one of the registers is extended, but other isn't.
+ case X86::VMOVZPQILo2PQIrr:
case X86::VMOVAPDrr:
case X86::VMOVAPDYrr:
case X86::VMOVAPSrr:
@@ -478,18 +478,19 @@ ReSimplify:
unsigned NewOpc;
switch (OutMI.getOpcode()) {
default: llvm_unreachable("Invalid opcode");
- case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break;
- case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break;
- case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break;
- case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break;
- case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break;
- case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break;
- case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break;
- case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break;
- case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break;
- case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break;
- case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break;
- case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break;
+ case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break;
+ case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break;
+ case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break;
+ case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break;
+ case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break;
+ case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break;
+ case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break;
+ case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break;
+ case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break;
+ case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break;
+ case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break;
+ case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break;
+ case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break;
}
OutMI.setOpcode(NewOpc);
}
@@ -532,6 +533,23 @@ ReSimplify:
break;
}
+ case X86::CLEANUPRET: {
+ // Replace CLEANUPRET with the appropriate RET.
+ OutMI = MCInst();
+ OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget()));
+ break;
+ }
+
+ case X86::CATCHRET: {
+ // Replace CATCHRET with the appropriate RET.
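+ // As far as we can tell from the code below, a catch funclet returns the
+ // address at which execution resumes in EAX/RAX, which is why that register
+ // is attached to the RET as an extra operand.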
+ const X86Subtarget &Subtarget = AsmPrinter.getSubtarget();
+ unsigned ReturnReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
+ OutMI = MCInst();
+ OutMI.setOpcode(getRetOpcode(Subtarget));
+ OutMI.addOperand(MCOperand::createReg(ReturnReg));
+ break;
+ }
+
// TAILJMPd, TAILJMPd64 - Lower to the correct jump instructions.
case X86::TAILJMPr:
case X86::TAILJMPd:
@@ -598,17 +616,29 @@ ReSimplify:
case X86::RELEASE_MOV32mi: OutMI.setOpcode(X86::MOV32mi); goto ReSimplify;
case X86::RELEASE_MOV64mi32: OutMI.setOpcode(X86::MOV64mi32); goto ReSimplify;
case X86::RELEASE_ADD8mi: OutMI.setOpcode(X86::ADD8mi); goto ReSimplify;
+ case X86::RELEASE_ADD8mr: OutMI.setOpcode(X86::ADD8mr); goto ReSimplify;
case X86::RELEASE_ADD32mi: OutMI.setOpcode(X86::ADD32mi); goto ReSimplify;
+ case X86::RELEASE_ADD32mr: OutMI.setOpcode(X86::ADD32mr); goto ReSimplify;
case X86::RELEASE_ADD64mi32: OutMI.setOpcode(X86::ADD64mi32); goto ReSimplify;
+ case X86::RELEASE_ADD64mr: OutMI.setOpcode(X86::ADD64mr); goto ReSimplify;
case X86::RELEASE_AND8mi: OutMI.setOpcode(X86::AND8mi); goto ReSimplify;
+ case X86::RELEASE_AND8mr: OutMI.setOpcode(X86::AND8mr); goto ReSimplify;
case X86::RELEASE_AND32mi: OutMI.setOpcode(X86::AND32mi); goto ReSimplify;
+ case X86::RELEASE_AND32mr: OutMI.setOpcode(X86::AND32mr); goto ReSimplify;
case X86::RELEASE_AND64mi32: OutMI.setOpcode(X86::AND64mi32); goto ReSimplify;
+ case X86::RELEASE_AND64mr: OutMI.setOpcode(X86::AND64mr); goto ReSimplify;
case X86::RELEASE_OR8mi: OutMI.setOpcode(X86::OR8mi); goto ReSimplify;
+ case X86::RELEASE_OR8mr: OutMI.setOpcode(X86::OR8mr); goto ReSimplify;
case X86::RELEASE_OR32mi: OutMI.setOpcode(X86::OR32mi); goto ReSimplify;
+ case X86::RELEASE_OR32mr: OutMI.setOpcode(X86::OR32mr); goto ReSimplify;
case X86::RELEASE_OR64mi32: OutMI.setOpcode(X86::OR64mi32); goto ReSimplify;
+ case X86::RELEASE_OR64mr: OutMI.setOpcode(X86::OR64mr); goto ReSimplify;
case X86::RELEASE_XOR8mi: OutMI.setOpcode(X86::XOR8mi); goto ReSimplify;
+ case X86::RELEASE_XOR8mr: OutMI.setOpcode(X86::XOR8mr); goto ReSimplify;
case X86::RELEASE_XOR32mi: OutMI.setOpcode(X86::XOR32mi); goto ReSimplify;
+ case X86::RELEASE_XOR32mr: OutMI.setOpcode(X86::XOR32mr); goto ReSimplify;
case X86::RELEASE_XOR64mi32: OutMI.setOpcode(X86::XOR64mi32); goto ReSimplify;
+ case X86::RELEASE_XOR64mr: OutMI.setOpcode(X86::XOR64mr); goto ReSimplify;
case X86::RELEASE_INC8m: OutMI.setOpcode(X86::INC8m); goto ReSimplify;
case X86::RELEASE_INC16m: OutMI.setOpcode(X86::INC16m); goto ReSimplify;
case X86::RELEASE_INC32m: OutMI.setOpcode(X86::INC32m); goto ReSimplify;
@@ -875,7 +905,10 @@ void X86AsmPrinter::LowerFAULTING_LOAD_OP(const MachineInstr &MI,
MCInst LoadMI;
LoadMI.setOpcode(LoadOpcode);
- LoadMI.addOperand(MCOperand::createReg(LoadDefRegister));
+
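+ // Presumably the faulting load may have no def register (for example when
+ // the loaded value is unused), so only emit the def operand when one is
+ // actually present.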
+ if (LoadDefRegister != X86::NoRegister)
+ LoadMI.addOperand(MCOperand::createReg(LoadDefRegister));
+
for (auto I = MI.operands_begin() + LoadOperandsBeginIdx,
E = MI.operands_end();
I != E; ++I)
@@ -1062,6 +1095,18 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
X86ATTInstPrinter::getRegisterName(Reg));
break;
}
+ case X86::CLEANUPRET: {
+ // Lower these as normal, but add some comments.
+ OutStreamer->AddComment("CLEANUPRET");
+ break;
+ }
+
+ case X86::CATCHRET: {
+ // Lower these as normal, but add some comments.
+ OutStreamer->AddComment("CATCHRET");
+ break;
+ }
+
case X86::TAILJMPr:
case X86::TAILJMPm:
case X86::TAILJMPd:
@@ -1095,12 +1140,30 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
EmitAndCountInstruction(MCInstBuilder(X86::CALLpcrel32)
.addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
+ const X86FrameLowering* FrameLowering =
+ MF->getSubtarget<X86Subtarget>().getFrameLowering();
+ bool hasFP = FrameLowering->hasFP(*MF);
+
+ // TODO: This is needed only if we require precise CFA.
+ bool HasActiveDwarfFrame = OutStreamer->getNumFrameInfos() &&
+ !OutStreamer->getDwarfFrameInfos().back().End;
+
+ int stackGrowth = -RI->getSlotSize();
+
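+ // The CALL used to materialize the PIC base pushes a return address. When
+ // the CFA is SP-relative (no frame pointer), adjust the CFA offset by one
+ // slot here and restore it after the matching POP below.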
+ if (HasActiveDwarfFrame && !hasFP) {
+ OutStreamer->EmitCFIAdjustCfaOffset(-stackGrowth);
+ }
+
// Emit the label.
OutStreamer->EmitLabel(PICBase);
// popl $reg
EmitAndCountInstruction(MCInstBuilder(X86::POP32r)
.addReg(MI->getOperand(0).getReg()));
+
+ if (HasActiveDwarfFrame && !hasFP) {
+ OutStreamer->EmitCFIAdjustCfaOffset(stackGrowth);
+ }
return;
}
@@ -1206,19 +1269,48 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
- // Lower PSHUFB and VPERMILP normally but add a comment if we can find
- // a constant shuffle mask. We won't be able to do this at the MC layer
- // because the mask isn't an immediate.
+ // Lower PSHUFB and VPERMILP normally but add a comment if we can find
+ // a constant shuffle mask. We won't be able to do this at the MC layer
+ // because the mask isn't an immediate.
case X86::PSHUFBrm:
case X86::VPSHUFBrm:
- case X86::VPSHUFBYrm: {
+ case X86::VPSHUFBYrm:
+ case X86::VPSHUFBZ128rm:
+ case X86::VPSHUFBZ128rmk:
+ case X86::VPSHUFBZ128rmkz:
+ case X86::VPSHUFBZ256rm:
+ case X86::VPSHUFBZ256rmk:
+ case X86::VPSHUFBZ256rmkz:
+ case X86::VPSHUFBZrm:
+ case X86::VPSHUFBZrmk:
+ case X86::VPSHUFBZrmkz: {
if (!OutStreamer->isVerboseAsm())
break;
- assert(MI->getNumOperands() > 5 &&
- "We should always have at least 5 operands!");
+ unsigned SrcIdx, MaskIdx;
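+ // The AVX-512 masked forms carry extra operands (the mask register, plus a
+ // tied pass-through source for merge masking), so the vector source and the
+ // constant-pool mask operand sit at different indices per opcode.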
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::PSHUFBrm:
+ case X86::VPSHUFBrm:
+ case X86::VPSHUFBYrm:
+ case X86::VPSHUFBZ128rm:
+ case X86::VPSHUFBZ256rm:
+ case X86::VPSHUFBZrm:
+ SrcIdx = 1; MaskIdx = 5; break;
+ case X86::VPSHUFBZ128rmkz:
+ case X86::VPSHUFBZ256rmkz:
+ case X86::VPSHUFBZrmkz:
+ SrcIdx = 2; MaskIdx = 6; break;
+ case X86::VPSHUFBZ128rmk:
+ case X86::VPSHUFBZ256rmk:
+ case X86::VPSHUFBZrmk:
+ SrcIdx = 3; MaskIdx = 7; break;
+ }
+
+ assert(MI->getNumOperands() >= 6 &&
+ "We should always have at least 6 operands!");
const MachineOperand &DstOp = MI->getOperand(0);
- const MachineOperand &SrcOp = MI->getOperand(1);
- const MachineOperand &MaskOp = MI->getOperand(5);
+ const MachineOperand &SrcOp = MI->getOperand(SrcIdx);
+ const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
if (auto *C = getConstantFromPool(*MI, MaskOp)) {
SmallVector<int, 16> Mask;
@@ -1240,35 +1332,53 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MachineOperand &SrcOp = MI->getOperand(1);
const MachineOperand &MaskOp = MI->getOperand(5);
+ unsigned ElSize;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VPERMILPSrm: case X86::VPERMILPSYrm: ElSize = 32; break;
+ case X86::VPERMILPDrm: case X86::VPERMILPDYrm: ElSize = 64; break;
+ }
+
if (auto *C = getConstantFromPool(*MI, MaskOp)) {
SmallVector<int, 16> Mask;
- DecodeVPERMILPMask(C, Mask);
+ DecodeVPERMILPMask(C, ElSize, Mask);
if (!Mask.empty())
OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, Mask));
}
break;
}
- // For loads from a constant pool to a vector register, print the constant
- // loaded.
- case X86::MOVAPDrm:
- case X86::VMOVAPDrm:
- case X86::VMOVAPDYrm:
- case X86::MOVUPDrm:
- case X86::VMOVUPDrm:
- case X86::VMOVUPDYrm:
- case X86::MOVAPSrm:
- case X86::VMOVAPSrm:
- case X86::VMOVAPSYrm:
- case X86::MOVUPSrm:
- case X86::VMOVUPSrm:
- case X86::VMOVUPSYrm:
- case X86::MOVDQArm:
- case X86::VMOVDQArm:
- case X86::VMOVDQAYrm:
- case X86::MOVDQUrm:
- case X86::VMOVDQUrm:
- case X86::VMOVDQUYrm:
+#define MOV_CASE(Prefix, Suffix) \
+ case X86::Prefix##MOVAPD##Suffix##rm: \
+ case X86::Prefix##MOVAPS##Suffix##rm: \
+ case X86::Prefix##MOVUPD##Suffix##rm: \
+ case X86::Prefix##MOVUPS##Suffix##rm: \
+ case X86::Prefix##MOVDQA##Suffix##rm: \
+ case X86::Prefix##MOVDQU##Suffix##rm:
+
+#define MOV_AVX512_CASE(Suffix) \
+ case X86::VMOVDQA64##Suffix##rm: \
+ case X86::VMOVDQA32##Suffix##rm: \
+ case X86::VMOVDQU64##Suffix##rm: \
+ case X86::VMOVDQU32##Suffix##rm: \
+ case X86::VMOVDQU16##Suffix##rm: \
+ case X86::VMOVDQU8##Suffix##rm: \
+ case X86::VMOVAPS##Suffix##rm: \
+ case X86::VMOVAPD##Suffix##rm: \
+ case X86::VMOVUPS##Suffix##rm: \
+ case X86::VMOVUPD##Suffix##rm:
+
+#define CASE_ALL_MOV_RM() \
+ MOV_CASE(, ) /* SSE */ \
+ MOV_CASE(V, ) /* AVX-128 */ \
+ MOV_CASE(V, Y) /* AVX-256 */ \
+ MOV_AVX512_CASE(Z) \
+ MOV_AVX512_CASE(Z256) \
+ MOV_AVX512_CASE(Z128)
+
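+ // The macros above expand to every full-width vector load-from-memory
+ // opcode: SSE, AVX 128/256-bit, and the AVX-512 Z/Z256/Z128 forms, so the
+ // constant-pool comment below is emitted for all of them.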
+ // For loads from a constant pool to a vector register, print the constant
+ // loaded.
+ CASE_ALL_MOV_RM()
if (!OutStreamer->isVerboseAsm())
break;
if (MI->getNumOperands() > 4)
@@ -1302,7 +1412,19 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
if (isa<UndefValue>(COp)) {
CS << "u";
} else if (auto *CI = dyn_cast<ConstantInt>(COp)) {
- CS << CI->getZExtValue();
+ if (CI->getBitWidth() <= 64) {
+ CS << CI->getZExtValue();
+ } else {
+ // Print a multi-word constant as (w0,w1,...), i.e. its raw 64-bit words,
+ // least-significant word first.
+ auto Val = CI->getValue();
+ CS << "(";
+ for (int i = 0, N = Val.getNumWords(); i < N; ++i) {
+ if (i > 0)
+ CS << ",";
+ CS << Val.getRawData()[i];
+ }
+ CS << ")";
+ }
} else if (auto *CF = dyn_cast<ConstantFP>(COp)) {
SmallString<32> Str;
CF->getValueAPF().toString(Str);
diff --git a/lib/Target/X86/X86MachineFunctionInfo.cpp b/lib/Target/X86/X86MachineFunctionInfo.cpp
index ac2cdc8c6567..c9e636f1eb00 100644
--- a/lib/Target/X86/X86MachineFunctionInfo.cpp
+++ b/lib/Target/X86/X86MachineFunctionInfo.cpp
@@ -1,4 +1,4 @@
-//===-- X86MachineFuctionInfo.cpp - X86 machine function info -------------===//
+//===-- X86MachineFunctionInfo.cpp - X86 machine function info ------------===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h
index e6db9708b677..3a7a98db50f4 100644
--- a/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/lib/Target/X86/X86MachineFunctionInfo.h
@@ -1,4 +1,4 @@
-//===-- X86MachineFuctionInfo.h - X86 machine function info -----*- C++ -*-===//
+//===-- X86MachineFunctionInfo.h - X86 machine function info ----*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -84,8 +84,8 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
/// of pushes to pass function parameters.
bool HasPushSequences = false;
- /// True if the function uses llvm.x86.seh.restoreframe, and it needed a spill
- /// slot for the frame pointer.
+ /// True if the function recovers from an SEH exception, and therefore needs
+ /// to spill and restore the frame pointer.
bool HasSEHFramePtrSave = false;
/// The frame index of a stack object containing the original frame pointer
@@ -100,7 +100,7 @@ private:
public:
X86MachineFunctionInfo() = default;
- explicit X86MachineFunctionInfo(MachineFunction &MF) {};
+ explicit X86MachineFunctionInfo(MachineFunction &MF) {}
bool getForceFramePointer() const { return ForceFramePointer;}
void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
diff --git a/lib/Target/X86/X86OptimizeLEAs.cpp b/lib/Target/X86/X86OptimizeLEAs.cpp
new file mode 100644
index 000000000000..58020d909a43
--- /dev/null
+++ b/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -0,0 +1,326 @@
+//===-- X86OptimizeLEAs.cpp - optimize usage of LEA instructions ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass that performs some optimizations with LEA
+// instructions in order to improve code size.
+// Currently, it does one thing:
+// 1) Address calculations in load and store instructions are replaced by
+// existing LEA def registers where possible.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-optimize-LEAs"
+
+static cl::opt<bool> EnableX86LEAOpt("enable-x86-lea-opt", cl::Hidden,
+ cl::desc("X86: Enable LEA optimizations."),
+ cl::init(false));
+
+STATISTIC(NumSubstLEAs, "Number of LEA instruction substitutions");
+
+namespace {
+class OptimizeLEAPass : public MachineFunctionPass {
+public:
+ OptimizeLEAPass() : MachineFunctionPass(ID) {}
+
+ const char *getPassName() const override { return "X86 LEA Optimize"; }
+
+ /// \brief Loop over all of the basic blocks, replacing address
+ /// calculations in load and store instructions when they have already
+ /// been computed by an LEA. Also, remove redundant LEAs.
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ /// \brief Returns the distance between two instructions inside one basic
+ /// block. A negative result means that the instructions occur in reverse
+ /// order.
+ int calcInstrDist(const MachineInstr &First, const MachineInstr &Last);
+
+ /// \brief Choose the best \p LEA instruction from the \p List to replace
+ /// the address calculation in the \p MI instruction. Return the address
+ /// displacement and the distance between \p MI and the chosen \p LEA in
+ /// \p AddrDispShift and \p Dist.
+ bool chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List,
+ const MachineInstr &MI, MachineInstr *&LEA,
+ int64_t &AddrDispShift, int &Dist);
+
+ /// \brief Returns true if two machine operands are identical and they are
+ /// not physical registers.
+ bool isIdenticalOp(const MachineOperand &MO1, const MachineOperand &MO2);
+
+ /// \brief Returns true if the instruction is LEA.
+ bool isLEA(const MachineInstr &MI);
+
+ /// \brief Returns true if two instructions have memory operands that only
+ /// differ by displacement. The numbers of the first memory operands for both
+ /// instructions are specified through \p N1 and \p N2. The address
+ /// displacement is returned through AddrDispShift.
+ bool isSimilarMemOp(const MachineInstr &MI1, unsigned N1,
+ const MachineInstr &MI2, unsigned N2,
+ int64_t &AddrDispShift);
+
+ /// \brief Find all LEA instructions in the basic block.
+ void findLEAs(const MachineBasicBlock &MBB,
+ SmallVectorImpl<MachineInstr *> &List);
+
+ /// \brief Removes redundant address calculations.
+ bool removeRedundantAddrCalc(const SmallVectorImpl<MachineInstr *> &List);
+
+ MachineRegisterInfo *MRI;
+ const X86InstrInfo *TII;
+ const X86RegisterInfo *TRI;
+
+ static char ID;
+};
+char OptimizeLEAPass::ID = 0;
+}
+
+FunctionPass *llvm::createX86OptimizeLEAs() { return new OptimizeLEAPass(); }
+
+int OptimizeLEAPass::calcInstrDist(const MachineInstr &First,
+ const MachineInstr &Last) {
+ const MachineBasicBlock *MBB = First.getParent();
+
+ // Both instructions must be in the same basic block.
+ assert(Last.getParent() == MBB &&
+ "Instructions are in different basic blocks");
+
+ return std::distance(MBB->begin(), MachineBasicBlock::const_iterator(&Last)) -
+ std::distance(MBB->begin(), MachineBasicBlock::const_iterator(&First));
+}
+
+// Find the best LEA instruction in the List to replace the address
+// recalculation in MI. Such an LEA must meet these requirements:
+// 1) The address calculated by the LEA differs only by the displacement from
+// the address used in MI.
+// 2) The register class of the definition of the LEA is compatible with the
+// register class of the address base register of MI.
+// 3) Displacement of the new memory operand should fit in 1 byte if possible.
+// 4) The LEA should be as close to MI as possible, and prior to it if
+// possible.
+bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List,
+ const MachineInstr &MI, MachineInstr *&LEA,
+ int64_t &AddrDispShift, int &Dist) {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const MCInstrDesc &Desc = MI.getDesc();
+ int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, MI.getOpcode()) +
+ X86II::getOperandBias(Desc);
+
+ LEA = nullptr;
+
+ // Loop over all LEA instructions.
+ for (auto DefMI : List) {
+ int64_t AddrDispShiftTemp = 0;
+
+ // Compare instructions memory operands.
+ if (!isSimilarMemOp(MI, MemOpNo, *DefMI, 1, AddrDispShiftTemp))
+ continue;
+
+ // Make sure address displacement fits 4 bytes.
+ if (!isInt<32>(AddrDispShiftTemp))
+ continue;
+
+ // Check that the LEA def register can be used as the MI address base. Some
+ // instructions can only use a limited set of registers as the address
+ // base, for example MOV8mr_NOREX. We could constrain the register class of
+ // the LEA def to suit MI; however, since this case is very rare and hard
+ // to reproduce in a test, it is simpler and more reliable to skip the LEA.
+ if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg, TRI, *MF) !=
+ MRI->getRegClass(DefMI->getOperand(0).getReg()))
+ continue;
+
+ // Choose the closest LEA instruction from the list, prior to MI if
+ // possible. Note that the resulting address displacement is taken into
+ // account as well. Also note that the list is sorted by the order in which
+ // the LEAs occur, so the break condition is simple.
+ int DistTemp = calcInstrDist(*DefMI, MI);
+ assert(DistTemp != 0 &&
+ "The distance between two different instructions cannot be zero");
+ if (DistTemp > 0 || LEA == nullptr) {
+ // Do not update the returned LEA if the current one provides a
+ // displacement that fits in 1 byte while the new candidate does not.
+ if (LEA != nullptr && !isInt<8>(AddrDispShiftTemp) &&
+ isInt<8>(AddrDispShift))
+ continue;
+
+ LEA = DefMI;
+ AddrDispShift = AddrDispShiftTemp;
+ Dist = DistTemp;
+ }
+
+ // FIXME: Maybe we should not always stop at the first LEA after MI.
+ if (DistTemp < 0)
+ break;
+ }
+
+ return LEA != nullptr;
+}
+
+bool OptimizeLEAPass::isIdenticalOp(const MachineOperand &MO1,
+ const MachineOperand &MO2) {
+ return MO1.isIdenticalTo(MO2) &&
+ (!MO1.isReg() ||
+ !TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
+}
+
+bool OptimizeLEAPass::isLEA(const MachineInstr &MI) {
+ unsigned Opcode = MI.getOpcode();
+ return Opcode == X86::LEA16r || Opcode == X86::LEA32r ||
+ Opcode == X86::LEA64r || Opcode == X86::LEA64_32r;
+}
+
+// Check if MI1 and MI2 have memory operands which represent addresses that
+// differ only by displacement.
+bool OptimizeLEAPass::isSimilarMemOp(const MachineInstr &MI1, unsigned N1,
+ const MachineInstr &MI2, unsigned N2,
+ int64_t &AddrDispShift) {
+ // Address base, scale, index and segment operands must be identical.
+ static const int IdenticalOpNums[] = {X86::AddrBaseReg, X86::AddrScaleAmt,
+ X86::AddrIndexReg, X86::AddrSegmentReg};
+ for (auto &N : IdenticalOpNums)
+ if (!isIdenticalOp(MI1.getOperand(N1 + N), MI2.getOperand(N2 + N)))
+ return false;
+
+ // Address displacement operands may differ by a constant.
+ const MachineOperand *Op1 = &MI1.getOperand(N1 + X86::AddrDisp);
+ const MachineOperand *Op2 = &MI2.getOperand(N2 + X86::AddrDisp);
+ if (!isIdenticalOp(*Op1, *Op2)) {
+ if (Op1->isImm() && Op2->isImm())
+ AddrDispShift = Op1->getImm() - Op2->getImm();
+ else if (Op1->isGlobal() && Op2->isGlobal() &&
+ Op1->getGlobal() == Op2->getGlobal())
+ AddrDispShift = Op1->getOffset() - Op2->getOffset();
+ else
+ return false;
+ }
+
+ return true;
+}
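+
+// Illustrative example (not part of the pass): a load from
+// [rbx + 4*rcx + 16] and an LEA of [rbx + 4*rcx + 8] are "similar" here,
+// with AddrDispShift = 16 - 8 = 8, so removeRedundantAddrCalc can later
+// rewrite the load to use [lea_def + 8].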
+
+void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB,
+ SmallVectorImpl<MachineInstr *> &List) {
+ for (auto &MI : MBB) {
+ if (isLEA(MI))
+ List.push_back(const_cast<MachineInstr *>(&MI));
+ }
+}
+
+// Try to find load and store instructions which recalculate addresses already
+// calculated by some LEA, and replace their memory operands with that LEA's
+// def register.
+bool OptimizeLEAPass::removeRedundantAddrCalc(
+ const SmallVectorImpl<MachineInstr *> &List) {
+ bool Changed = false;
+
+ assert(List.size() > 0);
+ MachineBasicBlock *MBB = List[0]->getParent();
+
+ // Process all instructions in basic block.
+ for (auto I = MBB->begin(), E = MBB->end(); I != E;) {
+ MachineInstr &MI = *I++;
+ unsigned Opcode = MI.getOpcode();
+
+ // Instruction must be load or store.
+ if (!MI.mayLoadOrStore())
+ continue;
+
+ // Get the number of the first memory operand.
+ const MCInstrDesc &Desc = MI.getDesc();
+ int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, Opcode);
+
+ // If instruction has no memory operand - skip it.
+ if (MemOpNo < 0)
+ continue;
+
+ MemOpNo += X86II::getOperandBias(Desc);
+
+ // Get the best LEA instruction to replace address calculation.
+ MachineInstr *DefMI;
+ int64_t AddrDispShift;
+ int Dist;
+ if (!chooseBestLEA(List, MI, DefMI, AddrDispShift, Dist))
+ continue;
+
+ // If the LEA occurs before the current instruction, we can replace the
+ // instruction's operands directly. If it occurs after, we can hoist the
+ // LEA above the instruction and then replace them. Since the LEA and the
+ // instruction have similar memory operands (and thus the same def
+ // instructions for those operands), this is always safe: no register is
+ // used before its def.
+ if (Dist < 0) {
+ DefMI->removeFromParent();
+ MBB->insert(MachineBasicBlock::iterator(&MI), DefMI);
+ }
+
+ // Since we can possibly extend register lifetime, clear kill flags.
+ MRI->clearKillFlags(DefMI->getOperand(0).getReg());
+
+ ++NumSubstLEAs;
+ DEBUG(dbgs() << "OptimizeLEAs: Candidate to replace: "; MI.dump(););
+
+ // Change instruction operands.
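+ // The memory operand becomes [LEA_def + AddrDispShift]: base = the LEA
+ // def register, scale 1, no index register and no segment override.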
+ MI.getOperand(MemOpNo + X86::AddrBaseReg)
+ .ChangeToRegister(DefMI->getOperand(0).getReg(), false);
+ MI.getOperand(MemOpNo + X86::AddrScaleAmt).ChangeToImmediate(1);
+ MI.getOperand(MemOpNo + X86::AddrIndexReg)
+ .ChangeToRegister(X86::NoRegister, false);
+ MI.getOperand(MemOpNo + X86::AddrDisp).ChangeToImmediate(AddrDispShift);
+ MI.getOperand(MemOpNo + X86::AddrSegmentReg)
+ .ChangeToRegister(X86::NoRegister, false);
+
+ DEBUG(dbgs() << "OptimizeLEAs: Replaced by: "; MI.dump(););
+
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) {
+ bool Changed = false;
+
+ // Perform this optimization only if we care about code size.
+ if (!EnableX86LEAOpt || !MF.getFunction()->optForSize())
+ return false;
+
+ MRI = &MF.getRegInfo();
+ TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+ TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
+
+ // Process all basic blocks.
+ for (auto &MBB : MF) {
+ SmallVector<MachineInstr *, 16> LEAs;
+
+ // Find all LEA instructions in basic block.
+ findLEAs(MBB, LEAs);
+
+ // If current basic block has no LEAs, move on to the next one.
+ if (LEAs.empty())
+ continue;
+
+ // Remove redundant address calculations.
+ Changed |= removeRedundantAddrCalc(LEAs);
+ }
+
+ return Changed;
+}
diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp
index 143e70bda9e7..0f425e28fa7d 100644
--- a/lib/Target/X86/X86PadShortFunction.cpp
+++ b/lib/Target/X86/X86PadShortFunction.cpp
@@ -93,8 +93,7 @@ FunctionPass *llvm::createX86PadShortFunctions() {
/// runOnMachineFunction - Loop over all of the basic blocks, inserting
/// NOOP instructions before early exits.
bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
- if (MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) ||
- MF.getFunction()->hasFnAttribute(Attribute::MinSize)) {
+ if (MF.getFunction()->optForSize()) {
return false;
}
@@ -107,7 +106,7 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
// Search through basic blocks and mark the ones that have early returns
ReturnBBs.clear();
VisitedBBs.clear();
- findReturns(MF.begin());
+ findReturns(&MF.front());
bool MadeChange = false;
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index d8495e53e0e3..58404433e1ae 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -27,7 +27,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
@@ -44,12 +43,6 @@ using namespace llvm;
#define GET_REGINFO_TARGET_DESC
#include "X86GenRegisterInfo.inc"
-cl::opt<bool>
-ForceStackAlign("force-align-stack",
- cl::desc("Force align the stack to the minimum alignment"
- " needed for the function."),
- cl::init(false), cl::Hidden);
-
static cl::opt<bool>
EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true),
cl::desc("Enable use of a base pointer for complex stack frames"));
@@ -174,21 +167,34 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
if (Subtarget.isTarget64BitLP64())
return &X86::GR64_NOSPRegClass;
return &X86::GR32_NOSPRegClass;
- case 2: // Available for tailcall (not callee-saved GPRs).
- const Function *F = MF.getFunction();
- if (IsWin64 || (F && F->getCallingConv() == CallingConv::X86_64_Win64))
- return &X86::GR64_TCW64RegClass;
- else if (Is64Bit)
- return &X86::GR64_TCRegClass;
-
- bool hasHipeCC = (F ? F->getCallingConv() == CallingConv::HiPE : false);
- if (hasHipeCC)
- return &X86::GR32RegClass;
- return &X86::GR32_TCRegClass;
+ case 2: // NOREX GPRs.
+ if (Subtarget.isTarget64BitLP64())
+ return &X86::GR64_NOREXRegClass;
+ return &X86::GR32_NOREXRegClass;
+ case 3: // NOREX GPRs except the stack pointer (for encoding reasons).
+ if (Subtarget.isTarget64BitLP64())
+ return &X86::GR64_NOREX_NOSPRegClass;
+ return &X86::GR32_NOREX_NOSPRegClass;
+ case 4: // Available for tailcall (not callee-saved GPRs).
+ return getGPRsForTailCall(MF);
}
}
const TargetRegisterClass *
+X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const {
+ const Function *F = MF.getFunction();
+ if (IsWin64 || (F && F->getCallingConv() == CallingConv::X86_64_Win64))
+ return &X86::GR64_TCW64RegClass;
+ else if (Is64Bit)
+ return &X86::GR64_TCRegClass;
+
+ bool hasHipeCC = (F ? F->getCallingConv() == CallingConv::HiPE : false);
+ if (hasHipeCC)
+ return &X86::GR32RegClass;
+ return &X86::GR32_TCRegClass;
+}
+
+const TargetRegisterClass *
X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
if (RC == &X86::CCRRegClass) {
if (Is64Bit)
@@ -222,6 +228,7 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
const MCPhysReg *
X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const X86Subtarget &Subtarget = MF->getSubtarget<X86Subtarget>();
+ bool HasSSE = Subtarget.hasSSE1();
bool HasAVX = Subtarget.hasAVX();
bool HasAVX512 = Subtarget.hasAVX512();
bool CallsEHReturn = MF->getMMI().callsEHReturn();
@@ -241,6 +248,10 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
if (HasAVX)
return CSR_64_RT_AllRegs_AVX_SaveList;
return CSR_64_RT_AllRegs_SaveList;
+ case CallingConv::CXX_FAST_TLS:
+ if (Is64Bit)
+ return CSR_64_TLS_Darwin_SaveList;
+ break;
case CallingConv::Intel_OCL_BI: {
if (HasAVX512 && IsWin64)
return CSR_Win64_Intel_OCL_BI_AVX512_SaveList;
@@ -254,6 +265,8 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return CSR_64_Intel_OCL_BI_SaveList;
break;
}
+ case CallingConv::HHVM:
+ return CSR_64_HHVM_SaveList;
case CallingConv::Cold:
if (Is64Bit)
return CSR_64_MostRegs_SaveList;
@@ -264,6 +277,18 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
if (CallsEHReturn)
return CSR_64EHRet_SaveList;
return CSR_64_SaveList;
+ case CallingConv::X86_INTR:
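+ // Interrupt and exception handlers must preserve every register they
+ // touch, so use the AllRegs save lists (SSE/AVX variants when those
+ // registers exist).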
+ if (Is64Bit) {
+ if (HasAVX)
+ return CSR_64_AllRegs_AVX_SaveList;
+ else
+ return CSR_64_AllRegs_SaveList;
+ } else {
+ if (HasSSE)
+ return CSR_32_AllRegs_SSE_SaveList;
+ else
+ return CSR_32_AllRegs_SaveList;
+ }
default:
break;
}
@@ -284,6 +309,7 @@ const uint32_t *
X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
+ bool HasSSE = Subtarget.hasSSE1();
bool HasAVX = Subtarget.hasAVX();
bool HasAVX512 = Subtarget.hasAVX512();
@@ -301,6 +327,10 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
if (HasAVX)
return CSR_64_RT_AllRegs_AVX_RegMask;
return CSR_64_RT_AllRegs_RegMask;
+ case CallingConv::CXX_FAST_TLS:
+ if (Is64Bit)
+ return CSR_64_TLS_Darwin_RegMask;
+ break;
case CallingConv::Intel_OCL_BI: {
if (HasAVX512 && IsWin64)
return CSR_Win64_Intel_OCL_BI_AVX512_RegMask;
@@ -314,16 +344,30 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
return CSR_64_Intel_OCL_BI_RegMask;
break;
}
+ case CallingConv::HHVM:
+ return CSR_64_HHVM_RegMask;
case CallingConv::Cold:
if (Is64Bit)
return CSR_64_MostRegs_RegMask;
break;
- default:
- break;
case CallingConv::X86_64_Win64:
return CSR_Win64_RegMask;
case CallingConv::X86_64_SysV:
return CSR_64_RegMask;
+ case CallingConv::X86_INTR:
+ if (Is64Bit) {
+ if (HasAVX)
+ return CSR_64_AllRegs_AVX_RegMask;
+ else
+ return CSR_64_AllRegs_RegMask;
+ } else {
+ if (HasSSE)
+ return CSR_32_AllRegs_SSE_RegMask;
+ else
+ return CSR_32_AllRegs_RegMask;
+ }
+ default:
+ break;
}
// Unlike getCalleeSavedRegs(), we don't have MMI so we can't check
@@ -341,6 +385,10 @@ X86RegisterInfo::getNoPreservedMask() const {
return CSR_NoRegs_RegMask;
}
+const uint32_t *X86RegisterInfo::getDarwinTLSCallPreservedMask() const {
+ return CSR_64_TLS_Darwin_RegMask;
+}
+
BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
const X86FrameLowering *TFI = getFrameLowering(MF);
@@ -371,8 +419,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
"Stack realignment in presence of dynamic allocas is not supported with"
"this calling convention.");
- unsigned BasePtr = getX86SubSuperRegister(getBaseRegister(), MVT::i64,
- false);
+ unsigned BasePtr = getX86SubSuperRegister(getBaseRegister(), 64);
for (MCSubRegIterator I(BasePtr, this, /*IncludeSelf=*/true);
I.isValid(); ++I)
Reserved.set(*I);
@@ -439,6 +486,10 @@ void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const {
// Stack Frame Processing methods
//===----------------------------------------------------------------------===//
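+// The stack pointer cannot serve as a frame base when the frame contains
+// variable-sized objects or opaque SP adjustments (for example, inline
+// assembly that moves SP), hence this small helper shared by hasBasePointer
+// and canRealignStack below.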
+static bool CantUseSP(const MachineFrameInfo *MFI) {
+ return MFI->hasVarSizedObjects() || MFI->hasOpaqueSPAdjustment();
+}
+
bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -451,13 +502,11 @@ bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
// reference locals while also adjusting the stack pointer. When we can't
// use both the SP and the FP, we need a separate base pointer register.
bool CantUseFP = needsStackRealignment(MF);
- bool CantUseSP =
- MFI->hasVarSizedObjects() || MFI->hasOpaqueSPAdjustment();
- return CantUseFP && CantUseSP;
+ return CantUseFP && CantUseSP(MFI);
}
bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
- if (MF.getFunction()->hasFnAttribute("no-realign-stack"))
+ if (!TargetRegisterInfo::canRealignStack(MF))
return false;
const MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -470,26 +519,11 @@ bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
   // If a base pointer is necessary, check that it isn't too late to reserve
// it.
- if (MFI->hasVarSizedObjects())
+ if (CantUseSP(MFI))
return MRI->canReserveReg(BasePtr);
return true;
}
-bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
- const MachineFrameInfo *MFI = MF.getFrameInfo();
- const X86FrameLowering *TFI = getFrameLowering(MF);
- const Function *F = MF.getFunction();
- unsigned StackAlign = TFI->getStackAlignment();
- bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) ||
- F->hasFnAttribute(Attribute::StackAlignment));
-
- // If we've requested that we force align the stack do so now.
- if (ForceStackAlign)
- return canRealignStack(MF);
-
- return requiresRealignment && canRealignStack(MF);
-}
-
bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
unsigned Reg, int &FrameIdx) const {
// Since X86 defines assignCalleeSavedSpillSlots which always return true
@@ -510,6 +544,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
unsigned Opc = MI.getOpcode();
bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm ||
Opc == X86::TCRETURNmi || Opc == X86::TCRETURNmi64;
+
if (hasBasePointer(MF))
BasePtr = (FrameIndex < 0 ? FramePtr : getBaseRegister());
else if (needsStackRealignment(MF))
@@ -524,14 +559,11 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// offset is from the traditional base pointer location. On 64-bit, the
// offset is from the SP at the end of the prologue, not the FP location. This
// matches the behavior of llvm.frameaddress.
+ unsigned IgnoredFrameReg;
if (Opc == TargetOpcode::LOCAL_ESCAPE) {
MachineOperand &FI = MI.getOperand(FIOperandNum);
- bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
int Offset;
- if (IsWinEH)
- Offset = TFI->getFrameIndexOffsetFromSP(MF, FrameIndex);
- else
- Offset = TFI->getFrameIndexOffset(MF, FrameIndex);
+ Offset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
FI.ChangeToImmediate(Offset);
return;
}
@@ -540,7 +572,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// register as source operand, semantic is the same and destination is
// 32-bits. It saves one byte per lea in code since 0x67 prefix is avoided.
if (Opc == X86::LEA64_32r && X86::GR32RegClass.contains(BasePtr))
- BasePtr = getX86SubSuperRegister(BasePtr, MVT::i64, false);
+ BasePtr = getX86SubSuperRegister(BasePtr, 64);
// This must be part of a four operand memory reference. Replace the
// FrameIndex with base register with EBP. Add an offset to the offset.
@@ -553,7 +585,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
const MachineFrameInfo *MFI = MF.getFrameInfo();
FIOffset = MFI->getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea();
} else
- FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex);
+ FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
if (BasePtr == StackPtr)
FIOffset += SPAdj;
@@ -592,193 +624,11 @@ X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const {
const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
unsigned FrameReg = getFrameRegister(MF);
if (Subtarget.isTarget64BitILP32())
- FrameReg = getX86SubSuperRegister(FrameReg, MVT::i32, false);
+ FrameReg = getX86SubSuperRegister(FrameReg, 32);
return FrameReg;
}
-namespace llvm {
-unsigned getX86SubSuperRegisterOrZero(unsigned Reg, MVT::SimpleValueType VT,
- bool High) {
- switch (VT) {
- default: return 0;
- case MVT::i8:
- if (High) {
- switch (Reg) {
- default: return getX86SubSuperRegister(Reg, MVT::i64);
- case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
- return X86::SI;
- case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
- return X86::DI;
- case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
- return X86::BP;
- case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
- return X86::SP;
- case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
- return X86::AH;
- case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
- return X86::DH;
- case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
- return X86::CH;
- case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
- return X86::BH;
- }
- } else {
- switch (Reg) {
- default: return 0;
- case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
- return X86::AL;
- case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
- return X86::DL;
- case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
- return X86::CL;
- case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
- return X86::BL;
- case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
- return X86::SIL;
- case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
- return X86::DIL;
- case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
- return X86::BPL;
- case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
- return X86::SPL;
- case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
- return X86::R8B;
- case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
- return X86::R9B;
- case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
- return X86::R10B;
- case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
- return X86::R11B;
- case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
- return X86::R12B;
- case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
- return X86::R13B;
- case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
- return X86::R14B;
- case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
- return X86::R15B;
- }
- }
- case MVT::i16:
- switch (Reg) {
- default: return 0;
- case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
- return X86::AX;
- case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
- return X86::DX;
- case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
- return X86::CX;
- case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
- return X86::BX;
- case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
- return X86::SI;
- case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
- return X86::DI;
- case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
- return X86::BP;
- case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
- return X86::SP;
- case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
- return X86::R8W;
- case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
- return X86::R9W;
- case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
- return X86::R10W;
- case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
- return X86::R11W;
- case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
- return X86::R12W;
- case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
- return X86::R13W;
- case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
- return X86::R14W;
- case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
- return X86::R15W;
- }
- case MVT::i32:
- switch (Reg) {
- default: return 0;
- case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
- return X86::EAX;
- case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
- return X86::EDX;
- case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
- return X86::ECX;
- case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
- return X86::EBX;
- case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
- return X86::ESI;
- case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
- return X86::EDI;
- case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
- return X86::EBP;
- case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
- return X86::ESP;
- case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
- return X86::R8D;
- case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
- return X86::R9D;
- case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
- return X86::R10D;
- case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
- return X86::R11D;
- case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
- return X86::R12D;
- case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
- return X86::R13D;
- case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
- return X86::R14D;
- case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
- return X86::R15D;
- }
- case MVT::i64:
- switch (Reg) {
- default: return 0;
- case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
- return X86::RAX;
- case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
- return X86::RDX;
- case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
- return X86::RCX;
- case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
- return X86::RBX;
- case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
- return X86::RSI;
- case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
- return X86::RDI;
- case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
- return X86::RBP;
- case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
- return X86::RSP;
- case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
- return X86::R8;
- case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
- return X86::R9;
- case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
- return X86::R10;
- case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
- return X86::R11;
- case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
- return X86::R12;
- case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
- return X86::R13;
- case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
- return X86::R14;
- case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
- return X86::R15;
- }
- }
-}
-
-unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT,
- bool High) {
- unsigned Res = getX86SubSuperRegisterOrZero(Reg, VT, High);
- if (Res == 0)
- llvm_unreachable("Unexpected register or VT");
- return Res;
-}
-
-unsigned get512BitSuperRegister(unsigned Reg) {
+unsigned llvm::get512BitSuperRegister(unsigned Reg) {
if (Reg >= X86::XMM0 && Reg <= X86::XMM31)
return X86::ZMM0 + (Reg - X86::XMM0);
if (Reg >= X86::YMM0 && Reg <= X86::YMM31)
@@ -787,5 +637,3 @@ unsigned get512BitSuperRegister(unsigned Reg) {
return Reg;
llvm_unreachable("Unexpected SIMD register");
}
-
-}
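
The call sites rewritten above (getX86SubSuperRegister(getBaseRegister(), 64), getX86SubSuperRegister(FrameReg, 32)) show the helper's new shape: a register plus a width in bits replaces the old MVT-plus-High pair, and the large switch tables leave this file. A minimal sketch of the new usage, assuming the relocated implementation keeps the mapping of the removed i8/i16/i32/i64 tables:

// Sketch only; the expected results follow the removed switch tables above.
unsigned Reg32 = getX86SubSuperRegister(X86::RBP, 32);  // -> X86::EBP
unsigned Reg64 = getX86SubSuperRegister(X86::EBP, 64);  // -> X86::RBP
unsigned Low8  = getX86SubSuperRegister(X86::ECX, 8);   // -> X86::CL (non-High case)
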
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index 8de1d0bf8ec8..f014c8f6ff61 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -87,6 +87,11 @@ public:
const TargetRegisterClass *
getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
+ /// getGPRsForTailCall - Returns a register class with registers that can be
+ /// used in forming tail calls.
+ const TargetRegisterClass *
+ getGPRsForTailCall(const MachineFunction &MF) const;
+
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const override;
@@ -96,7 +101,11 @@ public:
getCalleeSavedRegs(const MachineFunction* MF) const override;
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID) const override;
- const uint32_t *getNoPreservedMask() const;
+ const uint32_t *getNoPreservedMask() const override;
+
+ // Calls involved in thread-local variable lookup save more registers than
+ // normal calls, so they need a different mask to represent this.
+ const uint32_t *getDarwinTLSCallPreservedMask() const;
/// getReservedRegs - Returns a bitset indexed by physical register number
/// indicating if a register is a special register that has particular uses and
@@ -108,9 +117,7 @@ public:
bool hasBasePointer(const MachineFunction &MF) const;
- bool canRealignStack(const MachineFunction &MF) const;
-
- bool needsStackRealignment(const MachineFunction &MF) const override;
+ bool canRealignStack(const MachineFunction &MF) const override;
bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg,
int &FrameIdx) const override;
@@ -128,16 +135,6 @@ public:
unsigned getSlotSize() const { return SlotSize; }
};
-/// Returns the sub or super register of a specific X86 register.
-/// e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) returns X86::AX.
-/// Aborts on error.
-unsigned getX86SubSuperRegister(unsigned, MVT::SimpleValueType, bool High=false);
-
-/// Returns the sub or super register of a specific X86 register.
-/// Like getX86SubSuperRegister() but returns 0 on error.
-unsigned getX86SubSuperRegisterOrZero(unsigned, MVT::SimpleValueType,
- bool High = false);
-
//get512BitRegister - X86 utility - returns 512-bit super register
unsigned get512BitSuperRegister(unsigned Reg);
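
The two additions above (the CXX_FAST_TLS save list/reg-mask and getDarwinTLSCallPreservedMask) are meant to be consumed by call lowering. A hedged sketch of how a caller might attach the wider mask to a Darwin TLS call; the surrounding DAG plumbing (MF, DAG, Ops) is an assumption for illustration, only the two new hooks come from this patch:

// Sketch under stated assumptions: MF, DAG and Ops are the usual call-lowering
// locals; getDarwinTLSCallPreservedMask() is the hook added in this diff.
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
const uint32_t *Mask = Subtarget.is64Bit() && Subtarget.isTargetDarwin()
                           ? TRI->getDarwinTLSCallPreservedMask() // TLS stub clobbers almost nothing
                           : TRI->getCallPreservedMask(MF, CallingConv::C);
Ops.push_back(DAG.getRegisterMask(Mask)); // allocator sees the extra preserved regs
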
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index cdb151c26a05..56f0d9352d30 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -225,15 +225,15 @@ let SubRegIndices = [sub_ymm] in {
}
}
- // Mask Registers, used by AVX-512 instructions.
- def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, -2, -2]>;
- def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, -2, -2]>;
- def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, -2, -2]>;
- def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, -2, -2]>;
- def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, -2, -2]>;
- def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, -2, -2]>;
- def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>;
- def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>;
+// Mask Registers, used by AVX-512 instructions.
+def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, -2, -2]>;
+def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, -2, -2]>;
+def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, -2, -2]>;
+def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, -2, -2]>;
+def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, -2, -2]>;
+def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, -2, -2]>;
+def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>;
+def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>;
// Floating point stack registers. These don't map one-to-one to the FP
// pseudo registers, but we still mark them as aliasing FP registers. That
@@ -375,7 +375,7 @@ def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>;
def GR64_TC : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI,
R8, R9, R11, RIP)>;
def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX,
- R8, R9, R11)>;
+ R8, R9, R10, R11, RIP)>;
// GR8_NOREX - GR8 registers which do not require a REX prefix.
def GR8_NOREX : RegisterClass<"X86", [i8], 8,
@@ -423,6 +423,8 @@ def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>;
def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>;
+def FR128 : RegisterClass<"X86", [i128, f128], 128, (add FR32)>;
+
// FIXME: This sets up the floating point register files as though they are f64
// values, though they really are f80 values. This will cause us to spill
@@ -442,10 +444,11 @@ def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> {
}
// Generic vector registers: VR64 and VR128.
+// Ensure that float types are declared first - only float is legal on SSE1.
def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>;
-def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64],
128, (add FR32)>;
-def VR256 : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
256, (sequence "YMM%u", 0, 15)>;
// Status flags registers.
@@ -459,8 +462,8 @@ def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> {
}
// AVX-512 vector/mask registers.
-def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64], 512,
- (sequence "ZMM%u", 0, 31)>;
+def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64],
+ 512, (sequence "ZMM%u", 0, 31)>;
// Scalar AVX-512 floating point registers.
def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>;
@@ -468,10 +471,10 @@ def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>;
def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>;
// Extended VR128 and VR256 for AVX-512 instructions
-def VR128X : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- 128, (add FR32X)>;
-def VR256X : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
- 256, (sequence "YMM%u", 0, 31)>;
+def VR128X : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64],
+ 128, (add FR32X)>;
+def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
+ 256, (sequence "YMM%u", 0, 31)>;
// Mask registers
def VK1 : RegisterClass<"X86", [i1], 8, (sequence "K%u", 0, 7)> {let Size = 8;}
@@ -491,4 +494,4 @@ def VK32WM : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;}
def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
// Bound registers
-def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>;
\ No newline at end of file
+def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>;
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index ce79fcf9ad81..b1a01614b4a1 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -44,13 +44,10 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible(
return false;
}
-SDValue
-X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
- SDValue Chain,
- SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align,
- bool isVolatile,
- MachinePointerInfo DstPtrInfo) const {
+SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
+ SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
+ MachinePointerInfo DstPtrInfo) const {
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
const X86Subtarget &Subtarget =
DAG.getMachineFunction().getSubtarget<X86Subtarget>();
@@ -74,10 +71,10 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
// Check to see if there is a specialized entry-point for memory zeroing.
ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
- if (const char *bzeroEntry = V &&
+ if (const char *bzeroEntry = V &&
V->isNullValue() ? Subtarget.getBZeroEntry() : nullptr) {
- EVT IntPtr =
- DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
@@ -94,7 +91,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
0)
.setDiscardResult();
- std::pair<SDValue,SDValue> CallResult = DAG.getTargetLoweringInfo().LowerCallTo(CLI);
+ std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
return CallResult.second;
}
@@ -144,8 +141,8 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
BytesLeft = SizeVal % UBytes;
}
- Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
- InFlag);
+ Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
+ InFlag);
InFlag = Chain.getValue(1);
} else {
AVT = MVT::i8;
@@ -172,9 +169,8 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count,
DAG.getConstant((AVT == MVT::i64) ? 7 : 3, dl,
CVT));
- Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX :
- X86::ECX,
- Left, InFlag);
+ Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : X86::ECX,
+ Left, InFlag);
InFlag = Chain.getValue(1);
Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag };
@@ -249,17 +245,14 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
unsigned BytesLeft = SizeVal % UBytes;
SDValue InFlag;
- Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX :
- X86::ECX,
- Count, InFlag);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX,
+ Count, InFlag);
InFlag = Chain.getValue(1);
- Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI :
- X86::EDI,
- Dst, InFlag);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI,
+ Dst, InFlag);
InFlag = Chain.getValue(1);
- Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RSI :
- X86::ESI,
- Src, InFlag);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RSI : X86::ESI,
+ Src, InFlag);
InFlag = Chain.getValue(1);
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index dff3624b7efe..8ef08c960f0b 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -44,9 +44,8 @@ X86EarlyIfConv("x86-early-ifcvt", cl::Hidden,
cl::desc("Enable early if-conversion on X86"));
-/// ClassifyBlockAddressReference - Classify a blockaddress reference for the
-/// current subtarget according to how we should reference it in a non-pcrel
-/// context.
+/// Classify a blockaddress reference for the current subtarget according to how
+/// we should reference it in a non-pcrel context.
unsigned char X86Subtarget::ClassifyBlockAddressReference() const {
if (isPICStyleGOT()) // 32-bit ELF targets.
return X86II::MO_GOTOFF;
@@ -58,9 +57,8 @@ unsigned char X86Subtarget::ClassifyBlockAddressReference() const {
return X86II::MO_NO_FLAG;
}
-/// ClassifyGlobalReference - Classify a global variable reference for the
-/// current subtarget according to how we should reference it in a non-pcrel
-/// context.
+/// Classify a global variable reference for the current subtarget according to
+/// how we should reference it in a non-pcrel context.
unsigned char X86Subtarget::
ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const {
// DLLImport only exists on windows, it is implemented as a load from a
@@ -147,9 +145,9 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const {
}
-/// getBZeroEntry - This function returns the name of a function which has an
-/// interface like the non-standard bzero function, if such a function exists on
-/// the current subtarget and it is considered prefereable over memset with zero
+/// This function returns the name of a function which has an interface like
+/// the non-standard bzero function, if such a function exists on the
+/// current subtarget and it is considered preferable over memset with zero
/// passed as the second argument. Otherwise it returns null.
const char *X86Subtarget::getBZeroEntry() const {
// Darwin 10 has a __bzero entry point for this purpose.
@@ -166,8 +164,7 @@ bool X86Subtarget::hasSinCos() const {
is64Bit();
}
-/// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls
-/// to immediate address.
+/// Return true if the subtarget allows calls to immediate address.
bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const {
// FIXME: I386 PE/COFF supports PC relative calls using IMAGE_REL_I386_REL32
// but WinCOFFObjectWriter::RecordRelocation cannot emit them. Once it does,
@@ -192,9 +189,25 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
FullFS = "+64bit,+sse2";
}
+ // LAHF/SAHF are always supported in non-64-bit mode.
+ if (!In64BitMode) {
+ if (!FullFS.empty())
+ FullFS = "+sahf," + FullFS;
+ else
+ FullFS = "+sahf";
+ }
+
+
// Parse features string and set the CPU.
ParseSubtargetFeatures(CPUName, FullFS);
+ // All CPUs that implement SSE4.2 or SSE4A support unaligned accesses of
+ // 16-bytes and under that are reasonably fast. These features were
+ // introduced with Intel's Nehalem/Silvermont and AMD's Family10h
+ // micro-architectures respectively.
+ if (hasSSE42() || hasSSE4A())
+ IsUAMem16Slow = false;
+
InstrItins = getInstrItineraryForCPU(CPUName);
// It's important to keep the MCSubtargetInfo feature bits in sync with
@@ -224,13 +237,18 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
}
void X86Subtarget::initializeEnvironment() {
- X86SSELevel = NoMMXSSE;
+ X86SSELevel = NoSSE;
X863DNowLevel = NoThreeDNow;
HasCMov = false;
HasX86_64 = false;
HasPOPCNT = false;
HasSSE4A = false;
HasAES = false;
+ HasFXSR = false;
+ HasXSAVE = false;
+ HasXSAVEOPT = false;
+ HasXSAVEC = false;
+ HasXSAVES = false;
HasPCLMUL = false;
HasFMA = false;
HasFMA4 = false;
@@ -252,13 +270,15 @@ void X86Subtarget::initializeEnvironment() {
HasBWI = false;
HasVLX = false;
HasADX = false;
+ HasPKU = false;
HasSHA = false;
HasPRFCHW = false;
HasRDSEED = false;
+ HasLAHFSAHF = false;
HasMPX = false;
IsBTMemSlow = false;
IsSHLDSlow = false;
- IsUAMemFast = false;
+ IsUAMem16Slow = false;
IsUAMem32Slow = false;
HasSSEUnalignedMem = false;
HasCmpxchg16b = false;
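
The LAHF/SAHF hunk above prepends "+sahf" rather than appending it; since later entries in a subtarget feature string override earlier ones, an explicit "-sahf" coming from the CPU model or the user can still turn the feature back off. A standalone sketch of the resulting strings (the helper name is illustrative, not LLVM code):

// Mirrors only the non-64-bit branch added above.
#include <string>
std::string addSahf(std::string FullFS, bool In64BitMode) {
  if (!In64BitMode)
    FullFS = FullFS.empty() ? "+sahf" : "+sahf," + FullFS;
  return FullFS; // ""      -> "+sahf"
                 // "+sse2" -> "+sahf,+sse2"
}
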
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index f026d4295f71..13d1026dcaa0 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -47,11 +47,11 @@ class X86Subtarget final : public X86GenSubtargetInfo {
protected:
enum X86SSEEnum {
- NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
+ NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
};
enum X863DNowEnum {
- NoThreeDNow, ThreeDNow, ThreeDNowA
+ NoThreeDNow, MMX, ThreeDNow, ThreeDNowA
};
enum X86ProcFamilyEnum {
@@ -64,10 +64,10 @@ protected:
/// Which PIC style to use
PICStyles::Style PICStyle;
- /// MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported.
+ /// SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported.
X86SSEEnum X86SSELevel;
- /// 3DNow, 3DNow Athlon, or none supported.
+ /// MMX, 3DNow, 3DNow Athlon, or none supported.
X863DNowEnum X863DNowLevel;
/// True if this processor has conditional move instructions
@@ -86,6 +86,18 @@ protected:
/// Target has AES instructions
bool HasAES;
+ /// Target has FXSAVE/FXRESTOR instructions
+ bool HasFXSR;
+
+ /// Target has XSAVE instructions
+ bool HasXSAVE;
+ /// Target has XSAVEOPT instructions
+ bool HasXSAVEOPT;
+ /// Target has XSAVEC instructions
+ bool HasXSAVEC;
+ /// Target has XSAVES instructions
+ bool HasXSAVES;
+
/// Target has carry-less multiplication
bool HasPCLMUL;
@@ -140,16 +152,19 @@ protected:
/// Processor has RDSEED instructions.
bool HasRDSEED;
+ /// Processor has LAHF/SAHF instructions.
+ bool HasLAHFSAHF;
+
/// True if BT (bit test) of memory instructions are slow.
bool IsBTMemSlow;
/// True if SHLD instructions are slow.
bool IsSHLDSlow;
- /// True if unaligned memory access is fast.
- bool IsUAMemFast;
+ /// True if unaligned memory accesses of 16-bytes are slow.
+ bool IsUAMem16Slow;
- /// True if unaligned 32-byte memory accesses are slow.
+ /// True if unaligned memory accesses of 32-bytes are slow.
bool IsUAMem32Slow;
/// True if SSE operations can have unaligned memory operands.
@@ -208,6 +223,9 @@ protected:
   /// Processor has AVX-512 Vector Length eXtensions
bool HasVLX;
+  /// Processor has PKU extensions
+ bool HasPKU;
+
   /// Processor supports MPX - Memory Protection Extensions
bool HasMPX;
@@ -319,7 +337,6 @@ public:
void setPICStyle(PICStyles::Style Style) { PICStyle = Style; }
bool hasCMov() const { return HasCMov; }
- bool hasMMX() const { return X86SSELevel >= MMX; }
bool hasSSE1() const { return X86SSELevel >= SSE1; }
bool hasSSE2() const { return X86SSELevel >= SSE2; }
bool hasSSE3() const { return X86SSELevel >= SSE3; }
@@ -332,14 +349,22 @@ public:
bool hasFp256() const { return hasAVX(); }
bool hasInt256() const { return hasAVX2(); }
bool hasSSE4A() const { return HasSSE4A; }
+ bool hasMMX() const { return X863DNowLevel >= MMX; }
bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
bool hasPOPCNT() const { return HasPOPCNT; }
bool hasAES() const { return HasAES; }
+ bool hasFXSR() const { return HasFXSR; }
+ bool hasXSAVE() const { return HasXSAVE; }
+ bool hasXSAVEOPT() const { return HasXSAVEOPT; }
+ bool hasXSAVEC() const { return HasXSAVEC; }
+ bool hasXSAVES() const { return HasXSAVES; }
bool hasPCLMUL() const { return HasPCLMUL; }
- bool hasFMA() const { return HasFMA; }
- // FIXME: Favor FMA when both are enabled. Is this the right thing to do?
- bool hasFMA4() const { return HasFMA4 && !HasFMA; }
+  // Prefer FMA4 to FMA - it's better for commutation/memory folding and
+ // has equal or better performance on all supported targets.
+ bool hasFMA() const { return HasFMA && !HasFMA4; }
+ bool hasFMA4() const { return HasFMA4; }
+ bool hasAnyFMA() const { return hasFMA() || hasFMA4() || hasAVX512(); }
bool hasXOP() const { return HasXOP; }
bool hasTBM() const { return HasTBM; }
bool hasMOVBE() const { return HasMOVBE; }
@@ -355,9 +380,10 @@ public:
bool hasSHA() const { return HasSHA; }
bool hasPRFCHW() const { return HasPRFCHW; }
bool hasRDSEED() const { return HasRDSEED; }
+ bool hasLAHFSAHF() const { return HasLAHFSAHF; }
bool isBTMemSlow() const { return IsBTMemSlow; }
bool isSHLDSlow() const { return IsSHLDSlow; }
- bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
+ bool isUnalignedMem16Slow() const { return IsUAMem16Slow; }
bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }
bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
@@ -375,6 +401,7 @@ public:
bool hasDQI() const { return HasDQI; }
bool hasBWI() const { return HasBWI; }
bool hasVLX() const { return HasVLX; }
+ bool hasPKU() const { return HasPKU; }
bool hasMPX() const { return HasMPX; }
bool isAtom() const { return X86ProcFamily == IntelAtom; }
@@ -394,9 +421,11 @@ public:
bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+ bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); }
bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); }
+ bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); }
bool isTargetWindowsMSVC() const {
return TargetTriple.isWindowsMSVCEnvironment();
@@ -406,6 +435,10 @@ public:
return TargetTriple.isKnownWindowsMSVCEnvironment();
}
+ bool isTargetWindowsCoreCLR() const {
+ return TargetTriple.isWindowsCoreCLREnvironment();
+ }
+
bool isTargetWindowsCygwin() const {
return TargetTriple.isWindowsCygwinEnvironment();
}
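
The FMA predicates above now make hasFMA() and hasFMA4() mutually exclusive in favour of FMA4. On a part that reports both feature bits (a CPU chosen purely for illustration; the diff itself names none), the new predicates resolve as follows:

// Truth-table sketch of the new predicates; plain C++ standing in for the
// subtarget flags, nothing here beyond the boolean logic shown in the diff.
bool HasFMA = true, HasFMA4 = true;   // both feature bits set
bool UseFMA  = HasFMA && !HasFMA4;    // false - FMA patterns suppressed
bool UseFMA4 = HasFMA4;               // true  - FMA4 preferred
bool AnyFMA  = UseFMA || UseFMA4;     // true  - hasAnyFMA() (ignoring AVX-512 here)
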
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index fb9cb4ba4c86..0e7e4c0c84a9 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -28,10 +28,17 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
cl::desc("Enable the machine combiner pass"),
cl::init(true), cl::Hidden);
+namespace llvm {
+void initializeWinEHStatePassPass(PassRegistry &);
+}
+
extern "C" void LLVMInitializeX86Target() {
// Register the target.
RegisterTargetMachine<X86TargetMachine> X(TheX86_32Target);
RegisterTargetMachine<X86TargetMachine> Y(TheX86_64Target);
+
+ PassRegistry &PR = *PassRegistry::getPassRegistry();
+ initializeWinEHStatePassPass(PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -45,7 +52,7 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
return make_unique<X86LinuxNaClTargetObjectFile>();
if (TT.isOSBinFormatELF())
return make_unique<X86ELFTargetObjectFile>();
- if (TT.isKnownWindowsMSVCEnvironment())
+ if (TT.isKnownWindowsMSVCEnvironment() || TT.isWindowsCoreCLREnvironment())
return make_unique<X86WindowsTargetObjectFile>();
if (TT.isOSBinFormatCOFF())
return make_unique<TargetLoweringObjectFileCOFF>();
@@ -175,8 +182,9 @@ UseVZeroUpper("x86-use-vzeroupper", cl::Hidden,
//===----------------------------------------------------------------------===//
TargetIRAnalysis X86TargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis(
- [this](Function &F) { return TargetTransformInfo(X86TTIImpl(this, F)); });
+ return TargetIRAnalysis([this](const Function &F) {
+ return TargetTransformInfo(X86TTIImpl(this, F));
+ });
}
@@ -246,6 +254,9 @@ bool X86PassConfig::addPreISel() {
}
void X86PassConfig::addPreRegAlloc() {
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createX86OptimizeLEAs());
+
addPass(createX86CallFrameOptimization());
}
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index 6f900ea351ef..782768d0ab16 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -16,6 +16,7 @@
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/Support/COFF.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Target/TargetLowering.h"
@@ -152,9 +153,8 @@ static std::string scalarConstantToHexString(const Constant *C) {
}
}
-MCSection *
-X86WindowsTargetObjectFile::getSectionForConstant(SectionKind Kind,
- const Constant *C) const {
+MCSection *X86WindowsTargetObjectFile::getSectionForConstant(
+ const DataLayout &DL, SectionKind Kind, const Constant *C) const {
if (Kind.isMergeableConst() && C) {
const unsigned Characteristics = COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
COFF::IMAGE_SCN_MEM_READ |
@@ -171,5 +171,5 @@ X86WindowsTargetObjectFile::getSectionForConstant(SectionKind Kind,
COFF::IMAGE_COMDAT_SELECT_ANY);
}
- return TargetLoweringObjectFile::getSectionForConstant(Kind, C);
+ return TargetLoweringObjectFile::getSectionForConstant(DL, Kind, C);
}
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index 66366b2373cd..6b2448cc9de6 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -58,7 +58,7 @@ namespace llvm {
/// \brief Given a mergeable constant with the specified size and relocation
/// information, return a section that it should be placed in.
- MCSection *getSectionForConstant(SectionKind Kind,
+ MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
const Constant *C) const override;
};
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 7df726091843..2e7bbb208743 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -21,6 +21,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
+
using namespace llvm;
#define DEBUG_TYPE "x86tti"
@@ -62,8 +63,8 @@ unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) {
if (ST->is64Bit())
return 64;
- return 32;
+ return 32;
}
unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
@@ -84,12 +85,12 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
return 2;
}
-unsigned X86TTIImpl::getArithmeticInstrCost(
+int X86TTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo) {
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
@@ -101,10 +102,9 @@ unsigned X86TTIImpl::getArithmeticInstrCost(
// normally expanded to the sequence SRA + SRL + ADD + SRA.
    // The OperandValue properties may not be the same as those of the previous
    // operation; conservatively assume OP_None.
- unsigned Cost =
- 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
+ int Cost = 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info,
+ Op2Info, TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
@@ -115,8 +115,7 @@ unsigned X86TTIImpl::getArithmeticInstrCost(
return Cost;
}
- static const CostTblEntry<MVT::SimpleValueType>
- AVX2UniformConstCostTable[] = {
+ static const CostTblEntry AVX2UniformConstCostTable[] = {
{ ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
{ ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
@@ -127,12 +126,12 @@ unsigned X86TTIImpl::getArithmeticInstrCost(
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
ST->hasAVX2()) {
- int Idx = CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second);
- if (Idx != -1)
- return LT.first * AVX2UniformConstCostTable[Idx].Cost;
+ if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
}
- static const CostTblEntry<MVT::SimpleValueType> AVX512CostTable[] = {
+ static const CostTblEntry AVX512CostTable[] = {
{ ISD::SHL, MVT::v16i32, 1 },
{ ISD::SRL, MVT::v16i32, 1 },
{ ISD::SRA, MVT::v16i32, 1 },
@@ -141,7 +140,12 @@ unsigned X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v8i64, 1 },
};
- static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = {
+ if (ST->hasAVX512()) {
+ if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
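+  // For the lookups rewritten in this function, LT.first is the
+  // type-legalization split factor, so a single table entry prices every piece
+  // a wide vector is split into. Worked example using entries from this file's
+  // tables (the split of v8i64 into two v4i64 halves on AVX2 is a property of
+  // legalization, not something this patch changes):
+  //   ISD::SRA on <8 x i64> with AVX-512:  LT = {1, v8i64}, cost 1 -> 1 * 1 = 1
+  //   ISD::SRA on <8 x i64> with AVX2 only: LT = {2, v4i64}, cost 4 -> 2 * 4 = 8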
+ static const CostTblEntry AVX2CostTable[] = {
    // Shifts on v4i64/v8i32 on AVX2 are legal even though we declare to
    // customize them to detect the cases where the shift amount is a scalar one.
{ ISD::SHL, MVT::v4i32, 1 },
@@ -154,7 +158,57 @@ unsigned X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRL, MVT::v2i64, 1 },
{ ISD::SHL, MVT::v4i64, 1 },
{ ISD::SRL, MVT::v4i64, 1 },
+ };
+
+ // Look for AVX2 lowering tricks.
+ if (ST->hasAVX2()) {
+ if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
+ (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
+ // On AVX2, a packed v16i16 shift left by a constant build_vector
+ // is lowered into a vector multiply (vpmullw).
+ return LT.first;
+
+ if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry XOPCostTable[] = {
+ // 128bit shifts take 1cy, but right shifts require negation beforehand.
+ { ISD::SHL, MVT::v16i8, 1 },
+ { ISD::SRL, MVT::v16i8, 2 },
+ { ISD::SRA, MVT::v16i8, 2 },
+ { ISD::SHL, MVT::v8i16, 1 },
+ { ISD::SRL, MVT::v8i16, 2 },
+ { ISD::SRA, MVT::v8i16, 2 },
+ { ISD::SHL, MVT::v4i32, 1 },
+ { ISD::SRL, MVT::v4i32, 2 },
+ { ISD::SRA, MVT::v4i32, 2 },
+ { ISD::SHL, MVT::v2i64, 1 },
+ { ISD::SRL, MVT::v2i64, 2 },
+ { ISD::SRA, MVT::v2i64, 2 },
+ // 256bit shifts require splitting if AVX2 didn't catch them above.
+ { ISD::SHL, MVT::v32i8, 2 },
+ { ISD::SRL, MVT::v32i8, 4 },
+ { ISD::SRA, MVT::v32i8, 4 },
+ { ISD::SHL, MVT::v16i16, 2 },
+ { ISD::SRL, MVT::v16i16, 4 },
+ { ISD::SRA, MVT::v16i16, 4 },
+ { ISD::SHL, MVT::v8i32, 2 },
+ { ISD::SRL, MVT::v8i32, 4 },
+ { ISD::SRA, MVT::v8i32, 4 },
+ { ISD::SHL, MVT::v4i64, 2 },
+ { ISD::SRL, MVT::v4i64, 4 },
+ { ISD::SRA, MVT::v4i64, 4 },
+ };
+ // Look for XOP lowering tricks.
+ if (ST->hasXOP()) {
+ if (const auto *Entry = CostTableLookup(XOPCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX2CustomCostTable[] = {
{ ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
{ ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
@@ -163,7 +217,8 @@ unsigned X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
{ ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
- { ISD::SRA, MVT::v4i64, 4*10 }, // Scalarized.
+ { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
+ { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
// Vectorizing division is a bad idea. See the SSE2 table for more comments.
{ ISD::SDIV, MVT::v32i8, 32*20 },
@@ -176,44 +231,44 @@ unsigned X86TTIImpl::getArithmeticInstrCost(
{ ISD::UDIV, MVT::v4i64, 4*20 },
};
- if (ST->hasAVX512()) {
- int Idx = CostTableLookup(AVX512CostTable, ISD, LT.second);
- if (Idx != -1)
- return LT.first * AVX512CostTable[Idx].Cost;
- }
- // Look for AVX2 lowering tricks.
+ // Look for AVX2 lowering tricks for custom cases.
if (ST->hasAVX2()) {
- if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
- (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
- Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
- // On AVX2, a packed v16i16 shift left by a constant build_vector
- // is lowered into a vector multiply (vpmullw).
- return LT.first;
-
- int Idx = CostTableLookup(AVX2CostTable, ISD, LT.second);
- if (Idx != -1)
- return LT.first * AVX2CostTable[Idx].Cost;
+ if (const auto *Entry = CostTableLookup(AVX2CustomCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
}
- static const CostTblEntry<MVT::SimpleValueType>
+ static const CostTblEntry
SSE2UniformConstCostTable[] = {
// We don't correctly identify costs of casts because they are marked as
// custom.
// Constant splats are cheaper for the following instructions.
{ ISD::SHL, MVT::v16i8, 1 }, // psllw.
+ { ISD::SHL, MVT::v32i8, 2 }, // psllw.
{ ISD::SHL, MVT::v8i16, 1 }, // psllw.
+ { ISD::SHL, MVT::v16i16, 2 }, // psllw.
{ ISD::SHL, MVT::v4i32, 1 }, // pslld
+ { ISD::SHL, MVT::v8i32, 2 }, // pslld
{ ISD::SHL, MVT::v2i64, 1 }, // psllq.
+ { ISD::SHL, MVT::v4i64, 2 }, // psllq.
{ ISD::SRL, MVT::v16i8, 1 }, // psrlw.
+ { ISD::SRL, MVT::v32i8, 2 }, // psrlw.
{ ISD::SRL, MVT::v8i16, 1 }, // psrlw.
+ { ISD::SRL, MVT::v16i16, 2 }, // psrlw.
{ ISD::SRL, MVT::v4i32, 1 }, // psrld.
+ { ISD::SRL, MVT::v8i32, 2 }, // psrld.
{ ISD::SRL, MVT::v2i64, 1 }, // psrlq.
+ { ISD::SRL, MVT::v4i64, 2 }, // psrlq.
{ ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
+ { ISD::SRA, MVT::v32i8, 8 }, // psrlw, pand, pxor, psubb.
{ ISD::SRA, MVT::v8i16, 1 }, // psraw.
+ { ISD::SRA, MVT::v16i16, 2 }, // psraw.
{ ISD::SRA, MVT::v4i32, 1 }, // psrad.
+ { ISD::SRA, MVT::v8i32, 2 }, // psrad.
{ ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle.
+ { ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle.
{ ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
{ ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
@@ -227,27 +282,34 @@ unsigned X86TTIImpl::getArithmeticInstrCost(
if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
return LT.first * 15;
- int Idx = CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second);
- if (Idx != -1)
- return LT.first * SSE2UniformConstCostTable[Idx].Cost;
+ if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
}
if (ISD == ISD::SHL &&
Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
- EVT VT = LT.second;
+ MVT VT = LT.second;
+ // Vector shift left by non uniform constant can be lowered
+ // into vector multiply (pmullw/pmulld).
if ((VT == MVT::v8i16 && ST->hasSSE2()) ||
(VT == MVT::v4i32 && ST->hasSSE41()))
- // Vector shift left by non uniform constant can be lowered
- // into vector multiply (pmullw/pmulld).
return LT.first;
+
+ // v16i16 and v8i32 shifts by non-uniform constants are lowered into a
+ // sequence of extract + two vector multiply + insert.
+ if ((VT == MVT::v8i32 || VT == MVT::v16i16) &&
+ (ST->hasAVX() && !ST->hasAVX2()))
+ ISD = ISD::MUL;
+
+ // A vector shift left by non uniform constant is converted
+ // into a vector multiply; the new multiply is eventually
+ // lowered into a sequence of shuffles and 2 x pmuludq.
if (VT == MVT::v4i32 && ST->hasSSE2())
- // A vector shift left by non uniform constant is converted
- // into a vector multiply; the new multiply is eventually
- // lowered into a sequence of shuffles and 2 x pmuludq.
ISD = ISD::MUL;
}
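
The comments in the block above all rely on the same identity: a lane-wise shift left by a constant c equals a multiply by 1 << c, which is what lets the cost model reuse the MUL (pmullw/pmulld) costs for these shifts. A tiny standalone check of that identity on 16-bit lanes (not LLVM code):

// Check of x << c == x * (1 << c), the identity behind the pmullw lowering.
#include <cassert>
#include <cstdint>
int main() {
  const uint16_t Lane[4]  = {3, 7, 100, 255};
  const uint16_t Shift[4] = {0, 1, 4, 7};   // a non-uniform constant "vector"
  for (int i = 0; i != 4; ++i)
    assert(uint16_t(Lane[i] << Shift[i]) ==
           uint16_t(Lane[i] * uint16_t(1u << Shift[i])));
  return 0;
}
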
- static const CostTblEntry<MVT::SimpleValueType> SSE2CostTable[] = {
+ static const CostTblEntry SSE2CostTable[] = {
// We don't correctly identify costs of casts because they are marked as
// custom.
// For some cases, where the shift amount is a scalar we would be able
@@ -257,20 +319,31 @@ unsigned X86TTIImpl::getArithmeticInstrCost(
// used for vectorization and we don't want to make vectorized code worse
// than scalar code.
{ ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v32i8, 2*26 }, // cmpgtb sequence.
{ ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v16i16, 2*32 }, // cmpgtb sequence.
{ ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
- { ISD::SHL, MVT::v2i64, 2*10 }, // Scalarized.
- { ISD::SHL, MVT::v4i64, 4*10 }, // Scalarized.
+ { ISD::SHL, MVT::v8i32, 2*2*5 }, // We optimized this using mul.
+ { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
+ { ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence.
{ ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v32i8, 2*26 }, // cmpgtb sequence.
{ ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v16i16, 2*32 }, // cmpgtb sequence.
{ ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
- { ISD::SRL, MVT::v2i64, 2*10 }, // Scalarized.
+ { ISD::SRL, MVT::v8i32, 2*16 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
+ { ISD::SRL, MVT::v4i64, 2*4 }, // splat+shuffle sequence.
{ ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
+ { ISD::SRA, MVT::v32i8, 2*54 }, // unpacked cmpgtb sequence.
{ ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SRA, MVT::v16i16, 2*32 }, // cmpgtb sequence.
{ ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
- { ISD::SRA, MVT::v2i64, 2*10 }, // Scalarized.
+ { ISD::SRA, MVT::v8i32, 2*16 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
+ { ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence.
// It is not a good idea to vectorize division. We have to scalarize it and
     // in the process we will often end up having to spill regular
@@ -289,12 +362,11 @@ unsigned X86TTIImpl::getArithmeticInstrCost(
};
if (ST->hasSSE2()) {
- int Idx = CostTableLookup(SSE2CostTable, ISD, LT.second);
- if (Idx != -1)
- return LT.first * SSE2CostTable[Idx].Cost;
+ if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
}
- static const CostTblEntry<MVT::SimpleValueType> AVX1CostTable[] = {
+ static const CostTblEntry AVX1CostTable[] = {
// We don't have to scalarize unsupported ops. We can issue two half-sized
// operations and we only need to extract the upper YMM half.
// Two ops + 1 extract + 1 insert = 4.
@@ -314,29 +386,21 @@ unsigned X86TTIImpl::getArithmeticInstrCost(
// Look for AVX1 lowering tricks.
if (ST->hasAVX() && !ST->hasAVX2()) {
- EVT VT = LT.second;
+ MVT VT = LT.second;
- // v16i16 and v8i32 shifts by non-uniform constants are lowered into a
- // sequence of extract + two vector multiply + insert.
- if (ISD == ISD::SHL && (VT == MVT::v8i32 || VT == MVT::v16i16) &&
- Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)
- ISD = ISD::MUL;
-
- int Idx = CostTableLookup(AVX1CostTable, ISD, VT);
- if (Idx != -1)
- return LT.first * AVX1CostTable[Idx].Cost;
+ if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, VT))
+ return LT.first * Entry->Cost;
}
// Custom lowering of vectors.
- static const CostTblEntry<MVT::SimpleValueType> CustomLowered[] = {
+ static const CostTblEntry CustomLowered[] = {
// A v2i64/v4i64 and multiply is custom lowered as a series of long
// multiplies(3), shifts(4) and adds(2).
{ ISD::MUL, MVT::v2i64, 9 },
{ ISD::MUL, MVT::v4i64, 9 },
};
- int Idx = CostTableLookup(CustomLowered, ISD, LT.second);
- if (Idx != -1)
- return LT.first * CustomLowered[Idx].Cost;
+ if (const auto *Entry = CostTableLookup(CustomLowered, ISD, LT.second))
+ return LT.first * Entry->Cost;
// Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle,
// 2x pmuludq, 2x shuffle.
@@ -348,15 +412,15 @@ unsigned X86TTIImpl::getArithmeticInstrCost(
return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
}
-unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
- Type *SubTp) {
+int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+ Type *SubTp) {
// We only estimate the cost of reverse and alternate shuffles.
if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate)
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
if (Kind == TTI::SK_Reverse) {
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
- unsigned Cost = 1;
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ int Cost = 1;
if (LT.second.getSizeInBits() > 128)
Cost = 3; // Extract + insert + copy.
@@ -367,14 +431,14 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
if (Kind == TTI::SK_Alternate) {
// 64-bit packed float vectors (v2f32) are widened to type v4f32.
// 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
// The backend knows how to generate a single VEX.256 version of
// instruction VPBLENDW if the target supports AVX2.
if (ST->hasAVX2() && LT.second == MVT::v16i16)
return LT.first;
- static const CostTblEntry<MVT::SimpleValueType> AVXAltShuffleTbl[] = {
+ static const CostTblEntry AVXAltShuffleTbl[] = {
{ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vblendpd
{ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vblendpd
@@ -390,13 +454,12 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ISD::VECTOR_SHUFFLE, MVT::v32i8, 9}
};
- if (ST->hasAVX()) {
- int Idx = CostTableLookup(AVXAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
- if (Idx != -1)
- return LT.first * AVXAltShuffleTbl[Idx].Cost;
- }
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVXAltShuffleTbl,
+ ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
- static const CostTblEntry<MVT::SimpleValueType> SSE41AltShuffleTbl[] = {
+ static const CostTblEntry SSE41AltShuffleTbl[] = {
// These are lowered into movsd.
{ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
{ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
@@ -414,13 +477,12 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}
};
- if (ST->hasSSE41()) {
- int Idx = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
- if (Idx != -1)
- return LT.first * SSE41AltShuffleTbl[Idx].Cost;
- }
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE,
+ LT.second))
+ return LT.first * Entry->Cost;
- static const CostTblEntry<MVT::SimpleValueType> SSSE3AltShuffleTbl[] = {
+ static const CostTblEntry SSSE3AltShuffleTbl[] = {
{ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd
{ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd
@@ -433,13 +495,12 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // pshufb + pshufb + or
};
- if (ST->hasSSSE3()) {
- int Idx = CostTableLookup(SSSE3AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
- if (Idx != -1)
- return LT.first * SSSE3AltShuffleTbl[Idx].Cost;
- }
+ if (ST->hasSSSE3())
+ if (const auto *Entry = CostTableLookup(SSSE3AltShuffleTbl,
+ ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
- static const CostTblEntry<MVT::SimpleValueType> SSEAltShuffleTbl[] = {
+ static const CostTblEntry SSEAltShuffleTbl[] = {
{ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd
{ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd
@@ -454,65 +515,47 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
};
// Fall-back (SSE3 and SSE2).
- int Idx = CostTableLookup(SSEAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
- if (Idx != -1)
- return LT.first * SSEAltShuffleTbl[Idx].Cost;
+ if (const auto *Entry = CostTableLookup(SSEAltShuffleTbl,
+ ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}
-unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
- std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
- std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
-
- static const TypeConversionCostTblEntry<MVT::SimpleValueType>
- SSE2ConvTbl[] = {
- // These are somewhat magic numbers justified by looking at the output of
- // Intel's IACA, running some kernels and making sure when we take
- // legalization into account the throughput will be overestimated.
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
- // There are faster sequences for float conversions.
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
+ // FIXME: Need a better design of the cost table to handle non-simple types of
+ // potential massive combinations (elem_num x src_type x dst_type).
+
+ static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
+
+ { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
};
- if (ST->hasSSE2() && !ST->hasAVX()) {
- int Idx =
- ConvertCostTableLookup(SSE2ConvTbl, ISD, LTDest.second, LTSrc.second);
- if (Idx != -1)
- return LTSrc.first * SSE2ConvTbl[Idx].Cost;
- }
-
- static const TypeConversionCostTblEntry<MVT::SimpleValueType>
- AVX512ConversionTbl[] = {
+ static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
{ ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },
- { ISD::FP_ROUND, MVT::v16f32, MVT::v8f64, 3 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 1 },
{ ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 1 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 1 },
{ ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 },
- { ISD::TRUNCATE, MVT::v16i32, MVT::v8i64, 4 },
// v16i1 -> v16i32 - load + broadcast
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
@@ -522,33 +565,49 @@ unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
- { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i32, 3 },
- { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i32, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
+ { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
- };
- if (ST->hasAVX512()) {
- int Idx = ConvertCostTableLookup(AVX512ConversionTbl, ISD, LTDest.second,
- LTSrc.second);
- if (Idx != -1)
- return AVX512ConversionTbl[Idx].Cost;
- }
- EVT SrcTy = TLI->getValueType(DL, Src);
- EVT DstTy = TLI->getValueType(DL, Dst);
-
- // The function getSimpleVT only handles simple value types.
- if (!SrcTy.isSimple() || !DstTy.isSimple())
- return BaseT::getCastInstrCost(Opcode, Dst, Src);
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 12 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 },
+
+ { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
+ };
- static const TypeConversionCostTblEntry<MVT::SimpleValueType>
- AVX2ConversionTbl[] = {
+ static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
@@ -579,8 +638,7 @@ unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
};
- static const TypeConversionCostTblEntry<MVT::SimpleValueType>
- AVXConversionTbl[] = {
+ static const TypeConversionCostTblEntry AVXConversionTbl[] = {
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
@@ -650,34 +708,158 @@ unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 },
};
+ static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 },
+
+ { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 },
+ { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 30 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 },
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 },
+ };
+
+ static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
+ // These are somewhat magic numbers justified by looking at the output of
+ // Intel's IACA, running some kernels and making sure when we take
+ // legalization into account the throughput will be overestimated.
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
+ // There are faster sequences for float conversions.
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
+
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 },
+
+ { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
+ { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 },
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 },
+ };
+
+ std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
+ std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
+
+ if (ST->hasSSE2() && !ST->hasAVX()) {
+ if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
+ LTDest.second, LTSrc.second))
+ return LTSrc.first * Entry->Cost;
+ }
+
+ EVT SrcTy = TLI->getValueType(DL, Src);
+ EVT DstTy = TLI->getValueType(DL, Dst);
+
+ // The function getSimpleVT only handles simple value types.
+ if (!SrcTy.isSimple() || !DstTy.isSimple())
+ return BaseT::getCastInstrCost(Opcode, Dst, Src);
+
+ if (ST->hasDQI())
+ if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
+ DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost;
+
+ if (ST->hasAVX512())
+ if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
+ DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost;
+
if (ST->hasAVX2()) {
- int Idx = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
- DstTy.getSimpleVT(), SrcTy.getSimpleVT());
- if (Idx != -1)
- return AVX2ConversionTbl[Idx].Cost;
+ if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
+ DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost;
}
if (ST->hasAVX()) {
- int Idx = ConvertCostTableLookup(AVXConversionTbl, ISD, DstTy.getSimpleVT(),
- SrcTy.getSimpleVT());
- if (Idx != -1)
- return AVXConversionTbl[Idx].Cost;
+ if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
+ DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost;
+ }
+
+ if (ST->hasSSE41()) {
+ if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
+ DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost;
+ }
+
+ if (ST->hasSSE2()) {
+ if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
+ DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost;
}
return BaseT::getCastInstrCost(Opcode, Dst, Src);
}
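For readers unfamiliar with the cost-table helpers used in getCastInstrCost above, the sketch below uses simplified stand-in types (not LLVM's actual TypeConversionCostTblEntry or ConvertCostTableLookup declarations) to show the lookup pattern the function relies on: scan a per-feature (opcode, dst, src) table and fall through to the next, less specific table when no entry matches.

#include <cstddef>

struct ConvEntry { int ISD; int DstVT; int SrcVT; int Cost; };

// Return the matching entry or nullptr so the caller can try the next table.
template <std::size_t N>
static const ConvEntry *lookupConv(const ConvEntry (&Tbl)[N], int ISD,
                                   int DstVT, int SrcVT) {
  for (const ConvEntry &E : Tbl)
    if (E.ISD == ISD && E.DstVT == DstVT && E.SrcVT == SrcVT)
      return &E;
  return nullptr; // caller falls back to the next, less specific table
}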
-unsigned X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy) {
+int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
MVT MTy = LT.second;
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
- static const CostTblEntry<MVT::SimpleValueType> SSE42CostTbl[] = {
+ static const CostTblEntry SSE42CostTbl[] = {
{ ISD::SETCC, MVT::v2f64, 1 },
{ ISD::SETCC, MVT::v4f32, 1 },
{ ISD::SETCC, MVT::v2i64, 1 },
@@ -686,7 +868,7 @@ unsigned X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
{ ISD::SETCC, MVT::v16i8, 1 },
};
- static const CostTblEntry<MVT::SimpleValueType> AVX1CostTbl[] = {
+ static const CostTblEntry AVX1CostTbl[] = {
{ ISD::SETCC, MVT::v4f64, 1 },
{ ISD::SETCC, MVT::v8f32, 1 },
// AVX1 does not support 8-wide integer compare.
@@ -696,54 +878,45 @@ unsigned X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
{ ISD::SETCC, MVT::v32i8, 4 },
};
- static const CostTblEntry<MVT::SimpleValueType> AVX2CostTbl[] = {
+ static const CostTblEntry AVX2CostTbl[] = {
{ ISD::SETCC, MVT::v4i64, 1 },
{ ISD::SETCC, MVT::v8i32, 1 },
{ ISD::SETCC, MVT::v16i16, 1 },
{ ISD::SETCC, MVT::v32i8, 1 },
};
- static const CostTblEntry<MVT::SimpleValueType> AVX512CostTbl[] = {
+ static const CostTblEntry AVX512CostTbl[] = {
{ ISD::SETCC, MVT::v8i64, 1 },
{ ISD::SETCC, MVT::v16i32, 1 },
{ ISD::SETCC, MVT::v8f64, 1 },
{ ISD::SETCC, MVT::v16f32, 1 },
};
- if (ST->hasAVX512()) {
- int Idx = CostTableLookup(AVX512CostTbl, ISD, MTy);
- if (Idx != -1)
- return LT.first * AVX512CostTbl[Idx].Cost;
- }
+ if (ST->hasAVX512())
+ if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->hasAVX2()) {
- int Idx = CostTableLookup(AVX2CostTbl, ISD, MTy);
- if (Idx != -1)
- return LT.first * AVX2CostTbl[Idx].Cost;
- }
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->hasAVX()) {
- int Idx = CostTableLookup(AVX1CostTbl, ISD, MTy);
- if (Idx != -1)
- return LT.first * AVX1CostTbl[Idx].Cost;
- }
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->hasSSE42()) {
- int Idx = CostTableLookup(SSE42CostTbl, ISD, MTy);
- if (Idx != -1)
- return LT.first * SSE42CostTbl[Idx].Cost;
- }
+ if (ST->hasSSE42())
+ if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}
-unsigned X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
- unsigned Index) {
+int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
assert(Val->isVectorTy() && "This must be a vector type");
if (Index != -1U) {
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
// This type is legalized to a scalar type.
if (!LT.second.isVector())
@@ -761,10 +934,9 @@ unsigned X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
return BaseT::getVectorInstrCost(Opcode, Val, Index);
}
-unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert,
- bool Extract) {
+int X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
assert (Ty->isVectorTy() && "Can only scalarize vectors");
- unsigned Cost = 0;
+ int Cost = 0;
for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
if (Insert)
@@ -776,9 +948,8 @@ unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert,
return Cost;
}
-unsigned X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
- unsigned Alignment,
- unsigned AddressSpace) {
+int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace) {
// Handle non-power-of-two vectors such as <3 x float>
if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
unsigned NumElem = VTy->getVectorNumElements();
@@ -796,22 +967,21 @@ unsigned X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
// Assume that all other non-power-of-two numbers are scalarized.
if (!isPowerOf2_32(NumElem)) {
- unsigned Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(),
- Alignment, AddressSpace);
- unsigned SplitCost = getScalarizationOverhead(Src,
- Opcode == Instruction::Load,
- Opcode==Instruction::Store);
+ int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
+ AddressSpace);
+ int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
+ Opcode == Instruction::Store);
return NumElem * Cost + SplitCost;
}
}
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
"Invalid Opcode");
// Each load/store unit costs 1.
- unsigned Cost = LT.first * 1;
+ int Cost = LT.first * 1;
// On Sandybridge 256bit load/stores are double pumped
// (but not on Haswell).
@@ -821,9 +991,9 @@ unsigned X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
return Cost;
}
-unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
- unsigned Alignment,
- unsigned AddressSpace) {
+int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
+ unsigned Alignment,
+ unsigned AddressSpace) {
VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
if (!SrcVTy)
// To calculate scalar take the regular cost, without mask
@@ -832,34 +1002,33 @@ unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
unsigned NumElem = SrcVTy->getVectorNumElements();
VectorType *MaskTy =
VectorType::get(Type::getInt8Ty(getGlobalContext()), NumElem);
- if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy, 1)) ||
- (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy, 1)) ||
+ if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) ||
+ (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) ||
!isPowerOf2_32(NumElem)) {
// Scalarization
- unsigned MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
- unsigned ScalarCompareCost =
- getCmpSelInstrCost(Instruction::ICmp,
- Type::getInt8Ty(getGlobalContext()), NULL);
- unsigned BranchCost = getCFInstrCost(Instruction::Br);
- unsigned MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
-
- unsigned ValueSplitCost =
- getScalarizationOverhead(SrcVTy, Opcode == Instruction::Load,
- Opcode == Instruction::Store);
- unsigned MemopCost =
+ int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
+ int ScalarCompareCost = getCmpSelInstrCost(
+ Instruction::ICmp, Type::getInt8Ty(getGlobalContext()), nullptr);
+ int BranchCost = getCFInstrCost(Instruction::Br);
+ int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
+
+ int ValueSplitCost = getScalarizationOverhead(
+ SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store);
+ int MemopCost =
NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
Alignment, AddressSpace);
return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
}
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
- unsigned Cost = 0;
- if (LT.second != TLI->getValueType(DL, SrcVTy).getSimpleVT() &&
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
+ auto VT = TLI->getValueType(DL, SrcVTy);
+ int Cost = 0;
+ if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
LT.second.getVectorNumElements() == NumElem)
// Promotion requires expand/truncate for data and a shuffle for mask.
- Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, 0) +
- getShuffleCost(TTI::SK_Alternate, MaskTy, 0, 0);
+ Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, nullptr) +
+ getShuffleCost(TTI::SK_Alternate, MaskTy, 0, nullptr);
else if (LT.second.getVectorNumElements() > NumElem) {
VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
@@ -874,7 +1043,7 @@ unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
return Cost+LT.first;
}
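A hedged restatement of the scalarization formula used in the masked load/store path above, with the per-element sub-costs treated as opaque inputs; the helper name and signature are illustrative, not part of the LLVM API.

// Total cost when a masked memory op of NumElem elements must be scalarized:
// unpack the mask, compare-and-branch per lane, split/merge the value vector,
// and pay one scalar memory op per lane.
static int scalarizedMaskedOpCost(int NumElem, int MaskSplitCost,
                                  int ScalarCmpCost, int BranchCost,
                                  int ValueSplitCost, int ScalarMemOpCost) {
  int MaskCmpCost = NumElem * (BranchCost + ScalarCmpCost);
  int MemopCost = NumElem * ScalarMemOpCost;
  return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
}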
-unsigned X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+int X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
// Address computations in vectorized code with non-consecutive addresses will
// likely result in more instructions compared to scalar code where the
// computation can more often be merged into the index mode. The resulting
@@ -887,10 +1056,10 @@ unsigned X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
return BaseT::getAddressComputationCost(Ty, IsComplex);
}
-unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
- bool IsPairwise) {
+int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
+ bool IsPairwise) {
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
MVT MTy = LT.second;
@@ -900,7 +1069,7 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
// We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
// and make it as the cost.
- static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblPairWise[] = {
+ static const CostTblEntry SSE42CostTblPairWise[] = {
{ ISD::FADD, MVT::v2f64, 2 },
{ ISD::FADD, MVT::v4f32, 4 },
{ ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
@@ -908,7 +1077,7 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
{ ISD::ADD, MVT::v8i16, 5 },
};
- static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblPairWise[] = {
+ static const CostTblEntry AVX1CostTblPairWise[] = {
{ ISD::FADD, MVT::v4f32, 4 },
{ ISD::FADD, MVT::v4f64, 5 },
{ ISD::FADD, MVT::v8f32, 7 },
@@ -919,7 +1088,7 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
{ ISD::ADD, MVT::v8i32, 5 },
};
- static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblNoPairWise[] = {
+ static const CostTblEntry SSE42CostTblNoPairWise[] = {
{ ISD::FADD, MVT::v2f64, 2 },
{ ISD::FADD, MVT::v4f32, 4 },
{ ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
@@ -927,7 +1096,7 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
{ ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
};
- static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblNoPairWise[] = {
+ static const CostTblEntry AVX1CostTblNoPairWise[] = {
{ ISD::FADD, MVT::v4f32, 3 },
{ ISD::FADD, MVT::v4f64, 3 },
{ ISD::FADD, MVT::v8f32, 4 },
@@ -939,29 +1108,21 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
};
if (IsPairwise) {
- if (ST->hasAVX()) {
- int Idx = CostTableLookup(AVX1CostTblPairWise, ISD, MTy);
- if (Idx != -1)
- return LT.first * AVX1CostTblPairWise[Idx].Cost;
- }
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->hasSSE42()) {
- int Idx = CostTableLookup(SSE42CostTblPairWise, ISD, MTy);
- if (Idx != -1)
- return LT.first * SSE42CostTblPairWise[Idx].Cost;
- }
+ if (ST->hasSSE42())
+ if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
} else {
- if (ST->hasAVX()) {
- int Idx = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy);
- if (Idx != -1)
- return LT.first * AVX1CostTblNoPairWise[Idx].Cost;
- }
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->hasSSE42()) {
- int Idx = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy);
- if (Idx != -1)
- return LT.first * SSE42CostTblNoPairWise[Idx].Cost;
- }
+ if (ST->hasSSE42())
+ if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
}
return BaseT::getReductionCost(Opcode, ValTy, IsPairwise);
@@ -970,7 +1131,7 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
/// \brief Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
-unsigned X86TTIImpl::getIntImmCost(int64_t Val) {
+int X86TTIImpl::getIntImmCost(int64_t Val) {
if (Val == 0)
return TTI::TCC_Free;
@@ -980,7 +1141,7 @@ unsigned X86TTIImpl::getIntImmCost(int64_t Val) {
return 2 * TTI::TCC_Basic;
}
-unsigned X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -1004,18 +1165,18 @@ unsigned X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
// Split the constant into 64-bit chunks and calculate the cost for each
// chunk.
- unsigned Cost = 0;
+ int Cost = 0;
for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
int64_t Val = Tmp.getSExtValue();
Cost += getIntImmCost(Val);
}
  // We need at least one instruction to materialize the constant.
- return std::max(1U, Cost);
+ return std::max(1, Cost);
}
-unsigned X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+ Type *Ty) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -1038,6 +1199,26 @@ unsigned X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
case Instruction::Store:
ImmIdx = 0;
break;
+ case Instruction::ICmp:
+ // This is an imperfect hack to prevent constant hoisting of
+ // compares that might be trying to check if a 64-bit value fits in
+ // 32 bits. The backend can optimize these cases using a right shift by 32.
+ // Ideally we would check the compare predicate here. There are also other
+ // similar immediates the backend can use shifts for.
+ if (Idx == 1 && Imm.getBitWidth() == 64) {
+ uint64_t ImmVal = Imm.getZExtValue();
+ if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
+ return TTI::TCC_Free;
+ }
+ ImmIdx = 1;
+ break;
+ case Instruction::And:
+ // We support 64-bit ANDs with immediates that have 32 bits of leading zeroes
+ // by using a 32-bit operation with implicit zero extension. Detect such
+ // immediates here as the normal path expects bit 31 to be sign extended.
+ if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
+ return TTI::TCC_Free;
+ // Fallthrough
case Instruction::Add:
case Instruction::Sub:
case Instruction::Mul:
@@ -1045,10 +1226,8 @@ unsigned X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
case Instruction::SDiv:
case Instruction::URem:
case Instruction::SRem:
- case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
- case Instruction::ICmp:
ImmIdx = 1;
break;
// Always return TCC_Free for the shift value of a shift instruction.
@@ -1073,18 +1252,18 @@ unsigned X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
}
if (Idx == ImmIdx) {
- unsigned NumConstants = (BitSize + 63) / 64;
- unsigned Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
+ int NumConstants = (BitSize + 63) / 64;
+ int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
return (Cost <= NumConstants * TTI::TCC_Basic)
- ? static_cast<unsigned>(TTI::TCC_Free)
+ ? static_cast<int>(TTI::TCC_Free)
: Cost;
}
return X86TTIImpl::getIntImmCost(Imm, Ty);
}
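The two special cases added above (ICmp against 0x100000000/0xffffffff and ANDs with a 32-bit immediate) exist because the backend can avoid materializing a 64-bit constant. A minimal stand-alone illustration of the equivalences being relied on follows; this is plain C++, not LLVM code.

#include <cstdint>

// "Does V fit in 32 bits?" can be answered with a shift instead of a
// 64-bit immediate compare, so the compare immediate is reported as free.
bool fitsIn32_immediate(uint64_t V) { return V < 0x100000000ULL; }
bool fitsIn32_shift(uint64_t V)     { return (V >> 32) == 0; }

// A 64-bit AND whose immediate has its upper 32 bits clear can be done as a
// 32-bit AND with implicit zero extension, so that immediate is free too.
uint64_t maskLow32(uint64_t V) { return V & 0xffffffffULL; }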
-unsigned X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+ Type *Ty) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -1118,23 +1297,181 @@ unsigned X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
return X86TTIImpl::getIntImmCost(Imm, Ty);
}
-bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, int Consecutive) {
- int DataWidth = DataTy->getPrimitiveSizeInBits();
+// Return an average cost of a Gather / Scatter instruction; this may be improved later.
+int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
+ unsigned Alignment, unsigned AddressSpace) {
+
+ assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
+ unsigned VF = SrcVTy->getVectorNumElements();
+
+ // Try to reduce the index size from 64 bits (the default for GEP) to 32 bits.
+ // This is essential for VF 16: if the index can't be reduced to 32 bits, the
+ // operation will use 16 x 64 indices, which do not fit in a zmm and need to
+ // be split. Also check that the base pointer is the same for all lanes,
+ // and that there's at most one variable index.
+ auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
+ unsigned IndexSize = DL.getPointerSizeInBits();
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ if (IndexSize < 64 || !GEP)
+ return IndexSize;
+
+ unsigned NumOfVarIndices = 0;
+ Value *Ptrs = GEP->getPointerOperand();
+ if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
+ return IndexSize;
+ for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
+ if (isa<Constant>(GEP->getOperand(i)))
+ continue;
+ Type *IndxTy = GEP->getOperand(i)->getType();
+ if (IndxTy->isVectorTy())
+ IndxTy = IndxTy->getVectorElementType();
+ if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
+ !isa<SExtInst>(GEP->getOperand(i))) ||
+ ++NumOfVarIndices > 1)
+ return IndexSize; // 64
+ }
+ return (unsigned)32;
+ };
+
+
+ // Try to reduce IndexSize to 32 bits for 16-element vectors.
+ // By default the IndexSize is equal to the pointer size.
+ unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) :
+ DL.getPointerSizeInBits();
+
+ Type *IndexVTy = VectorType::get(IntegerType::get(getGlobalContext(),
+ IndexSize), VF);
+ std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
+ std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
+ int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
+ if (SplitFactor > 1) {
+ // Handle splitting of vector of pointers
+ Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
+ return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
+ AddressSpace);
+ }
+
+ // The gather / scatter cost is given by Intel architects. It is a rough
+ // number since we are looking at one instruction at a time.
+ const int GSOverhead = 2;
+ return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+ Alignment, AddressSpace);
+}
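The index-width reduction above matters because 16 x i64 indices do not fit in a single zmm register. The sketch below is a simplified restatement of that check (the helper name is made up, and it omits the sign-extension exception the real code allows for 64-bit operands): more than one variable index, or any variable index wider than 32 bits, keeps the gather on 64-bit index lanes.

#include <vector>

// Decide which index width a gather could use, given the bit widths of the
// GEP's variable (non-constant) indices.
static unsigned gatherIndexWidth(const std::vector<unsigned> &VarIndexWidths) {
  if (VarIndexWidths.size() > 1)
    return 64;            // more than one variable index: keep 64-bit lanes
  for (unsigned W : VarIndexWidths)
    if (W > 32)
      return 64;          // a genuinely 64-bit index forces 64-bit lanes
  return 32;
}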
+
+/// Return the cost of full scalarization of gather / scatter operation.
+///
+/// Opcode - Load or Store instruction.
+/// SrcVTy - The type of the data vector that should be gathered or scattered.
+/// VariableMask - The mask is non-constant at compile time.
+/// Alignment - Alignment for one element.
+/// AddressSpace - pointer[s] address space.
+///
+int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
+ bool VariableMask, unsigned Alignment,
+ unsigned AddressSpace) {
+ unsigned VF = SrcVTy->getVectorNumElements();
+
+ int MaskUnpackCost = 0;
+ if (VariableMask) {
+ VectorType *MaskTy =
+ VectorType::get(Type::getInt1Ty(getGlobalContext()), VF);
+ MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
+ int ScalarCompareCost =
+ getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(getGlobalContext()),
+ nullptr);
+ int BranchCost = getCFInstrCost(Instruction::Br);
+ MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
+ }
- // Todo: AVX512 allows gather/scatter, works with strided and random as well
- if ((DataWidth < 32) || (Consecutive == 0))
+ // The cost of the scalar loads/stores.
+ int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+ Alignment, AddressSpace);
+
+ int InsertExtractCost = 0;
+ if (Opcode == Instruction::Load)
+ for (unsigned i = 0; i < VF; ++i)
+ // Add the cost of inserting each scalar load into the vector
+ InsertExtractCost +=
+ getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
+ else
+ for (unsigned i = 0; i < VF; ++i)
+ // Add the cost of extracting each element out of the data vector
+ InsertExtractCost +=
+ getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
+
+ return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
+}
+
+/// Calculate the cost of Gather / Scatter operation
+int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
+ Value *Ptr, bool VariableMask,
+ unsigned Alignment) {
+ assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
+ unsigned VF = SrcVTy->getVectorNumElements();
+ PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+ if (!PtrTy && Ptr->getType()->isVectorTy())
+ PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
+ assert(PtrTy && "Unexpected type for Ptr argument");
+ unsigned AddressSpace = PtrTy->getAddressSpace();
+
+ bool Scalarize = false;
+ if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
+ (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
+ Scalarize = true;
+ // Gather / Scatter for vector 2 is not profitable on KNL / SKX.
+ // A vector-4 gather/scatter instruction does not exist on KNL.
+ // We can extend it to 8 elements, but zeroing the upper bits of
+ // the mask vector will add more instructions. Right now we give the scalar
+ // cost of vector-4 for KNL. TODO: check whether the gather/scatter
+ // instruction is better in the VariableMask case.
+ if (VF == 2 || (VF == 4 && !ST->hasVLX()))
+ Scalarize = true;
+
+ if (Scalarize)
+ return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, AddressSpace);
+
+ return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
+}
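A compressed, illustrative summary of the scalarize-vs-vectorize decision made in getGatherScatterOpCost above; the helper name is invented, and the real logic also consults the legality hooks shown further down.

// Fall back to the scalar cost path for unsupported element types, for
// 2-element vectors, and for 4-element vectors on targets without AVX-512VL.
static bool shouldScalarizeGatherScatter(bool LegalForType, unsigned VF,
                                         bool HasVLX) {
  if (!LegalForType)
    return true;
  return VF == 2 || (VF == 4 && !HasVLX);
}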
+
+bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
+ Type *ScalarTy = DataTy->getScalarType();
+ int DataWidth = isa<PointerType>(ScalarTy) ?
+ DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
+
+ return (DataWidth >= 32 && ST->hasAVX2());
+}
+
+bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
+ return isLegalMaskedLoad(DataType);
+}
+
+bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
+ // This function is currently called in two cases: from the Loop Vectorizer
+ // and from the Scalarizer.
+ // When the Loop Vectorizer asks about the legality of the feature,
+ // the vectorization factor is not calculated yet. The Loop Vectorizer
+ // sends a scalar type and the decision is based on the width of the
+ // scalar element.
+ // Later on, the cost model will estimate usage of this intrinsic based on
+ // the vector type.
+ // The Scalarizer asks again about legality. It sends a vector type.
+ // In this case we can reject non-power-of-2 vectors.
+ if (isa<VectorType>(DataTy) && !isPowerOf2_32(DataTy->getVectorNumElements()))
return false;
- if (ST->hasAVX512() || ST->hasAVX2())
- return true;
- return false;
+ Type *ScalarTy = DataTy->getScalarType();
+ int DataWidth = isa<PointerType>(ScalarTy) ?
+ DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
+
+ // AVX-512 allows gather and scatter
+ return DataWidth >= 32 && ST->hasAVX512();
}
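Putting the two legality predicates above together, here is a hedged sketch of the width rules they encode; it is illustrative only, and the element width in the real code comes from the scalar type, or from the pointer size for vectors of pointers.

// Masked load/store wants >=32-bit elements and AVX2; gather/scatter wants
// >=32-bit elements and AVX-512, and rejects non-power-of-2 element counts.
static bool legalMaskedLoadStore(unsigned ElemBits, bool HasAVX2) {
  return ElemBits >= 32 && HasAVX2;
}
static bool legalGatherScatter(unsigned ElemBits, unsigned NumElems,
                               bool HasAVX512) {
  if (NumElems && (NumElems & (NumElems - 1)))
    return false; // non-power-of-2 vector
  return ElemBits >= 32 && HasAVX512;
}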
-bool X86TTIImpl::isLegalMaskedStore(Type *DataType, int Consecutive) {
- return isLegalMaskedLoad(DataType, Consecutive);
+bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
+ return isLegalMaskedGather(DataType);
}
-bool X86TTIImpl::hasCompatibleFunctionAttributes(const Function *Caller,
- const Function *Callee) const {
+bool X86TTIImpl::areInlineCompatible(const Function *Caller,
+ const Function *Callee) const {
const TargetMachine &TM = getTLI()->getTargetMachine();
// Work this as a subsetting of subtarget features.
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index da3f36c2e27e..adb745e912d1 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -33,13 +33,13 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
const X86Subtarget *ST;
const X86TargetLowering *TLI;
- unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
+ int getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
const X86Subtarget *getST() const { return ST; }
const X86TargetLowering *getTLI() const { return TLI; }
public:
- explicit X86TTIImpl(const X86TargetMachine *TM, Function &F)
+ explicit X86TTIImpl(const X86TargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
@@ -62,38 +62,44 @@ public:
unsigned getNumberOfRegisters(bool Vector);
unsigned getRegisterBitWidth(bool Vector);
unsigned getMaxInterleaveFactor(unsigned VF);
- unsigned getArithmeticInstrCost(
+ int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
- unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
- Type *SubTp);
- unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
- unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
- unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
- unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace);
- unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace);
-
- unsigned getAddressComputationCost(Type *PtrTy, bool IsComplex);
-
- unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);
-
- unsigned getIntImmCost(int64_t);
-
- unsigned getIntImmCost(const APInt &Imm, Type *Ty);
-
- unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty);
- unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
- Type *Ty);
- bool isLegalMaskedLoad(Type *DataType, int Consecutive);
- bool isLegalMaskedStore(Type *DataType, int Consecutive);
- bool hasCompatibleFunctionAttributes(const Function *Caller,
- const Function *Callee) const;
+ int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+ int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+ int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace);
+ int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace);
+ int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
+ bool VariableMask, unsigned Alignment);
+ int getAddressComputationCost(Type *PtrTy, bool IsComplex);
+
+ int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);
+
+ int getIntImmCost(int64_t);
+
+ int getIntImmCost(const APInt &Imm, Type *Ty);
+
+ int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+ int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+ Type *Ty);
+ bool isLegalMaskedLoad(Type *DataType);
+ bool isLegalMaskedStore(Type *DataType);
+ bool isLegalMaskedGather(Type *DataType);
+ bool isLegalMaskedScatter(Type *DataType);
+ bool areInlineCompatible(const Function *Caller,
+ const Function *Callee) const;
+private:
+ int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
+ unsigned Alignment, unsigned AddressSpace);
+ int getGSVectorCost(unsigned Opcode, Type *DataTy, Value *Ptr,
+ unsigned Alignment, unsigned AddressSpace);
/// @}
};
diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp
index 9190d0be9e4d..dce94a9e9ef7 100644
--- a/lib/Target/X86/X86WinEHState.cpp
+++ b/lib/Target/X86/X86WinEHState.cpp
@@ -15,7 +15,8 @@
//===----------------------------------------------------------------------===//
#include "X86.h"
-#include "llvm/Analysis/LibCallSemantics.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
@@ -38,12 +39,16 @@ using namespace llvm::PatternMatch;
#define DEBUG_TYPE "winehstate"
+namespace llvm { void initializeWinEHStatePassPass(PassRegistry &); }
+
namespace {
class WinEHStatePass : public FunctionPass {
public:
static char ID; // Pass identification, replacement for typeid.
- WinEHStatePass() : FunctionPass(ID) {}
+ WinEHStatePass() : FunctionPass(ID) {
+ initializeWinEHStatePassPass(*PassRegistry::getPassRegistry());
+ }
bool runOnFunction(Function &Fn) override;
@@ -62,18 +67,13 @@ private:
void linkExceptionRegistration(IRBuilder<> &Builder, Function *Handler);
void unlinkExceptionRegistration(IRBuilder<> &Builder);
- void addCXXStateStores(Function &F, MachineModuleInfo &MMI);
- void addSEHStateStores(Function &F, MachineModuleInfo &MMI);
- void addCXXStateStoresToFunclet(Value *ParentRegNode, WinEHFuncInfo &FuncInfo,
- Function &F, int BaseState);
+ void addStateStores(Function &F, WinEHFuncInfo &FuncInfo);
void insertStateNumberStore(Value *ParentRegNode, Instruction *IP, int State);
Value *emitEHLSDA(IRBuilder<> &Builder, Function *F);
Function *generateLSDAInEAXThunk(Function *ParentFunc);
- int escapeRegNode(Function &F);
-
// Module-level type getters.
Type *getEHLinkRegistrationType();
Type *getSEHRegistrationType();
@@ -111,6 +111,9 @@ FunctionPass *llvm::createX86WinEHStatePass() { return new WinEHStatePass(); }
char WinEHStatePass::ID = 0;
+INITIALIZE_PASS(WinEHStatePass, "x86-winehstate",
+ "Insert stores for EH state numbers", false, false)
+
bool WinEHStatePass::doInitialization(Module &M) {
TheModule = &M;
FrameEscape = Intrinsic::getDeclaration(TheModule, Intrinsic::localescape);
@@ -138,14 +141,7 @@ void WinEHStatePass::getAnalysisUsage(AnalysisUsage &AU) const {
}
bool WinEHStatePass::runOnFunction(Function &F) {
- // If this is an outlined handler, don't do anything. We'll do state insertion
- // for it in the parent.
- StringRef WinEHParentName =
- F.getFnAttribute("wineh-parent").getValueAsString();
- if (WinEHParentName != F.getName() && !WinEHParentName.empty())
- return false;
-
- // Check the personality. Do nothing if this is not an MSVC personality.
+ // Check the personality. Do nothing if this personality doesn't use funclets.
if (!F.hasPersonalityFn())
return false;
PersonalityFn =
@@ -153,7 +149,19 @@ bool WinEHStatePass::runOnFunction(Function &F) {
if (!PersonalityFn)
return false;
Personality = classifyEHPersonality(PersonalityFn);
- if (!isMSVCEHPersonality(Personality))
+ if (!isFuncletEHPersonality(Personality))
+ return false;
+
+ // Skip this function if there are no EH pads and we aren't using IR-level
+ // outlining.
+ bool HasPads = false;
+ for (BasicBlock &BB : F) {
+ if (BB.isEHPad()) {
+ HasPads = true;
+ break;
+ }
+ }
+ if (!HasPads)
return false;
// Disable frame pointer elimination in this function.
@@ -163,14 +171,13 @@ bool WinEHStatePass::runOnFunction(Function &F) {
emitExceptionRegistrationRecord(&F);
- auto *MMIPtr = getAnalysisIfAvailable<MachineModuleInfo>();
- assert(MMIPtr && "MachineModuleInfo should always be available");
- MachineModuleInfo &MMI = *MMIPtr;
- switch (Personality) {
- default: llvm_unreachable("unexpected personality function");
- case EHPersonality::MSVC_CXX: addCXXStateStores(F, MMI); break;
- case EHPersonality::MSVC_X86SEH: addSEHStateStores(F, MMI); break;
- }
+ // The state numbers calculated here in IR must agree with what we calculate
+ // later on for the MachineFunction. In particular, if an IR pass deletes an
+ // unreachable EH pad after this point before machine CFG construction, we
+ // will be in trouble. If this assumption is ever broken, we should turn the
+ // numbers into an immutable analysis pass.
+ WinEHFuncInfo FuncInfo;
+ addStateStores(F, FuncInfo);
// Reset per-function state.
PersonalityFn = nullptr;
@@ -261,7 +268,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
// TryLevel = -1
StateFieldIndex = 2;
- insertStateNumberStore(RegNode, Builder.GetInsertPoint(), -1);
+ insertStateNumberStore(RegNode, &*Builder.GetInsertPoint(), -1);
// Handler = __ehhandler$F
Function *Trampoline = generateLSDAInEAXThunk(F);
Link = Builder.CreateStructGEP(RegNodeTy, RegNode, 1);
@@ -278,7 +285,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
// TryLevel = -2 / -1
StateFieldIndex = 4;
- insertStateNumberStore(RegNode, Builder.GetInsertPoint(),
+ insertStateNumberStore(RegNode, &*Builder.GetInsertPoint(),
UseStackGuard ? -2 : -1);
// ScopeTable = llvm.x86.seh.lsda(F)
Value *FI8 = Builder.CreateBitCast(F, Int8PtrType);
@@ -347,7 +354,7 @@ Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) {
Value *CastPersonality =
Builder.CreateBitCast(PersonalityFn, TargetFuncTy->getPointerTo());
auto AI = Trampoline->arg_begin();
- Value *Args[5] = {LSDA, AI++, AI++, AI++, AI++};
+ Value *Args[5] = {LSDA, &*AI++, &*AI++, &*AI++, &*AI++};
CallInst *Call = Builder.CreateCall(CastPersonality, Args);
// Can't use musttail due to prototype mismatch, but we can use tail.
Call->setTailCall(true);
@@ -391,160 +398,53 @@ void WinEHStatePass::unlinkExceptionRegistration(IRBuilder<> &Builder) {
Builder.CreateStore(Next, FSZero);
}
-void WinEHStatePass::addCXXStateStores(Function &F, MachineModuleInfo &MMI) {
- WinEHFuncInfo &FuncInfo = MMI.getWinEHFuncInfo(&F);
- calculateWinCXXEHStateNumbers(&F, FuncInfo);
-
- // The base state for the parent is -1.
- addCXXStateStoresToFunclet(RegNode, FuncInfo, F, -1);
-
- // Set up RegNodeEscapeIndex
- int RegNodeEscapeIndex = escapeRegNode(F);
- FuncInfo.EHRegNodeEscapeIndex = RegNodeEscapeIndex;
-
- // Only insert stores in catch handlers.
- Constant *FI8 =
- ConstantExpr::getBitCast(&F, Type::getInt8PtrTy(TheModule->getContext()));
- for (auto P : FuncInfo.HandlerBaseState) {
- Function *Handler = const_cast<Function *>(P.first);
- int BaseState = P.second;
- IRBuilder<> Builder(&Handler->getEntryBlock(),
- Handler->getEntryBlock().begin());
- // FIXME: Find and reuse such a call if present.
- Value *ParentFP = Builder.CreateCall(FrameAddress, {Builder.getInt32(1)});
- Value *RecoveredRegNode = Builder.CreateCall(
- FrameRecover, {FI8, ParentFP, Builder.getInt32(RegNodeEscapeIndex)});
- RecoveredRegNode =
- Builder.CreateBitCast(RecoveredRegNode, RegNodeTy->getPointerTo(0));
- addCXXStateStoresToFunclet(RecoveredRegNode, FuncInfo, *Handler, BaseState);
- }
-}
-
-/// Escape RegNode so that we can access it from child handlers. Find the call
-/// to localescape, if any, in the entry block and append RegNode to the list
-/// of arguments.
-int WinEHStatePass::escapeRegNode(Function &F) {
- // Find the call to localescape and extract its arguments.
- IntrinsicInst *EscapeCall = nullptr;
- for (Instruction &I : F.getEntryBlock()) {
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
- if (II && II->getIntrinsicID() == Intrinsic::localescape) {
- EscapeCall = II;
- break;
- }
- }
- SmallVector<Value *, 8> Args;
- if (EscapeCall) {
- auto Ops = EscapeCall->arg_operands();
- Args.append(Ops.begin(), Ops.end());
- }
- Args.push_back(RegNode);
-
- // Replace the call (if it exists) with new one. Otherwise, insert at the end
- // of the entry block.
- Instruction *InsertPt = EscapeCall;
- if (!EscapeCall)
- InsertPt = F.getEntryBlock().getTerminator();
- IRBuilder<> Builder(&F.getEntryBlock(), InsertPt);
- Builder.CreateCall(FrameEscape, Args);
- if (EscapeCall)
- EscapeCall->eraseFromParent();
- return Args.size() - 1;
-}
+void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
+ // Mark the registration node. The backend needs to know which alloca it is so
+ // that it can recover the original frame pointer.
+ IRBuilder<> Builder(RegNode->getParent(), std::next(RegNode->getIterator()));
+ Value *RegNodeI8 = Builder.CreateBitCast(RegNode, Builder.getInt8PtrTy());
+ Builder.CreateCall(
+ Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_ehregnode),
+ {RegNodeI8});
+
+ // Calculate state numbers.
+ if (isAsynchronousEHPersonality(Personality))
+ calculateSEHStateNumbers(&F, FuncInfo);
+ else
+ calculateWinCXXEHStateNumbers(&F, FuncInfo);
-void WinEHStatePass::addCXXStateStoresToFunclet(Value *ParentRegNode,
- WinEHFuncInfo &FuncInfo,
- Function &F, int BaseState) {
// Iterate all the instructions and emit state number stores.
+ DenseMap<BasicBlock *, ColorVector> BlockColors = colorEHFunclets(F);
for (BasicBlock &BB : F) {
+ // Figure out what state we should assign calls in this block.
+ int BaseState = -1;
+ auto &BBColors = BlockColors[&BB];
+
+ assert(BBColors.size() == 1 &&
+ "multi-color BB not removed by preparation");
+ BasicBlock *FuncletEntryBB = BBColors.front();
+ if (auto *FuncletPad =
+ dyn_cast<FuncletPadInst>(FuncletEntryBB->getFirstNonPHI())) {
+ auto BaseStateI = FuncInfo.FuncletBaseStateMap.find(FuncletPad);
+ if (BaseStateI != FuncInfo.FuncletBaseStateMap.end())
+ BaseState = BaseStateI->second;
+ }
+
for (Instruction &I : BB) {
if (auto *CI = dyn_cast<CallInst>(&I)) {
// Possibly throwing call instructions have no actions to take after
// an unwind. Ensure they are in the -1 state.
if (CI->doesNotThrow())
continue;
- insertStateNumberStore(ParentRegNode, CI, BaseState);
+ insertStateNumberStore(RegNode, CI, BaseState);
} else if (auto *II = dyn_cast<InvokeInst>(&I)) {
// Look up the state number of the landingpad this unwinds to.
- LandingPadInst *LPI = II->getUnwindDest()->getLandingPadInst();
- // FIXME: Why does this assertion fail?
- //assert(FuncInfo.LandingPadStateMap.count(LPI) && "LP has no state!");
- int State = FuncInfo.LandingPadStateMap[LPI];
- insertStateNumberStore(ParentRegNode, II, State);
- }
- }
- }
-}
-
-/// Assign every distinct landingpad a unique state number for SEH. Unlike C++
-/// EH, we can use this very simple algorithm while C++ EH cannot because catch
-/// handlers aren't outlined and the runtime doesn't have to figure out which
-/// catch handler frame to unwind to.
-/// FIXME: __finally blocks are outlined, so this approach may break down there.
-void WinEHStatePass::addSEHStateStores(Function &F, MachineModuleInfo &MMI) {
- WinEHFuncInfo &FuncInfo = MMI.getWinEHFuncInfo(&F);
-
- // Remember and return the index that we used. We save it in WinEHFuncInfo so
- // that we can lower llvm.x86.seh.recoverfp later in filter functions without
- // too much trouble.
- int RegNodeEscapeIndex = escapeRegNode(F);
- FuncInfo.EHRegNodeEscapeIndex = RegNodeEscapeIndex;
-
- // Iterate all the instructions and emit state number stores.
- int CurState = 0;
- SmallPtrSet<BasicBlock *, 4> ExceptBlocks;
- for (BasicBlock &BB : F) {
- for (auto I = BB.begin(), E = BB.end(); I != E; ++I) {
- if (auto *CI = dyn_cast<CallInst>(I)) {
- auto *Intrin = dyn_cast<IntrinsicInst>(CI);
- if (Intrin) {
- // Calls that "don't throw" are considered to be able to throw asynch
- // exceptions, but intrinsics cannot.
- continue;
- }
- insertStateNumberStore(RegNode, CI, -1);
- } else if (auto *II = dyn_cast<InvokeInst>(I)) {
- // Look up the state number of the landingpad this unwinds to.
- LandingPadInst *LPI = II->getUnwindDest()->getLandingPadInst();
- auto InsertionPair =
- FuncInfo.LandingPadStateMap.insert(std::make_pair(LPI, CurState));
- auto Iter = InsertionPair.first;
- int &State = Iter->second;
- bool Inserted = InsertionPair.second;
- if (Inserted) {
- // Each action consumes a state number.
- auto *EHActions = cast<IntrinsicInst>(LPI->getNextNode());
- SmallVector<std::unique_ptr<ActionHandler>, 4> ActionList;
- parseEHActions(EHActions, ActionList);
- assert(!ActionList.empty());
- CurState += ActionList.size();
- State += ActionList.size() - 1;
-
- // Remember all the __except block targets.
- for (auto &Handler : ActionList) {
- if (auto *CH = dyn_cast<CatchHandler>(Handler.get())) {
- auto *BA = cast<BlockAddress>(CH->getHandlerBlockOrFunc());
-#ifndef NDEBUG
- for (BasicBlock *Pred : predecessors(BA->getBasicBlock()))
- assert(Pred->isLandingPad() &&
- "WinEHPrepare failed to split block");
-#endif
- ExceptBlocks.insert(BA->getBasicBlock());
- }
- }
- }
+ assert(FuncInfo.InvokeStateMap.count(II) && "invoke has no state!");
+ int State = FuncInfo.InvokeStateMap[II];
insertStateNumberStore(RegNode, II, State);
}
}
}
-
- // Insert llvm.x86.seh.restoreframe() into each __except block.
- Function *RestoreFrame =
- Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_restoreframe);
- for (BasicBlock *ExceptBB : ExceptBlocks) {
- IRBuilder<> Builder(ExceptBB->begin());
- Builder.CreateCall(RestoreFrame, {});
- }
}
void WinEHStatePass::insertStateNumberStore(Value *ParentRegNode,
diff --git a/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
index 2e44ac949b2c..aaf267af5311 100644
--- a/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
+++ b/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
@@ -224,7 +224,7 @@ static DecodeStatus DecodeBitpOperand(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder) {
if (Val > 11)
return MCDisassembler::Fail;
- static unsigned Values[] = {
+ static const unsigned Values[] = {
32 /*bpw*/, 1, 2, 3, 4, 5, 6, 7, 8, 16, 24, 32
};
Inst.addOperand(MCOperand::createImm(Values[Val]));
diff --git a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
index 6fd2dec1d13e..dc513f7b225b 100644
--- a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
+++ b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
@@ -19,8 +19,6 @@
namespace llvm {
-class TargetMachine;
-
class XCoreInstPrinter : public MCInstPrinter {
public:
XCoreInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp
index 702056d781d0..b00cdd5040eb 100644
--- a/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -115,14 +115,14 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
EmitSpecialLLVMGlobal(GV))
return;
- const DataLayout *TD = TM.getDataLayout();
+ const DataLayout &DL = getDataLayout();
OutStreamer->SwitchSection(
getObjFileLowering().SectionForGlobal(GV, *Mang, TM));
MCSymbol *GVSym = getSymbol(GV);
const Constant *C = GV->getInitializer();
- unsigned Align = (unsigned)TD->getPreferredTypeAlignmentShift(C->getType());
-
+ unsigned Align = (unsigned)DL.getPreferredTypeAlignmentShift(C->getType());
+
// Mark the start of the global
getTargetStreamer().emitCCTopData(GVSym->getName());
@@ -154,15 +154,15 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
if (GV->isThreadLocal()) {
report_fatal_error("TLS is not supported by this target!");
}
- unsigned Size = TD->getTypeAllocSize(C->getType());
+ unsigned Size = DL.getTypeAllocSize(C->getType());
if (MAI->hasDotTypeDotSizeDirective()) {
OutStreamer->EmitSymbolAttribute(GVSym, MCSA_ELF_TypeObject);
OutStreamer->emitELFSize(cast<MCSymbolELF>(GVSym),
MCConstantExpr::create(Size, OutContext));
}
OutStreamer->EmitLabel(GVSym);
-
- EmitGlobalConstant(C);
+
+ EmitGlobalConstant(DL, C);
// The ABI requires that unsigned scalar types smaller than 32 bits
// are padded to 32 bits.
if (Size < 4)
@@ -208,7 +208,7 @@ printInlineJT(const MachineInstr *MI, int opNum, raw_ostream &O,
void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
raw_ostream &O) {
- const DataLayout *DL = TM.getDataLayout();
+ const DataLayout &DL = getDataLayout();
const MachineOperand &MO = MI->getOperand(opNum);
switch (MO.getType()) {
case MachineOperand::MO_Register:
@@ -224,8 +224,8 @@ void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
getSymbol(MO.getGlobal())->print(O, MAI);
break;
case MachineOperand::MO_ConstantPoolIndex:
- O << DL->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
- << '_' << MO.getIndex();
+ O << DL.getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_'
+ << MO.getIndex();
break;
case MachineOperand::MO_BlockAddress:
GetBlockAddressSymbol(MO.getBlockAddress())->print(O, MAI);
diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp
index 76c3d8130e75..ae493de083b8 100644
--- a/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -160,27 +160,26 @@ static void GetSpillList(SmallVectorImpl<StackSlotInfo> &SpillList,
/// As offsets are negative, the largest offsets will be first.
static void GetEHSpillList(SmallVectorImpl<StackSlotInfo> &SpillList,
MachineFrameInfo *MFI, XCoreFunctionInfo *XFI,
+ const Constant *PersonalityFn,
const TargetLowering *TL) {
assert(XFI->hasEHSpillSlot() && "There are no EH register spill slots");
- const int* EHSlot = XFI->getEHSpillSlot();
- SpillList.push_back(StackSlotInfo(EHSlot[0],
- MFI->getObjectOffset(EHSlot[0]),
- TL->getExceptionPointerRegister()));
- SpillList.push_back(StackSlotInfo(EHSlot[0],
- MFI->getObjectOffset(EHSlot[1]),
- TL->getExceptionSelectorRegister()));
+ const int *EHSlot = XFI->getEHSpillSlot();
+ SpillList.push_back(
+ StackSlotInfo(EHSlot[0], MFI->getObjectOffset(EHSlot[0]),
+ TL->getExceptionPointerRegister(PersonalityFn)));
+ SpillList.push_back(
+ StackSlotInfo(EHSlot[0], MFI->getObjectOffset(EHSlot[1]),
+ TL->getExceptionSelectorRegister(PersonalityFn)));
std::sort(SpillList.begin(), SpillList.end(), CompareSSIOffset);
}
-
static MachineMemOperand *
getFrameIndexMMO(MachineBasicBlock &MBB, int FrameIndex, unsigned flags) {
MachineFunction *MF = MBB.getParent();
const MachineFrameInfo &MFI = *MF->getFrameInfo();
- MachineMemOperand *MMO =
- MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIndex),
- flags, MFI.getObjectSize(FrameIndex),
- MFI.getObjectAlignment(FrameIndex));
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FrameIndex), flags,
+ MFI.getObjectSize(FrameIndex), MFI.getObjectAlignment(FrameIndex));
return MMO;
}
@@ -323,8 +322,11 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF,
if (XFI->hasEHSpillSlot()) {
// The unwinder requires stack slot & CFI offsets for the exception info.
// We do not save/spill these registers.
- SmallVector<StackSlotInfo,2> SpillList;
- GetEHSpillList(SpillList, MFI, XFI,
+ const Function *Fn = MF.getFunction();
+ const Constant *PersonalityFn =
+ Fn->hasPersonalityFn() ? Fn->getPersonalityFn() : nullptr;
+ SmallVector<StackSlotInfo, 2> SpillList;
+ GetEHSpillList(SpillList, MFI, XFI, PersonalityFn,
MF.getSubtarget().getTargetLowering());
assert(SpillList.size()==2 && "Unexpected SpillList size");
EmitCfiOffset(MBB, MBBI, dl, TII, MMI,
@@ -355,8 +357,12 @@ void XCoreFrameLowering::emitEpilogue(MachineFunction &MF,
if (RetOpcode == XCore::EH_RETURN) {
// 'Restore' the exception info the unwinder has placed into the stack
// slots.
- SmallVector<StackSlotInfo,2> SpillList;
- GetEHSpillList(SpillList, MFI, XFI, MF.getSubtarget().getTargetLowering());
+ const Function *Fn = MF.getFunction();
+ const Constant *PersonalityFn =
+ Fn->hasPersonalityFn() ? Fn->getPersonalityFn() : nullptr;
+ SmallVector<StackSlotInfo, 2> SpillList;
+ GetEHSpillList(SpillList, MFI, XFI, PersonalityFn,
+ MF.getSubtarget().getTargetLowering());
RestoreSpillList(MBB, MBBI, dl, TII, RemainingAdj, SpillList);
// Return to the landing pad.
diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
index 9d4a966dfba4..9f61c84cd445 100644
--- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp
+++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -151,8 +151,9 @@ SDNode *XCoreDAGToDAGISel::Select(SDNode *N) {
MVT::Other, CPIdx,
CurDAG->getEntryNode());
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = MF->getMachineMemOperand(
- MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, 4, 4);
+ MemOp[0] =
+ MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
+ MachineMemOperand::MOLoad, 4, 4);
cast<MachineSDNode>(node)->setMemRefs(MemOp, MemOp + 1);
return node;
}
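
Like getFixedStack in the previous file, MachinePointerInfo::getConstantPool now takes the owning MachineFunction. A minimal sketch of the new call; buildCPLoadMMO is a hypothetical helper, not part of the patch:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

// Hypothetical helper: a 4-byte, 4-aligned constant-pool load operand,
// mirroring the hunk above.
static llvm::MachineMemOperand *buildCPLoadMMO(llvm::MachineFunction &MF) {
  return MF.getMachineMemOperand(llvm::MachinePointerInfo::getConstantPool(MF),
                                 llvm::MachineMemOperand::MOLoad, 4, 4);
}
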
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index d62e7428299d..105b2cfb1be6 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -79,9 +79,6 @@ XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM,
// Compute derived properties from the register classes
computeRegisterProperties(Subtarget.getRegisterInfo());
- // Division is expensive
- setIntDivIsCheap(false);
-
setStackPointerRegisterToSaveRestore(XCore::SP);
setSchedulingPreference(Sched::Source);
@@ -154,8 +151,6 @@ XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM,
// Exception handling
setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
- setExceptionPointerRegister(XCore::R0);
- setExceptionSelectorRegister(XCore::R1);
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
// Atomic operations
@@ -839,7 +834,7 @@ LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
return DAG.getLoad(
getPointerTy(DAG.getDataLayout()), SDLoc(Op), DAG.getEntryNode(), FIN,
- MachinePointerInfo::getFixedStack(FI), false, false, false, 0);
+ MachinePointerInfo::getFixedStack(MF, FI), false, false, false, 0);
}
SDValue XCoreTargetLowering::
@@ -1367,8 +1362,8 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
//from this parameter
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
ArgIn = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
- MachinePointerInfo::getFixedStack(FI),
- false, false, false, 0);
+ MachinePointerInfo::getFixedStack(MF, FI), false,
+ false, false, 0);
}
const ArgDataPair ADP = { ArgIn, Ins[i].Flags };
ArgData.push_back(ADP);
@@ -1517,9 +1512,10 @@ XCoreTargetLowering::LowerReturn(SDValue Chain,
// Create a SelectionDAG node corresponding to a store
// to this memory location.
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
- MemOpChains.push_back(DAG.getStore(Chain, dl, OutVals[i], FIN,
- MachinePointerInfo::getFixedStack(FI), false, false,
- 0));
+ MemOpChains.push_back(DAG.getStore(
+ Chain, dl, OutVals[i], FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false,
+ false, 0));
}
// Transform all store nodes into one single node because
@@ -1567,8 +1563,7 @@ XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// to set, the condition code register to branch on, the true/false values to
// select between, and a branch opcode to use.
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator It = BB;
- ++It;
+ MachineFunction::iterator It = ++BB->getIterator();
// thisMBB:
// ...
@@ -1828,9 +1823,8 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
SDValue Chain = ST->getChain();
unsigned StoreBits = ST->getMemoryVT().getStoreSizeInBits();
- if (StoreBits % 8) {
- break;
- }
+ assert((StoreBits % 8) == 0 &&
+ "Store size in bits must be a multiple of 8");
unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(
ST->getMemoryVT().getTypeForEVT(*DCI.DAG.getContext()));
unsigned Alignment = ST->getAlignment();
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index ddd675c5164d..b6f09ff418b5 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -125,6 +125,20 @@ namespace llvm {
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
Type *Ty, unsigned AS) const override;
+ /// If a physical register, this returns the register that receives the
+ /// exception address on entry to an EH pad.
+ unsigned
+ getExceptionPointerRegister(const Constant *PersonalityFn) const override {
+ return XCore::R0;
+ }
+
+ /// If a physical register, this returns the register that receives the
+ /// exception typeid on entry to a landing pad.
+ unsigned
+ getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
+ return XCore::R1;
+ }
+
private:
const TargetMachine &TM;
const XCoreSubtarget &Subtarget;
diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp
index ee30344dcc25..e4129aee9479 100644
--- a/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -368,11 +368,10 @@ void XCoreInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
DL = I->getDebugLoc();
MachineFunction *MF = MBB.getParent();
const MachineFrameInfo &MFI = *MF->getFrameInfo();
- MachineMemOperand *MMO =
- MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIndex),
- MachineMemOperand::MOStore,
- MFI.getObjectSize(FrameIndex),
- MFI.getObjectAlignment(FrameIndex));
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FrameIndex),
+ MachineMemOperand::MOStore, MFI.getObjectSize(FrameIndex),
+ MFI.getObjectAlignment(FrameIndex));
BuildMI(MBB, I, DL, get(XCore::STWFI))
.addReg(SrcReg, getKillRegState(isKill))
.addFrameIndex(FrameIndex)
@@ -391,11 +390,10 @@ void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
DL = I->getDebugLoc();
MachineFunction *MF = MBB.getParent();
const MachineFrameInfo &MFI = *MF->getFrameInfo();
- MachineMemOperand *MMO =
- MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIndex),
- MachineMemOperand::MOLoad,
- MFI.getObjectSize(FrameIndex),
- MFI.getObjectAlignment(FrameIndex));
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FrameIndex),
+ MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIndex),
+ MFI.getObjectAlignment(FrameIndex));
BuildMI(MBB, I, DL, get(XCore::LDWFI), DestReg)
.addFrameIndex(FrameIndex)
.addImm(0)
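
The spill/reload hunks above show the recurring migration in this import: the MachinePointerInfo factories now take the owning MachineFunction. A minimal sketch of the post-change form, assuming an in-tree build; buildSpillMMO is a hypothetical helper:

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

// Hypothetical helper building a store memory operand for a spill slot.
static llvm::MachineMemOperand *buildSpillMMO(llvm::MachineFunction &MF,
                                              int FrameIndex) {
  const llvm::MachineFrameInfo &MFI = *MF.getFrameInfo();
  return MF.getMachineMemOperand(
      llvm::MachinePointerInfo::getFixedStack(MF, FrameIndex),
      llvm::MachineMemOperand::MOStore, MFI.getObjectSize(FrameIndex),
      MFI.getObjectAlignment(FrameIndex));
}
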
diff --git a/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/lib/Target/XCore/XCoreLowerThreadLocal.cpp
index 996c6f59346d..f0b720151b17 100644
--- a/lib/Target/XCore/XCoreLowerThreadLocal.cpp
+++ b/lib/Target/XCore/XCoreLowerThreadLocal.cpp
@@ -228,12 +228,9 @@ bool XCoreLowerThreadLocal::runOnModule(Module &M) {
// Find thread local globals.
bool MadeChange = false;
SmallVector<GlobalVariable *, 16> ThreadLocalGlobals;
- for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
- GVI != E; ++GVI) {
- GlobalVariable *GV = GVI;
- if (GV->isThreadLocal())
- ThreadLocalGlobals.push_back(GV);
- }
+ for (GlobalVariable &GV : M.globals())
+ if (GV.isThreadLocal())
+ ThreadLocalGlobals.push_back(&GV);
for (unsigned I = 0, E = ThreadLocalGlobals.size(); I != E; ++I) {
MadeChange |= lowerGlobal(ThreadLocalGlobals[I]);
}
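
The loop rewrite above is the idiomatic range-based form for walking a module's globals. A standalone sketch of the same idiom; collectTLSGlobals is a hypothetical helper:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"

// Hypothetical helper: gather all thread-local globals of a module.
static void collectTLSGlobals(llvm::Module &M,
                              llvm::SmallVectorImpl<llvm::GlobalVariable *> &Out) {
  for (llvm::GlobalVariable &GV : M.globals())
    if (GV.isThreadLocal())
      Out.push_back(&GV);
}
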
diff --git a/lib/Target/XCore/XCoreMachineFunctionInfo.cpp b/lib/Target/XCore/XCoreMachineFunctionInfo.cpp
index 9ef9752d0a5b..6c770969e32e 100644
--- a/lib/Target/XCore/XCoreMachineFunctionInfo.cpp
+++ b/lib/Target/XCore/XCoreMachineFunctionInfo.cpp
@@ -1,4 +1,4 @@
-//===-- XCoreMachineFuctionInfo.cpp - XCore machine function info ---------===//
+//===-- XCoreMachineFunctionInfo.cpp - XCore machine function info --------===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Target/XCore/XCoreMachineFunctionInfo.h b/lib/Target/XCore/XCoreMachineFunctionInfo.h
index 078ffde18fb9..cdcc52fdc32d 100644
--- a/lib/Target/XCore/XCoreMachineFunctionInfo.h
+++ b/lib/Target/XCore/XCoreMachineFunctionInfo.h
@@ -1,4 +1,4 @@
-//===-- XCoreMachineFuctionInfo.h - XCore machine function info -*- C++ -*-===//
+//===- XCoreMachineFunctionInfo.h - XCore machine function info -*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index f420081868f9..4a79dac0bed9 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -85,7 +85,7 @@ extern "C" void LLVMInitializeXCoreTarget() {
}
TargetIRAnalysis XCoreTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](Function &F) {
+ return TargetIRAnalysis([this](const Function &F) {
return TargetTransformInfo(XCoreTTIImpl(this, F));
});
}
diff --git a/lib/Target/XCore/XCoreTargetObjectFile.cpp b/lib/Target/XCore/XCoreTargetObjectFile.cpp
index b5a99058f46e..aa16ecc148db 100644
--- a/lib/Target/XCore/XCoreTargetObjectFile.cpp
+++ b/lib/Target/XCore/XCoreTargetObjectFile.cpp
@@ -123,18 +123,21 @@ XCoreTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
if (Kind.isMergeableConst16()) return MergeableConst16Section;
}
Type *ObjType = GV->getType()->getPointerElementType();
+ auto &DL = GV->getParent()->getDataLayout();
if (TM.getCodeModel() == CodeModel::Small || !ObjType->isSized() ||
- TM.getDataLayout()->getTypeAllocSize(ObjType) < CodeModelLargeSize) {
+ DL.getTypeAllocSize(ObjType) < CodeModelLargeSize) {
if (Kind.isReadOnly()) return UseCPRel? ReadOnlySection
: DataRelROSection;
if (Kind.isBSS() || Kind.isCommon())return BSSSection;
- if (Kind.isDataRel()) return DataSection;
+ if (Kind.isData())
+ return DataSection;
if (Kind.isReadOnlyWithRel()) return DataRelROSection;
} else {
if (Kind.isReadOnly()) return UseCPRel? ReadOnlySectionLarge
: DataRelROSectionLarge;
if (Kind.isBSS() || Kind.isCommon())return BSSSectionLarge;
- if (Kind.isDataRel()) return DataSectionLarge;
+ if (Kind.isData())
+ return DataSectionLarge;
if (Kind.isReadOnlyWithRel()) return DataRelROSectionLarge;
}
@@ -142,9 +145,8 @@ XCoreTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
report_fatal_error("Target does not support TLS or Common sections");
}
-MCSection *
-XCoreTargetObjectFile::getSectionForConstant(SectionKind Kind,
- const Constant *C) const {
+MCSection *XCoreTargetObjectFile::getSectionForConstant(
+ const DataLayout &DL, SectionKind Kind, const Constant *C) const {
if (Kind.isMergeableConst4()) return MergeableConst4Section;
if (Kind.isMergeableConst8()) return MergeableConst8Section;
if (Kind.isMergeableConst16()) return MergeableConst16Section;
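
The SelectSectionForGlobal hunk above replaces the TargetMachine's DataLayout with the DataLayout of the module that owns the global. A minimal sketch of that lookup; typeAllocSizeOf is a hypothetical helper, not part of the patch:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Module.h"
#include <cstdint>

// Hypothetical helper: allocation size of a global's value type, taken from
// the owning module's DataLayout rather than from the TargetMachine.
static uint64_t typeAllocSizeOf(const llvm::GlobalValue *GV) {
  const llvm::DataLayout &DL = GV->getParent()->getDataLayout();
  return DL.getTypeAllocSize(GV->getType()->getPointerElementType());
}
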
diff --git a/lib/Target/XCore/XCoreTargetObjectFile.h b/lib/Target/XCore/XCoreTargetObjectFile.h
index 2a5ac238a447..6701c661a73e 100644
--- a/lib/Target/XCore/XCoreTargetObjectFile.h
+++ b/lib/Target/XCore/XCoreTargetObjectFile.h
@@ -33,7 +33,7 @@ static const unsigned CodeModelLargeSize = 256;
Mangler &Mang,
const TargetMachine &TM) const override;
- MCSection *getSectionForConstant(SectionKind Kind,
+ MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
const Constant *C) const override;
};
} // end namespace llvm
diff --git a/lib/Target/XCore/XCoreTargetTransformInfo.h b/lib/Target/XCore/XCoreTargetTransformInfo.h
index e23aef3e3b4a..b2cb889f1fc0 100644
--- a/lib/Target/XCore/XCoreTargetTransformInfo.h
+++ b/lib/Target/XCore/XCoreTargetTransformInfo.h
@@ -37,7 +37,7 @@ class XCoreTTIImpl : public BasicTTIImplBase<XCoreTTIImpl> {
const XCoreTargetLowering *getTLI() const { return TLI; }
public:
- explicit XCoreTTIImpl(const XCoreTargetMachine *TM, Function &F)
+ explicit XCoreTTIImpl(const XCoreTargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl()),
TLI(ST->getTargetLowering()) {}